diff --git "a/checkpoint-2709/trainer_state.json" "b/checkpoint-2709/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2709/trainer_state.json" @@ -0,0 +1,40781 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0003691569378432, + "eval_steps": 339, + "global_step": 2709, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003691569378432006, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": 2555141026.909091, + "logits/rejected": 1722975436.8, + "logps/chosen": -251.8519620028409, + "logps/rejected": -332.2370361328125, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0007383138756864012, + "grad_norm": 27.875, + "kl": 0.0, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": 2082756500.2105262, + "logits/rejected": 2078594441.8461537, + "logps/chosen": -306.5312243009868, + "logps/rejected": -322.86951622596155, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0011074708135296017, + "grad_norm": 37.0, + "kl": 0.283663272857666, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 1958897078.857143, + "logits/rejected": 1782147299.5555556, + "logps/chosen": -274.17801339285717, + "logps/rejected": -417.18321397569446, + "loss": 0.4966, + "rewards/chosen": 0.011177281183855874, + "rewards/margins": 0.018043566554311722, + "rewards/rejected": -0.006866285370455848, + "step": 3 + }, + { + "epoch": 0.0014766277513728024, + "grad_norm": 35.25, + "kl": 0.2572214603424072, + "learning_rate": 7.5e-07, + "logits/chosen": 1230686354.2857144, + "logits/rejected": 1339229525.3333333, + "logps/chosen": -289.994384765625, + "logps/rejected": -427.45372178819446, + "loss": 0.4953, + "rewards/chosen": -0.0008347396339688982, + "rewards/margins": 0.004339981646764846, + "rewards/rejected": -0.005174721280733745, + "step": 4 + }, + { + "epoch": 0.001845784689216003, + "grad_norm": 32.0, + "kl": 0.27948784828186035, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 2612449437.5384617, + "logits/rejected": 1402594142.3157895, + "logps/chosen": -226.52452674278845, + "logps/rejected": -396.50986842105266, + "loss": 0.4904, + "rewards/chosen": -0.013464003801345825, + "rewards/margins": 0.04047517556893198, + "rewards/rejected": -0.053939179370277805, + "step": 5 + }, + { + "epoch": 0.0022149416270592034, + "grad_norm": 31.5, + "kl": 0.05804014205932617, + "learning_rate": 1.25e-06, + "logits/chosen": 2320796435.6923075, + "logits/rejected": 1748089802.1052632, + "logps/chosen": -256.34786283052887, + "logps/rejected": -338.63111636513156, + "loss": 0.4988, + "rewards/chosen": 0.011130671088512126, + "rewards/margins": 0.015404057074413608, + "rewards/rejected": -0.004273385985901481, + "step": 6 + }, + { + "epoch": 0.0025840985649024043, + "grad_norm": 27.875, + "kl": 0.17689180374145508, + "learning_rate": 1.5e-06, + "logits/chosen": 2760544451.047619, + "logits/rejected": 2041941457.4545455, + "logps/chosen": -296.3612583705357, + "logps/rejected": -362.95363547585225, + "loss": 0.5007, + "rewards/chosen": -0.021251624538784937, + "rewards/margins": 0.02772714849158283, + "rewards/rejected": -0.04897877303036777, + "step": 7 + }, + { + "epoch": 0.002953255502745605, + "grad_norm": 28.125, + "kl": 0.29871606826782227, + "learning_rate": 1.75e-06, + "logits/chosen": 1756430131.2, + "logits/rejected": 1819027626.6666667, + "logps/chosen": -257.7052734375, + "logps/rejected": -455.510498046875, + "loss": 0.4882, + "rewards/chosen": 0.020974960923194886, + "rewards/margins": 0.12168754835923512, + "rewards/rejected": -0.10071258743604024, + "step": 8 + }, + { + "epoch": 0.0033224124405888053, + "grad_norm": 32.75, + "kl": 0.10448455810546875, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 1686372522.6666667, + "logits/rejected": 1566218342.4, + "logps/chosen": -356.6879069010417, + "logps/rejected": -333.321240234375, + "loss": 0.4925, + "rewards/chosen": -0.01961027830839157, + "rewards/margins": 0.04176507145166397, + "rewards/rejected": -0.06137534976005554, + "step": 9 + }, + { + "epoch": 0.003691569378432006, + "grad_norm": 28.375, + "kl": 0.027356624603271484, + "learning_rate": 2.25e-06, + "logits/chosen": 1681367142.4, + "logits/rejected": 1744424618.6666667, + "logps/chosen": -232.4369384765625, + "logps/rejected": -382.1427408854167, + "loss": 0.4852, + "rewards/chosen": 0.022665101289749145, + "rewards/margins": 0.14383291999499004, + "rewards/rejected": -0.12116781870524089, + "step": 10 + }, + { + "epoch": 0.004060726316275206, + "grad_norm": 27.875, + "kl": 0.0, + "learning_rate": 2.5e-06, + "logits/chosen": 1780963930.3529413, + "logits/rejected": 1877864311.4666667, + "logps/chosen": -329.39720243566177, + "logps/rejected": -287.20159505208335, + "loss": 0.4781, + "rewards/chosen": 0.013715656364665312, + "rewards/margins": 0.18694175855786194, + "rewards/rejected": -0.17322610219319662, + "step": 11 + }, + { + "epoch": 0.004429883254118407, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": 1958743868.952381, + "logits/rejected": 1672948642.909091, + "logps/chosen": -238.36486235119048, + "logps/rejected": -335.138427734375, + "loss": 0.477, + "rewards/chosen": 0.003446015573683239, + "rewards/margins": 0.26674821172957813, + "rewards/rejected": -0.2633021961558949, + "step": 12 + }, + { + "epoch": 0.004799040191961607, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 3e-06, + "logits/chosen": 1642847501.4736843, + "logits/rejected": 1847050712.6153846, + "logps/chosen": -254.55658922697367, + "logps/rejected": -344.00338040865387, + "loss": 0.469, + "rewards/chosen": 0.006821201820122569, + "rewards/margins": 0.30787281848882375, + "rewards/rejected": -0.30105161666870117, + "step": 13 + }, + { + "epoch": 0.005168197129804809, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": 1689543378.8235295, + "logits/rejected": 2345390353.0666666, + "logps/chosen": -319.5962775735294, + "logps/rejected": -290.24296875, + "loss": 0.4407, + "rewards/chosen": 0.0822614080765668, + "rewards/margins": 0.5117390361486697, + "rewards/rejected": -0.4294776280721029, + "step": 14 + }, + { + "epoch": 0.005537354067648009, + "grad_norm": 30.125, + "kl": 0.0, + "learning_rate": 3.5e-06, + "logits/chosen": 1949523727.0588236, + "logits/rejected": 1690669875.2, + "logps/chosen": -269.87184053308823, + "logps/rejected": -450.46067708333334, + "loss": 0.4273, + "rewards/chosen": 0.012773087796042948, + "rewards/margins": 0.6631298891469544, + "rewards/rejected": -0.6503568013509115, + "step": 15 + }, + { + "epoch": 0.00590651100549121, + "grad_norm": 25.25, + "kl": 0.0, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": 1412717977.6, + "logits/rejected": 1897293653.3333333, + "logps/chosen": -212.9364013671875, + "logps/rejected": -375.4115804036458, + "loss": 0.4311, + "rewards/chosen": 0.04288797378540039, + "rewards/margins": 0.7666228294372559, + "rewards/rejected": -0.7237348556518555, + "step": 16 + }, + { + "epoch": 0.00627566794333441, + "grad_norm": 27.375, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 2409784320.0, + "logits/rejected": 2257375232.0, + "logps/chosen": -280.689453125, + "logps/rejected": -499.2878689236111, + "loss": 0.3772, + "rewards/chosen": 0.07687444346291679, + "rewards/margins": 1.128713424243624, + "rewards/rejected": -1.0518389807807074, + "step": 17 + }, + { + "epoch": 0.006644824881177611, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 4.25e-06, + "logits/chosen": 1563148288.0, + "logits/rejected": 2064093952.0, + "logps/chosen": -276.1801452636719, + "logps/rejected": -396.57135009765625, + "loss": 0.3865, + "rewards/chosen": 0.053327564150094986, + "rewards/margins": 1.0444594658911228, + "rewards/rejected": -0.9911319017410278, + "step": 18 + }, + { + "epoch": 0.007013981819020811, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 4.5e-06, + "logits/chosen": 1444451388.235294, + "logits/rejected": 2488889617.0666666, + "logps/chosen": -223.35120346966912, + "logps/rejected": -440.9142252604167, + "loss": 0.3669, + "rewards/chosen": 0.012831165510065416, + "rewards/margins": 1.4073760212636461, + "rewards/rejected": -1.3945448557535807, + "step": 19 + }, + { + "epoch": 0.007383138756864012, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 4.75e-06, + "logits/chosen": 1629098586.3529413, + "logits/rejected": 1632897706.6666667, + "logps/chosen": -239.83998736213235, + "logps/rejected": -396.71429036458335, + "loss": 0.3576, + "rewards/chosen": 0.07079794126398423, + "rewards/margins": 1.4907841574911977, + "rewards/rejected": -1.4199862162272134, + "step": 20 + }, + { + "epoch": 0.007752295694707212, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2269135667.2, + "logits/rejected": 2437404310.5882354, + "logps/chosen": -308.4866536458333, + "logps/rejected": -420.2818244485294, + "loss": 0.3174, + "rewards/chosen": 0.01794281005859375, + "rewards/margins": 1.9573586856617646, + "rewards/rejected": -1.939415875603171, + "step": 21 + }, + { + "epoch": 0.008121452632550413, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5.2500000000000006e-06, + "logits/chosen": 1493249417.8461537, + "logits/rejected": 1447019250.5263157, + "logps/chosen": -288.2017352764423, + "logps/rejected": -401.0751182154605, + "loss": 0.2948, + "rewards/chosen": 0.049938201904296875, + "rewards/margins": 2.054342169510691, + "rewards/rejected": -2.004403967606394, + "step": 22 + }, + { + "epoch": 0.008490609570393614, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": 1338189824.0, + "logits/rejected": 1443653745.7777777, + "logps/chosen": -225.77328055245536, + "logps/rejected": -396.80059136284723, + "loss": 0.2875, + "rewards/chosen": 0.1072447555405753, + "rewards/margins": 2.363371159349169, + "rewards/rejected": -2.2561264038085938, + "step": 23 + }, + { + "epoch": 0.008859766508236814, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5.75e-06, + "logits/chosen": 2125301174.857143, + "logits/rejected": 1851773610.6666667, + "logps/chosen": -331.76150948660717, + "logps/rejected": -425.1775716145833, + "loss": 0.2708, + "rewards/chosen": 0.101840112890516, + "rewards/margins": 2.6721992369682073, + "rewards/rejected": -2.570359124077691, + "step": 24 + }, + { + "epoch": 0.009228923446080015, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 6e-06, + "logits/chosen": 1630805196.8, + "logits/rejected": 1687169024.0, + "logps/chosen": -299.4375244140625, + "logps/rejected": -309.2835693359375, + "loss": 0.3601, + "rewards/chosen": 0.09011529088020324, + "rewards/margins": 2.1960324267546336, + "rewards/rejected": -2.10591713587443, + "step": 25 + }, + { + "epoch": 0.009598080383923215, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 6.25e-06, + "logits/chosen": 2366178157.714286, + "logits/rejected": 1714451456.0, + "logps/chosen": -359.56455775669644, + "logps/rejected": -432.9134928385417, + "loss": 0.262, + "rewards/chosen": 0.025659620761871338, + "rewards/margins": 3.4077053666114807, + "rewards/rejected": -3.3820457458496094, + "step": 26 + }, + { + "epoch": 0.009967237321766416, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": 2193663122.285714, + "logits/rejected": 1679756856.8888888, + "logps/chosen": -235.818115234375, + "logps/rejected": -456.0146484375, + "loss": 0.2258, + "rewards/chosen": 0.21129742690495082, + "rewards/margins": 4.282008774696834, + "rewards/rejected": -4.0707113477918835, + "step": 27 + }, + { + "epoch": 0.010336394259609617, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 6.750000000000001e-06, + "logits/chosen": 1532497042.2857144, + "logits/rejected": 1563654371.5555556, + "logps/chosen": -345.4580078125, + "logps/rejected": -528.5141059027778, + "loss": 0.2312, + "rewards/chosen": 0.08127937146595546, + "rewards/margins": 4.695724527987222, + "rewards/rejected": -4.614445156521267, + "step": 28 + }, + { + "epoch": 0.010705551197452817, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 7e-06, + "logits/chosen": 1362604513.8823528, + "logits/rejected": 1938225561.6, + "logps/chosen": -291.52001953125, + "logps/rejected": -444.8460286458333, + "loss": 0.2666, + "rewards/chosen": 0.24524837381699505, + "rewards/margins": 4.950824443966735, + "rewards/rejected": -4.70557607014974, + "step": 29 + }, + { + "epoch": 0.011074708135296018, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 7.25e-06, + "logits/chosen": 1780472320.0, + "logits/rejected": 2500088832.0, + "logps/chosen": -293.9621175130208, + "logps/rejected": -485.8729248046875, + "loss": 0.3438, + "rewards/chosen": 0.3926080067952474, + "rewards/margins": 4.688485463460286, + "rewards/rejected": -4.295877456665039, + "step": 30 + }, + { + "epoch": 0.011443865073139218, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": 1438574464.0, + "logits/rejected": 1654970112.0, + "logps/chosen": -245.86178588867188, + "logps/rejected": -390.7449035644531, + "loss": 0.2853, + "rewards/chosen": 0.16007846593856812, + "rewards/margins": 4.362633645534515, + "rewards/rejected": -4.202555179595947, + "step": 31 + }, + { + "epoch": 0.01181302201098242, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 7.75e-06, + "logits/chosen": 2016926720.0, + "logits/rejected": 1886617827.5555556, + "logps/chosen": -271.82486397879467, + "logps/rejected": -485.7682834201389, + "loss": 0.2217, + "rewards/chosen": 0.2042757272720337, + "rewards/margins": 6.365384380022685, + "rewards/rejected": -6.161108652750651, + "step": 32 + }, + { + "epoch": 0.012182178948825619, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 1638726784.0, + "logits/rejected": 2490309632.0, + "logps/chosen": -334.6903076171875, + "logps/rejected": -540.9254150390625, + "loss": 0.2856, + "rewards/chosen": 0.1194494366645813, + "rewards/margins": 5.167553722858429, + "rewards/rejected": -5.048104286193848, + "step": 33 + }, + { + "epoch": 0.01255133588666882, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 8.25e-06, + "logits/chosen": 2277872876.3076925, + "logits/rejected": 1634631895.5789473, + "logps/chosen": -161.47468449519232, + "logps/rejected": -396.28759765625, + "loss": 0.186, + "rewards/chosen": 1.003878666804387, + "rewards/margins": 5.884395923691723, + "rewards/rejected": -4.880517256887336, + "step": 34 + }, + { + "epoch": 0.01292049282451202, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.5e-06, + "logits/chosen": 1731094407.5294118, + "logits/rejected": 1873154184.5333333, + "logps/chosen": -224.08095415900735, + "logps/rejected": -430.52314453125, + "loss": 0.226, + "rewards/chosen": 0.5978934344123391, + "rewards/margins": 6.125739707198798, + "rewards/rejected": -5.527846272786459, + "step": 35 + }, + { + "epoch": 0.013289649762355221, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 8.750000000000001e-06, + "logits/chosen": 1384088868.5714285, + "logits/rejected": 2011694876.4444444, + "logps/chosen": -226.98458426339286, + "logps/rejected": -484.6765407986111, + "loss": 0.2063, + "rewards/chosen": 0.6680049896240234, + "rewards/margins": 6.333443323771159, + "rewards/rejected": -5.665438334147136, + "step": 36 + }, + { + "epoch": 0.013658806700198423, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9e-06, + "logits/chosen": 1342492416.0, + "logits/rejected": 1389898496.0, + "logps/chosen": -318.62762451171875, + "logps/rejected": -474.1055908203125, + "loss": 0.2231, + "rewards/chosen": 0.7867870330810547, + "rewards/margins": 6.662944316864014, + "rewards/rejected": -5.876157283782959, + "step": 37 + }, + { + "epoch": 0.014027963638041622, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.250000000000001e-06, + "logits/chosen": 1467727510.5882354, + "logits/rejected": 1590144477.8666666, + "logps/chosen": -284.17790670955884, + "logps/rejected": -361.7457682291667, + "loss": 0.287, + "rewards/chosen": 0.3623030325945686, + "rewards/margins": 4.944236626344568, + "rewards/rejected": -4.58193359375, + "step": 38 + }, + { + "epoch": 0.014397120575884824, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.5e-06, + "logits/chosen": 2600348785.7777777, + "logits/rejected": 1680918089.142857, + "logps/chosen": -322.96826171875, + "logps/rejected": -401.3741978236607, + "loss": 0.3034, + "rewards/chosen": 0.0875967608557807, + "rewards/margins": 5.292944094491383, + "rewards/rejected": -5.205347333635602, + "step": 39 + }, + { + "epoch": 0.014766277513728023, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.75e-06, + "logits/chosen": 1276814592.0, + "logits/rejected": 1708352896.0, + "logps/chosen": -316.9996643066406, + "logps/rejected": -516.47412109375, + "loss": 0.2507, + "rewards/chosen": 0.17344313859939575, + "rewards/margins": 6.922157108783722, + "rewards/rejected": -6.748713970184326, + "step": 40 + }, + { + "epoch": 0.015135434451571225, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": 1520733605.6470587, + "logits/rejected": 1560093081.6, + "logps/chosen": -274.39734604779414, + "logps/rejected": -492.22047526041666, + "loss": 0.2438, + "rewards/chosen": 0.5887775421142578, + "rewards/margins": 6.674698257446289, + "rewards/rejected": -6.085920715332032, + "step": 41 + }, + { + "epoch": 0.015504591389414424, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.999996536281763e-06, + "logits/chosen": 1878332022.1538463, + "logits/rejected": 1688040933.0526316, + "logps/chosen": -324.23475060096155, + "logps/rejected": -385.46425267269734, + "loss": 0.2469, + "rewards/chosen": -0.3026095170241136, + "rewards/margins": 5.725915182939907, + "rewards/rejected": -6.028524699964021, + "step": 42 + }, + { + "epoch": 0.015873748327257624, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.999986145131847e-06, + "logits/chosen": 2449532791.4666667, + "logits/rejected": 2073857325.1764705, + "logps/chosen": -319.5615559895833, + "logps/rejected": -430.82238051470586, + "loss": 0.2306, + "rewards/chosen": 0.5627494176228841, + "rewards/margins": 5.6860094668818455, + "rewards/rejected": -5.1232600492589615, + "step": 43 + }, + { + "epoch": 0.016242905265100825, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.999968826564655e-06, + "logits/chosen": 1568372736.0, + "logits/rejected": 1516755353.6, + "logps/chosen": -255.79056803385416, + "logps/rejected": -429.844091796875, + "loss": 0.1592, + "rewards/chosen": 0.7009109656016032, + "rewards/margins": 6.3849418799082445, + "rewards/rejected": -5.684030914306641, + "step": 44 + }, + { + "epoch": 0.016612062202944027, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.999944580604174e-06, + "logits/chosen": 1150999990.857143, + "logits/rejected": 1218021376.0, + "logps/chosen": -256.4312453497024, + "logps/rejected": -359.18845436789775, + "loss": 0.2696, + "rewards/chosen": 0.9469997769310361, + "rewards/margins": 5.699052430850602, + "rewards/rejected": -4.752052653919566, + "step": 45 + }, + { + "epoch": 0.016981219140787228, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.999913407284001e-06, + "logits/chosen": 2178425072.9411764, + "logits/rejected": 1514750225.0666666, + "logps/chosen": -290.7824276194853, + "logps/rejected": -377.14329427083334, + "loss": 0.2103, + "rewards/chosen": 1.0636055890251608, + "rewards/margins": 5.781053505691828, + "rewards/rejected": -4.717447916666667, + "step": 46 + }, + { + "epoch": 0.01735037607863043, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.999875306647327e-06, + "logits/chosen": 2119525034.6666667, + "logits/rejected": 1734975926.857143, + "logps/chosen": -304.3030598958333, + "logps/rejected": -406.95706612723217, + "loss": 0.267, + "rewards/chosen": 0.4767913818359375, + "rewards/margins": 6.05388913835798, + "rewards/rejected": -5.5770977565220425, + "step": 47 + }, + { + "epoch": 0.017719533016473627, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.999830278746938e-06, + "logits/chosen": 1849448448.0, + "logits/rejected": 1809036288.0, + "logps/chosen": -308.7493896484375, + "logps/rejected": -417.7481282552083, + "loss": 0.2787, + "rewards/chosen": 0.49034552574157714, + "rewards/margins": 6.644803253809611, + "rewards/rejected": -6.154457728068034, + "step": 48 + }, + { + "epoch": 0.01808868995431683, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.99977832364522e-06, + "logits/chosen": 1877806622.1176472, + "logits/rejected": 1842179959.4666667, + "logps/chosen": -271.62867647058823, + "logps/rejected": -498.37220052083336, + "loss": 0.2441, + "rewards/chosen": 0.5726045159732595, + "rewards/margins": 7.073316592796177, + "rewards/rejected": -6.500712076822917, + "step": 49 + }, + { + "epoch": 0.01845784689216003, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 9.999719441414155e-06, + "logits/chosen": 1857280186.1818182, + "logits/rejected": 1451943014.4, + "logps/chosen": -306.71364524147725, + "logps/rejected": -391.633154296875, + "loss": 0.3183, + "rewards/chosen": 0.33392316644841974, + "rewards/margins": 5.827245157415216, + "rewards/rejected": -5.4933219909667965, + "step": 50 + }, + { + "epoch": 0.01882700383000323, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.999653632135325e-06, + "logits/chosen": 1494967847.3846154, + "logits/rejected": 1591785579.7894738, + "logps/chosen": -250.60661433293268, + "logps/rejected": -404.4912623355263, + "loss": 0.2167, + "rewards/chosen": 0.15932913926931527, + "rewards/margins": 6.253341349512942, + "rewards/rejected": -6.094012210243626, + "step": 51 + }, + { + "epoch": 0.01919616076784643, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.999580895899908e-06, + "logits/chosen": 1338781286.4, + "logits/rejected": 1373899682.909091, + "logps/chosen": -306.9797607421875, + "logps/rejected": -356.29341264204544, + "loss": 0.1584, + "rewards/chosen": 0.7750319480895996, + "rewards/margins": 6.473371236974543, + "rewards/rejected": -5.698339288884943, + "step": 52 + }, + { + "epoch": 0.01956531770568963, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.999501232808678e-06, + "logits/chosen": 1816235101.090909, + "logits/rejected": 1874532556.8, + "logps/chosen": -229.54740767045453, + "logps/rejected": -438.01259765625, + "loss": 0.2747, + "rewards/chosen": 0.9943005821921609, + "rewards/margins": 6.325253798744895, + "rewards/rejected": -5.330953216552734, + "step": 53 + }, + { + "epoch": 0.019934474643532832, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.99941464297201e-06, + "logits/chosen": 1768479690.1052632, + "logits/rejected": 2750227062.1538463, + "logps/chosen": -260.1775544819079, + "logps/rejected": -534.6139948918269, + "loss": 0.2363, + "rewards/chosen": 0.7120265960693359, + "rewards/margins": 7.150468532855694, + "rewards/rejected": -6.438441936786358, + "step": 54 + }, + { + "epoch": 0.020303631581376033, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.99932112650987e-06, + "logits/chosen": 1212389717.3333333, + "logits/rejected": 1421195264.0, + "logps/chosen": -186.72550455729166, + "logps/rejected": -445.15792410714283, + "loss": 0.1812, + "rewards/chosen": 1.3645939297146268, + "rewards/margins": 7.265176379491413, + "rewards/rejected": -5.900582449776786, + "step": 55 + }, + { + "epoch": 0.020672788519219235, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.999220683551823e-06, + "logits/chosen": 1872288587.2941177, + "logits/rejected": 1455958698.6666667, + "logps/chosen": -325.3602941176471, + "logps/rejected": -391.61220703125, + "loss": 0.2451, + "rewards/chosen": 0.7901762233060949, + "rewards/margins": 5.693230015623803, + "rewards/rejected": -4.903053792317708, + "step": 56 + }, + { + "epoch": 0.021041945457062432, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.999113314237036e-06, + "logits/chosen": 1607395012.9230769, + "logits/rejected": 1758834472.4210527, + "logps/chosen": -224.738037109375, + "logps/rejected": -384.92041015625, + "loss": 0.2056, + "rewards/chosen": 0.5672861979557917, + "rewards/margins": 5.63627519877816, + "rewards/rejected": -5.068989000822368, + "step": 57 + }, + { + "epoch": 0.021411102394905634, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.998999018714264e-06, + "logits/chosen": 2056694442.6666667, + "logits/rejected": 2724740827.428571, + "logps/chosen": -221.12300618489584, + "logps/rejected": -595.9167131696429, + "loss": 0.2111, + "rewards/chosen": 1.227534082200792, + "rewards/margins": 8.258503974430145, + "rewards/rejected": -7.030969892229352, + "step": 58 + }, + { + "epoch": 0.021780259332748835, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.998877797141864e-06, + "logits/chosen": 2459097560.6153846, + "logits/rejected": 2502096572.631579, + "logps/chosen": -306.51461087740387, + "logps/rejected": -569.040861430921, + "loss": 0.1967, + "rewards/chosen": 0.23113008645864633, + "rewards/margins": 7.162394863391213, + "rewards/rejected": -6.931264776932566, + "step": 59 + }, + { + "epoch": 0.022149416270592037, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.998749649687784e-06, + "logits/chosen": 1715101559.4666667, + "logits/rejected": 1654963862.5882354, + "logps/chosen": -281.77239583333335, + "logps/rejected": -463.271484375, + "loss": 0.2371, + "rewards/chosen": 0.246796719233195, + "rewards/margins": 6.170544062408746, + "rewards/rejected": -5.923747343175552, + "step": 60 + }, + { + "epoch": 0.022518573208435234, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.998614576529575e-06, + "logits/chosen": 1463421269.3333333, + "logits/rejected": 1394006630.4, + "logps/chosen": -364.2314453125, + "logps/rejected": -443.4783203125, + "loss": 0.192, + "rewards/chosen": 0.3698062101999919, + "rewards/margins": 5.932623211542766, + "rewards/rejected": -5.562817001342774, + "step": 61 + }, + { + "epoch": 0.022887730146278436, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.998472577854377e-06, + "logits/chosen": 2008661196.8, + "logits/rejected": 1713685082.3529413, + "logps/chosen": -243.009765625, + "logps/rejected": -311.3816348805147, + "loss": 0.2156, + "rewards/chosen": 0.4195224444071452, + "rewards/margins": 5.323240831786511, + "rewards/rejected": -4.9037183873793655, + "step": 62 + }, + { + "epoch": 0.023256887084121637, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.998323653858927e-06, + "logits/chosen": 1677440000.0, + "logits/rejected": 1899185720.8888888, + "logps/chosen": -295.54966517857144, + "logps/rejected": -423.97398546006946, + "loss": 0.223, + "rewards/chosen": 0.3550265516553606, + "rewards/margins": 5.674383681917948, + "rewards/rejected": -5.319357130262587, + "step": 63 + }, + { + "epoch": 0.02362604402196484, + "grad_norm": 13.3125, + "kl": 0.0580594539642334, + "learning_rate": 9.998167804749557e-06, + "logits/chosen": 1694137472.0, + "logits/rejected": 2233405184.0, + "logps/chosen": -286.14202880859375, + "logps/rejected": -451.09979248046875, + "loss": 0.1781, + "rewards/chosen": 1.5691519975662231, + "rewards/margins": 7.318243622779846, + "rewards/rejected": -5.749091625213623, + "step": 64 + }, + { + "epoch": 0.02399520095980804, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.998005030742195e-06, + "logits/chosen": 2032204619.2941177, + "logits/rejected": 2416886033.0666666, + "logps/chosen": -190.55962775735293, + "logps/rejected": -468.30091145833336, + "loss": 0.2329, + "rewards/chosen": 0.6395805583280676, + "rewards/margins": 7.008909375059838, + "rewards/rejected": -6.3693288167317705, + "step": 65 + }, + { + "epoch": 0.024364357897651238, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.997835332062362e-06, + "logits/chosen": 1805828232.5333333, + "logits/rejected": 1742596698.3529413, + "logps/chosen": -236.96793619791666, + "logps/rejected": -455.2342888327206, + "loss": 0.2032, + "rewards/chosen": 0.6867312113444011, + "rewards/margins": 5.979020496443207, + "rewards/rejected": -5.292289285098805, + "step": 66 + }, + { + "epoch": 0.02473351483549444, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.997658708945173e-06, + "logits/chosen": 1767146973.8666666, + "logits/rejected": 1543355331.764706, + "logps/chosen": -280.15100911458336, + "logps/rejected": -375.7935431985294, + "loss": 0.1851, + "rewards/chosen": 1.3260906219482422, + "rewards/margins": 5.505223868874943, + "rewards/rejected": -4.179133246926701, + "step": 67 + }, + { + "epoch": 0.02510267177333764, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.997475161635339e-06, + "logits/chosen": 2129411449.2631578, + "logits/rejected": 1924609575.3846154, + "logps/chosen": -314.2047697368421, + "logps/rejected": -495.8671123798077, + "loss": 0.2577, + "rewards/chosen": 0.7498713041606703, + "rewards/margins": 6.013877698767041, + "rewards/rejected": -5.26400639460637, + "step": 68 + }, + { + "epoch": 0.025471828711180842, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.99728469038716e-06, + "logits/chosen": 1612928068.2666667, + "logits/rejected": 1459199638.5882354, + "logps/chosen": -279.89801432291665, + "logps/rejected": -508.3785615808824, + "loss": 0.1614, + "rewards/chosen": 1.44455935160319, + "rewards/margins": 6.4668663548488245, + "rewards/rejected": -5.0223070032456345, + "step": 69 + }, + { + "epoch": 0.02584098564902404, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.99708729546453e-06, + "logits/chosen": 1801447544.4705882, + "logits/rejected": 1763752209.0666666, + "logps/chosen": -230.23594037224265, + "logps/rejected": -364.57679036458336, + "loss": 0.182, + "rewards/chosen": 1.579604653751149, + "rewards/margins": 5.4770597420486755, + "rewards/rejected": -3.897455088297526, + "step": 70 + }, + { + "epoch": 0.02621014258686724, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.996882977140942e-06, + "logits/chosen": 2398131200.0, + "logits/rejected": 1937977600.0, + "logps/chosen": -227.99656677246094, + "logps/rejected": -473.57281494140625, + "loss": 0.1918, + "rewards/chosen": 1.0035066604614258, + "rewards/margins": 6.597690582275391, + "rewards/rejected": -5.594183921813965, + "step": 71 + }, + { + "epoch": 0.026579299524710442, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.996671735699473e-06, + "logits/chosen": 2165318314.6666665, + "logits/rejected": 1767100708.5714285, + "logps/chosen": -236.366455078125, + "logps/rejected": -349.53194754464283, + "loss": 0.2196, + "rewards/chosen": 1.3846684561835394, + "rewards/margins": 5.1746383091760055, + "rewards/rejected": -3.7899698529924666, + "step": 72 + }, + { + "epoch": 0.026948456462553644, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.996453571432797e-06, + "logits/chosen": 1467223186.2857144, + "logits/rejected": 1655437425.7777777, + "logps/chosen": -301.57247488839283, + "logps/rejected": -497.4978841145833, + "loss": 0.196, + "rewards/chosen": 0.5845005171639579, + "rewards/margins": 6.067470013149201, + "rewards/rejected": -5.482969495985243, + "step": 73 + }, + { + "epoch": 0.027317613400396845, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 9.996228484643176e-06, + "logits/chosen": 2630569984.0, + "logits/rejected": 1748253459.6923077, + "logps/chosen": -333.8983604029605, + "logps/rejected": -617.9084660456731, + "loss": 0.265, + "rewards/chosen": 0.42280814522191096, + "rewards/margins": 6.57642835161464, + "rewards/rejected": -6.153620206392729, + "step": 74 + }, + { + "epoch": 0.027686770338240043, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.995996475642466e-06, + "logits/chosen": 1252732928.0, + "logits/rejected": 1389756416.0, + "logps/chosen": -229.048583984375, + "logps/rejected": -433.85942925347223, + "loss": 0.1986, + "rewards/chosen": 0.9273087637765067, + "rewards/margins": 5.176066444033668, + "rewards/rejected": -4.248757680257161, + "step": 75 + }, + { + "epoch": 0.028055927276083244, + "grad_norm": 10.6875, + "kl": 2.4262642860412598, + "learning_rate": 9.995757544752114e-06, + "logits/chosen": 1759469158.4, + "logits/rejected": 2334256911.0588236, + "logps/chosen": -276.42845052083334, + "logps/rejected": -583.1529181985294, + "loss": 0.1751, + "rewards/chosen": 1.7980982462565105, + "rewards/margins": 8.691719354367724, + "rewards/rejected": -6.893621108111213, + "step": 76 + }, + { + "epoch": 0.028425084213926446, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.995511692303153e-06, + "logits/chosen": 1861607303.5294118, + "logits/rejected": 1953125853.8666666, + "logps/chosen": -343.29859834558823, + "logps/rejected": -479.34270833333335, + "loss": 0.2532, + "rewards/chosen": 0.3977852709153119, + "rewards/margins": 5.109022860433542, + "rewards/rejected": -4.7112375895182295, + "step": 77 + }, + { + "epoch": 0.028794241151769647, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.995258918636209e-06, + "logits/chosen": 1292165963.2941177, + "logits/rejected": 1305069704.5333333, + "logps/chosen": -263.4663947610294, + "logps/rejected": -461.77392578125, + "loss": 0.1798, + "rewards/chosen": 1.2859969419591568, + "rewards/margins": 7.121590610578949, + "rewards/rejected": -5.835593668619792, + "step": 78 + }, + { + "epoch": 0.029163398089612845, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.994999224101498e-06, + "logits/chosen": 2465983536.7619047, + "logits/rejected": 1553448587.6363637, + "logps/chosen": -321.2909691220238, + "logps/rejected": -356.27543501420456, + "loss": 0.2672, + "rewards/chosen": 0.5891254515874953, + "rewards/margins": 5.790280680635791, + "rewards/rejected": -5.201155229048296, + "step": 79 + }, + { + "epoch": 0.029532555027456046, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.994732609058824e-06, + "logits/chosen": 1922855680.0, + "logits/rejected": 1678234112.0, + "logps/chosen": -312.1918029785156, + "logps/rejected": -335.2796630859375, + "loss": 0.2394, + "rewards/chosen": 0.39574065804481506, + "rewards/margins": 4.852455765008926, + "rewards/rejected": -4.456715106964111, + "step": 80 + }, + { + "epoch": 0.029901711965299248, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.994459073877577e-06, + "logits/chosen": 1455374043.4285715, + "logits/rejected": 1726403470.2222223, + "logps/chosen": -254.13539341517858, + "logps/rejected": -398.309814453125, + "loss": 0.2246, + "rewards/chosen": 0.7208952222551618, + "rewards/margins": 5.393262318202427, + "rewards/rejected": -4.672367095947266, + "step": 81 + }, + { + "epoch": 0.03027086890314245, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.994178618936736e-06, + "logits/chosen": 1985200990.3157895, + "logits/rejected": 2359929462.1538463, + "logps/chosen": -319.4258583470395, + "logps/rejected": -365.87804236778845, + "loss": 0.2709, + "rewards/chosen": 0.6256721396195261, + "rewards/margins": 5.31967443782791, + "rewards/rejected": -4.6940022982083836, + "step": 82 + }, + { + "epoch": 0.03064002584098565, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.99389124462487e-06, + "logits/chosen": 2027066274.909091, + "logits/rejected": 2257553700.571429, + "logps/chosen": -230.66288618607953, + "logps/rejected": -506.45814732142856, + "loss": 0.2274, + "rewards/chosen": -0.09618000550703569, + "rewards/margins": 5.4014923025519295, + "rewards/rejected": -5.497672308058966, + "step": 83 + }, + { + "epoch": 0.03100918277882885, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.99359695134013e-06, + "logits/chosen": 1777332087.4666667, + "logits/rejected": 2275405824.0, + "logps/chosen": -311.42467447916664, + "logps/rejected": -558.0045955882352, + "loss": 0.2086, + "rewards/chosen": 0.5744876861572266, + "rewards/margins": 6.198888105504653, + "rewards/rejected": -5.624400419347427, + "step": 84 + }, + { + "epoch": 0.03137833971667205, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.993295739490259e-06, + "logits/chosen": 1831358610.2857144, + "logits/rejected": 2195003164.4444447, + "logps/chosen": -238.24769810267858, + "logps/rejected": -477.46875, + "loss": 0.1829, + "rewards/chosen": 0.7761190959385463, + "rewards/margins": 6.587837082999093, + "rewards/rejected": -5.811717987060547, + "step": 85 + }, + { + "epoch": 0.03174749665451525, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.992987609492578e-06, + "logits/chosen": 2042234752.0, + "logits/rejected": 2022515072.0, + "logps/chosen": -362.3477478027344, + "logps/rejected": -459.9891357421875, + "loss": 0.241, + "rewards/chosen": 0.5852454900741577, + "rewards/margins": 5.293747305870056, + "rewards/rejected": -4.708501815795898, + "step": 86 + }, + { + "epoch": 0.03211665359235845, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.992672561774001e-06, + "logits/chosen": 1858031616.0, + "logits/rejected": 2070205147.4285715, + "logps/chosen": -252.31770833333334, + "logps/rejected": -453.39222935267856, + "loss": 0.2913, + "rewards/chosen": 0.04599475860595703, + "rewards/margins": 5.78010926927839, + "rewards/rejected": -5.734114510672433, + "step": 87 + }, + { + "epoch": 0.03248581053020165, + "grad_norm": 13.0625, + "kl": 0.045295000076293945, + "learning_rate": 9.99235059677102e-06, + "logits/chosen": 2844576861.090909, + "logits/rejected": 2324120722.285714, + "logps/chosen": -318.09841086647725, + "logps/rejected": -529.4165736607143, + "loss": 0.1681, + "rewards/chosen": 0.7012535442005504, + "rewards/margins": 6.182985140647723, + "rewards/rejected": -5.4817315964471724, + "step": 88 + }, + { + "epoch": 0.03285496746804485, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.992021714929714e-06, + "logits/chosen": 1851180800.0, + "logits/rejected": 1355009536.0, + "logps/chosen": -294.095947265625, + "logps/rejected": -489.3984069824219, + "loss": 0.2363, + "rewards/chosen": 0.6235851049423218, + "rewards/margins": 5.485925078392029, + "rewards/rejected": -4.862339973449707, + "step": 89 + }, + { + "epoch": 0.03322412440588805, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.991685916705748e-06, + "logits/chosen": 2079479020.3076923, + "logits/rejected": 1698660783.1578948, + "logps/chosen": -352.05911959134613, + "logps/rejected": -490.8021175986842, + "loss": 0.185, + "rewards/chosen": 0.6049078061030462, + "rewards/margins": 7.19097018724511, + "rewards/rejected": -6.586062381142064, + "step": 90 + }, + { + "epoch": 0.033593281343731254, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.991343202564358e-06, + "logits/chosen": 1390758518.1538463, + "logits/rejected": 2231418233.263158, + "logps/chosen": -308.30014272836536, + "logps/rejected": -411.1084755345395, + "loss": 0.1712, + "rewards/chosen": 0.9770946502685547, + "rewards/margins": 5.507719441464073, + "rewards/rejected": -4.530624791195518, + "step": 91 + }, + { + "epoch": 0.033962438281574456, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.99099357298038e-06, + "logits/chosen": 2862299721.142857, + "logits/rejected": 1536312888.8888888, + "logps/chosen": -252.69503348214286, + "logps/rejected": -442.3046061197917, + "loss": 0.1883, + "rewards/chosen": 0.718670300074986, + "rewards/margins": 6.826822477673727, + "rewards/rejected": -6.1081521775987415, + "step": 92 + }, + { + "epoch": 0.03433159521941766, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.990637028438213e-06, + "logits/chosen": 2640026112.0, + "logits/rejected": 2393911808.0, + "logps/chosen": -349.05731201171875, + "logps/rejected": -311.5893249511719, + "loss": 0.2436, + "rewards/chosen": 0.2633194029331207, + "rewards/margins": 5.2781175673007965, + "rewards/rejected": -5.014798164367676, + "step": 93 + }, + { + "epoch": 0.03470075215726086, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.99027356943185e-06, + "logits/chosen": 2197273856.0, + "logits/rejected": 2080851072.0, + "logps/chosen": -329.997802734375, + "logps/rejected": -505.42755126953125, + "loss": 0.2367, + "rewards/chosen": 0.7300698757171631, + "rewards/margins": 4.740393400192261, + "rewards/rejected": -4.010323524475098, + "step": 94 + }, + { + "epoch": 0.03506990909510405, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.989903196464858e-06, + "logits/chosen": 1441355776.0, + "logits/rejected": 1445555260.235294, + "logps/chosen": -252.445556640625, + "logps/rejected": -368.59607651654414, + "loss": 0.2342, + "rewards/chosen": 0.3057329813639323, + "rewards/margins": 6.0151827494303385, + "rewards/rejected": -5.709449768066406, + "step": 95 + }, + { + "epoch": 0.035439066032947254, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.989525910050382e-06, + "logits/chosen": 2195819264.0, + "logits/rejected": 2371022336.0, + "logps/chosen": -253.74082946777344, + "logps/rejected": -425.7666931152344, + "loss": 0.2066, + "rewards/chosen": 0.9258164763450623, + "rewards/margins": 5.9472731947898865, + "rewards/rejected": -5.021456718444824, + "step": 96 + }, + { + "epoch": 0.035808222970790456, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.989141710711149e-06, + "logits/chosen": 2114243697.7777777, + "logits/rejected": 2152229449.142857, + "logps/chosen": -242.63430447048611, + "logps/rejected": -409.2127162388393, + "loss": 0.2407, + "rewards/chosen": 0.9091711044311523, + "rewards/margins": 6.099010603768485, + "rewards/rejected": -5.1898394993373325, + "step": 97 + }, + { + "epoch": 0.03617737990863366, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.988750598979464e-06, + "logits/chosen": 1479391232.0, + "logits/rejected": 1785386188.8, + "logps/chosen": -227.9235636393229, + "logps/rejected": -475.828369140625, + "loss": 0.2022, + "rewards/chosen": 0.6630279223124186, + "rewards/margins": 5.525640074412029, + "rewards/rejected": -4.86261215209961, + "step": 98 + }, + { + "epoch": 0.03654653684647686, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.988352575397204e-06, + "logits/chosen": 1519752192.0, + "logits/rejected": 1145407658.6666667, + "logps/chosen": -227.961572265625, + "logps/rejected": -361.073974609375, + "loss": 0.2889, + "rewards/chosen": 0.8448988914489746, + "rewards/margins": 5.649965635935466, + "rewards/rejected": -4.805066744486491, + "step": 99 + }, + { + "epoch": 0.03691569378432006, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.987947640515827e-06, + "logits/chosen": 1711863053.4736843, + "logits/rejected": 1618657280.0, + "logps/chosen": -296.8956877055921, + "logps/rejected": -406.9450871394231, + "loss": 0.2558, + "rewards/chosen": 0.6898695293225741, + "rewards/margins": 5.727167291679845, + "rewards/rejected": -5.037297762357271, + "step": 100 + }, + { + "epoch": 0.03728485072216326, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.987535794896366e-06, + "logits/chosen": 1989321614.2222223, + "logits/rejected": 2097976173.7142856, + "logps/chosen": -317.0855305989583, + "logps/rejected": -452.925537109375, + "loss": 0.2217, + "rewards/chosen": 0.755950927734375, + "rewards/margins": 5.925982339041574, + "rewards/rejected": -5.170031411307199, + "step": 101 + }, + { + "epoch": 0.03765400766000646, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.987117039109427e-06, + "logits/chosen": 2183288832.0, + "logits/rejected": 2276737024.0, + "logps/chosen": -403.1256103515625, + "logps/rejected": -362.5157775878906, + "loss": 0.2504, + "rewards/chosen": 0.5024189352989197, + "rewards/margins": 5.7441834807395935, + "rewards/rejected": -5.241764545440674, + "step": 102 + }, + { + "epoch": 0.038023164597849664, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.986691373735191e-06, + "logits/chosen": 1648751796.7058823, + "logits/rejected": 1363656430.9333334, + "logps/chosen": -224.86160098805146, + "logps/rejected": -446.74371744791665, + "loss": 0.2626, + "rewards/chosen": 0.37699444153729605, + "rewards/margins": 5.768255224414901, + "rewards/rejected": -5.3912607828776045, + "step": 103 + }, + { + "epoch": 0.03839232153569286, + "grad_norm": 18.0, + "kl": 0.12965011596679688, + "learning_rate": 9.986258799363412e-06, + "logits/chosen": 1755156870.0952382, + "logits/rejected": 2008543976.7272727, + "logps/chosen": -313.30245535714283, + "logps/rejected": -560.2453835227273, + "loss": 0.3148, + "rewards/chosen": 0.4358068193708147, + "rewards/margins": 7.7045175626680455, + "rewards/rejected": -7.26871074329723, + "step": 104 + }, + { + "epoch": 0.03876147847353606, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.985819316593416e-06, + "logits/chosen": 2025366186.6666667, + "logits/rejected": 1718867236.5714285, + "logps/chosen": -294.03716362847223, + "logps/rejected": -334.9393833705357, + "loss": 0.1871, + "rewards/chosen": 1.2875531514485676, + "rewards/margins": 6.514278048560732, + "rewards/rejected": -5.226724897112165, + "step": 105 + }, + { + "epoch": 0.03913063541137926, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.9853729260341e-06, + "logits/chosen": 1558354261.3333333, + "logits/rejected": 1251716505.6, + "logps/chosen": -367.9998372395833, + "logps/rejected": -378.8102294921875, + "loss": 0.183, + "rewards/chosen": 0.6617204745610555, + "rewards/margins": 5.560971268018086, + "rewards/rejected": -4.899250793457031, + "step": 106 + }, + { + "epoch": 0.03949979234922246, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.984919628303934e-06, + "logits/chosen": 1908911786.6666667, + "logits/rejected": 2275983155.2, + "logps/chosen": -304.4996337890625, + "logps/rejected": -526.97568359375, + "loss": 0.1993, + "rewards/chosen": 0.5054636001586914, + "rewards/margins": 7.350333595275879, + "rewards/rejected": -6.844869995117188, + "step": 107 + }, + { + "epoch": 0.039868949287065664, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.984459424030958e-06, + "logits/chosen": 1900433817.6, + "logits/rejected": 1773175125.3333333, + "logps/chosen": -347.9761962890625, + "logps/rejected": -506.8036295572917, + "loss": 0.2913, + "rewards/chosen": 0.3913978815078735, + "rewards/margins": 6.795141625404358, + "rewards/rejected": -6.403743743896484, + "step": 108 + }, + { + "epoch": 0.040238106224908865, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.983992313852776e-06, + "logits/chosen": 1554206935.5789473, + "logits/rejected": 2160550675.6923075, + "logps/chosen": -282.8433388157895, + "logps/rejected": -630.1656400240385, + "loss": 0.2778, + "rewards/chosen": 0.6889306118613795, + "rewards/margins": 8.18255595929227, + "rewards/rejected": -7.493625347430889, + "step": 109 + }, + { + "epoch": 0.040607263162752066, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.983518298416564e-06, + "logits/chosen": 1846246570.6666667, + "logits/rejected": 2043037081.6, + "logps/chosen": -342.9548746744792, + "logps/rejected": -384.856884765625, + "loss": 0.1835, + "rewards/chosen": 0.5253825982411703, + "rewards/margins": 5.616027816136678, + "rewards/rejected": -5.090645217895508, + "step": 110 + }, + { + "epoch": 0.04097642010059527, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.983037378379064e-06, + "logits/chosen": 1713726805.3333333, + "logits/rejected": 2360315494.4, + "logps/chosen": -226.1729939778646, + "logps/rejected": -502.5017578125, + "loss": 0.1747, + "rewards/chosen": 1.201509157816569, + "rewards/margins": 6.545764605204265, + "rewards/rejected": -5.344255447387695, + "step": 111 + }, + { + "epoch": 0.04134557703843847, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.982549554406585e-06, + "logits/chosen": 2131049050.3529413, + "logits/rejected": 1641782476.8, + "logps/chosen": -328.17038143382354, + "logps/rejected": -680.3260416666667, + "loss": 0.2105, + "rewards/chosen": 1.273947323069853, + "rewards/margins": 8.76672228644876, + "rewards/rejected": -7.492774963378906, + "step": 112 + }, + { + "epoch": 0.041714733976281664, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.982054827175e-06, + "logits/chosen": 1623167522.1333334, + "logits/rejected": 2323720312.470588, + "logps/chosen": -246.65091145833333, + "logps/rejected": -316.68014705882354, + "loss": 0.216, + "rewards/chosen": 0.693026606241862, + "rewards/margins": 5.106558638927983, + "rewards/rejected": -4.413532032686121, + "step": 113 + }, + { + "epoch": 0.042083890914124865, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.981553197369752e-06, + "logits/chosen": 1396239667.2, + "logits/rejected": 2098433194.6666667, + "logps/chosen": -263.24794921875, + "logps/rejected": -456.585693359375, + "loss": 0.2394, + "rewards/chosen": 1.0839725494384767, + "rewards/margins": 8.15456288655599, + "rewards/rejected": -7.070590337117513, + "step": 114 + }, + { + "epoch": 0.042453047851968066, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.981044665685834e-06, + "logits/chosen": 2574838945.6842103, + "logits/rejected": 2421310070.1538463, + "logps/chosen": -282.8310032894737, + "logps/rejected": -489.0692608173077, + "loss": 0.2382, + "rewards/chosen": 0.7279243469238281, + "rewards/margins": 7.7287577115572414, + "rewards/rejected": -7.000833364633413, + "step": 115 + }, + { + "epoch": 0.04282220478981127, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.980529232827819e-06, + "logits/chosen": 2209159529.4117646, + "logits/rejected": 2453668386.133333, + "logps/chosen": -337.8021024816176, + "logps/rejected": -496.6529947916667, + "loss": 0.2489, + "rewards/chosen": 0.5998899796429802, + "rewards/margins": 7.659581345202876, + "rewards/rejected": -7.059691365559896, + "step": 116 + }, + { + "epoch": 0.04319136172765447, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.980006899509827e-06, + "logits/chosen": 1529360896.0, + "logits/rejected": 1943247104.0, + "logps/chosen": -228.32723999023438, + "logps/rejected": -532.2776489257812, + "loss": 0.2304, + "rewards/chosen": 0.42655372619628906, + "rewards/margins": 8.87204360961914, + "rewards/rejected": -8.445489883422852, + "step": 117 + }, + { + "epoch": 0.04356051866549767, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.979477666455547e-06, + "logits/chosen": 2176278528.0, + "logits/rejected": 1787879796.3636363, + "logps/chosen": -305.897998046875, + "logps/rejected": -459.18825461647725, + "loss": 0.1549, + "rewards/chosen": 1.167719554901123, + "rewards/margins": 6.292212893746116, + "rewards/rejected": -5.124493338844993, + "step": 118 + }, + { + "epoch": 0.04392967560334087, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.978941534398224e-06, + "logits/chosen": 1797277559.4666667, + "logits/rejected": 1434951198.1176472, + "logps/chosen": -277.3956705729167, + "logps/rejected": -396.1455939797794, + "loss": 0.2444, + "rewards/chosen": 0.7151554743448894, + "rewards/margins": 4.542776934305827, + "rewards/rejected": -3.8276214599609375, + "step": 119 + }, + { + "epoch": 0.04429883254118407, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.978398504080661e-06, + "logits/chosen": 2059608064.0, + "logits/rejected": 1970344072.5333333, + "logps/chosen": -335.3508731617647, + "logps/rejected": -447.71790364583336, + "loss": 0.2586, + "rewards/chosen": 0.3449854289784151, + "rewards/margins": 6.147816951602112, + "rewards/rejected": -5.8028315226236975, + "step": 120 + }, + { + "epoch": 0.044667989479027274, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.97784857625522e-06, + "logits/chosen": 1657897062.4, + "logits/rejected": 1288226389.3333333, + "logps/chosen": -281.0583984375, + "logps/rejected": -521.1830240885416, + "loss": 0.2349, + "rewards/chosen": 0.9630319595336914, + "rewards/margins": 6.114233207702637, + "rewards/rejected": -5.151201248168945, + "step": 121 + }, + { + "epoch": 0.04503714641687047, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.977291751683821e-06, + "logits/chosen": 1902853059.764706, + "logits/rejected": 1981701188.2666667, + "logps/chosen": -260.3522518382353, + "logps/rejected": -531.2846354166667, + "loss": 0.1971, + "rewards/chosen": 1.098404603845933, + "rewards/margins": 8.396848207361558, + "rewards/rejected": -7.298443603515625, + "step": 122 + }, + { + "epoch": 0.04540630335471367, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.976728031137936e-06, + "logits/chosen": 1758258107.7333333, + "logits/rejected": 1697665385.4117646, + "logps/chosen": -299.39443359375, + "logps/rejected": -451.6884765625, + "loss": 0.2204, + "rewards/chosen": 0.6505784352620443, + "rewards/margins": 6.2329601886225685, + "rewards/rejected": -5.582381753360524, + "step": 123 + }, + { + "epoch": 0.04577546029255687, + "grad_norm": 15.125, + "kl": 0.4802436828613281, + "learning_rate": 9.976157415398591e-06, + "logits/chosen": 1585688791.5789473, + "logits/rejected": 1639487645.5384614, + "logps/chosen": -292.54111842105266, + "logps/rejected": -412.7202899639423, + "loss": 0.2459, + "rewards/chosen": 0.9496644672594572, + "rewards/margins": 7.109580097893471, + "rewards/rejected": -6.159915630634014, + "step": 124 + }, + { + "epoch": 0.04614461723040007, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.97557990525637e-06, + "logits/chosen": 2208791066.9473686, + "logits/rejected": 1935981016.6153846, + "logps/chosen": -258.21718236019734, + "logps/rejected": -463.4070012019231, + "loss": 0.2218, + "rewards/chosen": 0.8023242448505602, + "rewards/margins": 7.352701723816906, + "rewards/rejected": -6.550377478966346, + "step": 125 + }, + { + "epoch": 0.046513774168243274, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.974995501511404e-06, + "logits/chosen": 1565059218.2857144, + "logits/rejected": 1764496042.6666667, + "logps/chosen": -235.54947335379464, + "logps/rejected": -480.8943142361111, + "loss": 0.1247, + "rewards/chosen": 1.7268641335623605, + "rewards/margins": 8.040589075239877, + "rewards/rejected": -6.313724941677517, + "step": 126 + }, + { + "epoch": 0.046882931106086476, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.974404204973376e-06, + "logits/chosen": 1453867349.3333333, + "logits/rejected": 1547031405.7142856, + "logps/chosen": -247.05194769965277, + "logps/rejected": -496.84354073660717, + "loss": 0.2602, + "rewards/chosen": 0.30126484235127765, + "rewards/margins": 7.139560915174938, + "rewards/rejected": -6.838296072823661, + "step": 127 + }, + { + "epoch": 0.04725208804392968, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.973806016461522e-06, + "logits/chosen": 1617054019.368421, + "logits/rejected": 1849859465.8461537, + "logps/chosen": -307.8926552220395, + "logps/rejected": -358.07504507211536, + "loss": 0.2861, + "rewards/chosen": 0.4247779846191406, + "rewards/margins": 4.838160588191106, + "rewards/rejected": -4.413382603571965, + "step": 128 + }, + { + "epoch": 0.04762124498177288, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.973200936804624e-06, + "logits/chosen": 1714075648.0, + "logits/rejected": 1633756598.857143, + "logps/chosen": -297.0863986545139, + "logps/rejected": -408.29282924107144, + "loss": 0.2623, + "rewards/chosen": 0.28452467918395996, + "rewards/margins": 6.9117099557604105, + "rewards/rejected": -6.6271852765764505, + "step": 129 + }, + { + "epoch": 0.04799040191961608, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.972588966841013e-06, + "logits/chosen": 2087618921.4117646, + "logits/rejected": 2081517431.4666667, + "logps/chosen": -365.6372644761029, + "logps/rejected": -327.20940755208335, + "loss": 0.2334, + "rewards/chosen": 0.5694898717543658, + "rewards/margins": 5.698076704436657, + "rewards/rejected": -5.128586832682291, + "step": 130 + }, + { + "epoch": 0.048359558857459274, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.971970107418562e-06, + "logits/chosen": 1364215808.0, + "logits/rejected": 1746171264.0, + "logps/chosen": -304.3335876464844, + "logps/rejected": -462.2740783691406, + "loss": 0.1771, + "rewards/chosen": 1.2956247329711914, + "rewards/margins": 7.219841957092285, + "rewards/rejected": -5.924217224121094, + "step": 131 + }, + { + "epoch": 0.048728715795302475, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.971344359394696e-06, + "logits/chosen": 1409807488.0, + "logits/rejected": 1437029376.0, + "logps/chosen": -252.76441955566406, + "logps/rejected": -393.7583821614583, + "loss": 0.1415, + "rewards/chosen": 0.2677656412124634, + "rewards/margins": 5.021211584409078, + "rewards/rejected": -4.753445943196614, + "step": 132 + }, + { + "epoch": 0.04909787273314568, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.970711723636382e-06, + "logits/chosen": 1882551680.0, + "logits/rejected": 2164357632.0, + "logps/chosen": -285.0903015136719, + "logps/rejected": -545.5709228515625, + "loss": 0.1789, + "rewards/chosen": 0.8549647331237793, + "rewards/margins": 7.554940223693848, + "rewards/rejected": -6.699975490570068, + "step": 133 + }, + { + "epoch": 0.04946702967098888, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.970072201020127e-06, + "logits/chosen": 2046003200.0, + "logits/rejected": 2017410951.5294118, + "logps/chosen": -340.6033203125, + "logps/rejected": -371.1426355698529, + "loss": 0.2222, + "rewards/chosen": 0.304661496480306, + "rewards/margins": 5.604309108210545, + "rewards/rejected": -5.299647611730239, + "step": 134 + }, + { + "epoch": 0.04983618660883208, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.969425792431982e-06, + "logits/chosen": 1472845004.8, + "logits/rejected": 1534854609.4545455, + "logps/chosen": -242.8343994140625, + "logps/rejected": -487.8655894886364, + "loss": 0.1384, + "rewards/chosen": 0.5666323184967041, + "rewards/margins": 7.705569141561335, + "rewards/rejected": -7.138936823064631, + "step": 135 + }, + { + "epoch": 0.05020534354667528, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.968772498767537e-06, + "logits/chosen": 2157815398.4, + "logits/rejected": 2144619178.6666667, + "logps/chosen": -264.1074462890625, + "logps/rejected": -421.3930257161458, + "loss": 0.2877, + "rewards/chosen": 0.45233545303344724, + "rewards/margins": 6.125611082712809, + "rewards/rejected": -5.673275629679362, + "step": 136 + }, + { + "epoch": 0.05057450048451848, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.96811232093192e-06, + "logits/chosen": 1840451993.6, + "logits/rejected": 2367653767.529412, + "logps/chosen": -271.489404296875, + "logps/rejected": -547.3714958639706, + "loss": 0.2329, + "rewards/chosen": 0.22568483352661134, + "rewards/margins": 7.247097402460435, + "rewards/rejected": -7.021412568933823, + "step": 137 + }, + { + "epoch": 0.050943657422361684, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.967445259839805e-06, + "logits/chosen": 1211621760.0, + "logits/rejected": 1779529728.0, + "logps/chosen": -287.6078796386719, + "logps/rejected": -522.9850463867188, + "loss": 0.1964, + "rewards/chosen": 1.0404701232910156, + "rewards/margins": 6.724546432495117, + "rewards/rejected": -5.684076309204102, + "step": 138 + }, + { + "epoch": 0.051312814360204885, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.966771316415391e-06, + "logits/chosen": 1176933429.8947368, + "logits/rejected": 1345533479.3846154, + "logps/chosen": -273.7075709292763, + "logps/rejected": -454.73399939903845, + "loss": 0.2286, + "rewards/chosen": 0.9516277313232422, + "rewards/margins": 7.460347542395959, + "rewards/rejected": -6.508719811072717, + "step": 139 + }, + { + "epoch": 0.05168197129804808, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.966090491592422e-06, + "logits/chosen": 1860881588.7058823, + "logits/rejected": 1663990988.8, + "logps/chosen": -195.26230755974265, + "logps/rejected": -377.9548828125, + "loss": 0.1918, + "rewards/chosen": 1.3310065549962662, + "rewards/margins": 6.553645511701995, + "rewards/rejected": -5.222638956705729, + "step": 140 + }, + { + "epoch": 0.05205112823589128, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.96540278631417e-06, + "logits/chosen": 1410664174.9333334, + "logits/rejected": 2254645850.352941, + "logps/chosen": -325.3301106770833, + "logps/rejected": -404.6810087316176, + "loss": 0.1979, + "rewards/chosen": 1.0128177642822265, + "rewards/margins": 6.171277147180893, + "rewards/rejected": -5.158459382898667, + "step": 141 + }, + { + "epoch": 0.05242028517373448, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.964708201533441e-06, + "logits/chosen": 1835615800.8888888, + "logits/rejected": 1733942272.0, + "logps/chosen": -299.5881618923611, + "logps/rejected": -423.69688197544644, + "loss": 0.228, + "rewards/chosen": 0.9985835817125108, + "rewards/margins": 6.859615250239297, + "rewards/rejected": -5.861031668526786, + "step": 142 + }, + { + "epoch": 0.052789442111577684, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.964006738212574e-06, + "logits/chosen": 1723572224.0, + "logits/rejected": 1716936817.7777777, + "logps/chosen": -316.1324986049107, + "logps/rejected": -445.02039930555554, + "loss": 0.1667, + "rewards/chosen": 0.8416002137320382, + "rewards/margins": 6.770903277018713, + "rewards/rejected": -5.9293030632866754, + "step": 143 + }, + { + "epoch": 0.053158599049420885, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.963298397323443e-06, + "logits/chosen": 1438611828.3636363, + "logits/rejected": 1188689432.3809524, + "logps/chosen": -272.90640536221593, + "logps/rejected": -352.6194661458333, + "loss": 0.1765, + "rewards/chosen": 0.30639527060768823, + "rewards/margins": 5.695366508516915, + "rewards/rejected": -5.388971237909226, + "step": 144 + }, + { + "epoch": 0.053527755987264086, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.962583179847436e-06, + "logits/chosen": 1521943473.2307692, + "logits/rejected": 1792270443.7894738, + "logps/chosen": -295.5730731670673, + "logps/rejected": -548.1491570723684, + "loss": 0.2084, + "rewards/chosen": 0.18294525146484375, + "rewards/margins": 8.166454515959087, + "rewards/rejected": -7.983509264494243, + "step": 145 + }, + { + "epoch": 0.05389691292510729, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.961861086775483e-06, + "logits/chosen": 2049235787.2941177, + "logits/rejected": 2662717849.6, + "logps/chosen": -279.9469784007353, + "logps/rejected": -405.4477864583333, + "loss": 0.2471, + "rewards/chosen": 0.6134995853199678, + "rewards/margins": 7.04523837519627, + "rewards/rejected": -6.431738789876302, + "step": 146 + }, + { + "epoch": 0.05426606986295049, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.961132119108036e-06, + "logits/chosen": 1196804949.3333333, + "logits/rejected": 1285277081.6, + "logps/chosen": -300.0386555989583, + "logps/rejected": -423.7908203125, + "loss": 0.1142, + "rewards/chosen": 1.710240364074707, + "rewards/margins": 7.602990531921387, + "rewards/rejected": -5.89275016784668, + "step": 147 + }, + { + "epoch": 0.05463522680079369, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.960396277855067e-06, + "logits/chosen": 2119628920.4705882, + "logits/rejected": 2497171182.9333334, + "logps/chosen": -256.9636661305147, + "logps/rejected": -425.97486979166666, + "loss": 0.2647, + "rewards/chosen": 0.25106113097246957, + "rewards/margins": 6.095615269156063, + "rewards/rejected": -5.844554138183594, + "step": 148 + }, + { + "epoch": 0.055004383738636885, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.959653564036077e-06, + "logits/chosen": 2687640098.133333, + "logits/rejected": 1640455830.5882354, + "logps/chosen": -258.4703125, + "logps/rejected": -438.32292624080884, + "loss": 0.218, + "rewards/chosen": 0.7572470347086588, + "rewards/margins": 5.740309778849284, + "rewards/rejected": -4.983062744140625, + "step": 149 + }, + { + "epoch": 0.055373540676480086, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.958903978680086e-06, + "logits/chosen": 2401418003.6923075, + "logits/rejected": 1632841943.5789473, + "logps/chosen": -260.27152193509613, + "logps/rejected": -426.1464072779605, + "loss": 0.1984, + "rewards/chosen": 0.3203093455387996, + "rewards/margins": 5.774446244181892, + "rewards/rejected": -5.4541368986430925, + "step": 150 + }, + { + "epoch": 0.05574269761432329, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.958147522825634e-06, + "logits/chosen": 1990153637.6470587, + "logits/rejected": 1811087086.9333334, + "logps/chosen": -309.74445657169116, + "logps/rejected": -384.76666666666665, + "loss": 0.2156, + "rewards/chosen": 0.7645788753733915, + "rewards/margins": 5.694301470588235, + "rewards/rejected": -4.929722595214844, + "step": 151 + }, + { + "epoch": 0.05611185455216649, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.957384197520782e-06, + "logits/chosen": 1929592832.0, + "logits/rejected": 1732439722.6666667, + "logps/chosen": -286.061572265625, + "logps/rejected": -467.577392578125, + "loss": 0.263, + "rewards/chosen": 0.690336275100708, + "rewards/margins": 7.056879091262817, + "rewards/rejected": -6.366542816162109, + "step": 152 + }, + { + "epoch": 0.05648101149000969, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.956614003823107e-06, + "logits/chosen": 2206007296.0, + "logits/rejected": 1737327104.0, + "logps/chosen": -335.0188293457031, + "logps/rejected": -432.6441345214844, + "loss": 0.2165, + "rewards/chosen": 0.5905541777610779, + "rewards/margins": 7.540182054042816, + "rewards/rejected": -6.949627876281738, + "step": 153 + }, + { + "epoch": 0.05685016842785289, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.955836942799704e-06, + "logits/chosen": 1556232055.4666667, + "logits/rejected": 1265479318.5882354, + "logps/chosen": -239.9814453125, + "logps/rejected": -373.8742244944853, + "loss": 0.1936, + "rewards/chosen": 1.0524824778238933, + "rewards/margins": 5.694553472481521, + "rewards/rejected": -4.642070994657629, + "step": 154 + }, + { + "epoch": 0.05721932536569609, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.955053015527178e-06, + "logits/chosen": 1438291558.4, + "logits/rejected": 1913495913.4117646, + "logps/chosen": -264.23564453125, + "logps/rejected": -581.5668658088235, + "loss": 0.1318, + "rewards/chosen": 1.6814014434814453, + "rewards/margins": 8.076269284416647, + "rewards/rejected": -6.394867840935202, + "step": 155 + }, + { + "epoch": 0.057588482303539294, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.954262223091654e-06, + "logits/chosen": 1976908458.6666667, + "logits/rejected": 1801096285.090909, + "logps/chosen": -268.6873837425595, + "logps/rejected": -384.91725852272725, + "loss": 0.2974, + "rewards/chosen": 0.6272712889171782, + "rewards/margins": 5.663436352948606, + "rewards/rejected": -5.036165064031428, + "step": 156 + }, + { + "epoch": 0.057957639241382496, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.953464566588762e-06, + "logits/chosen": 2200508602.181818, + "logits/rejected": 1757060339.8095238, + "logps/chosen": -276.67189719460225, + "logps/rejected": -454.5063709077381, + "loss": 0.1173, + "rewards/chosen": 1.2091949636285955, + "rewards/margins": 8.021958722696676, + "rewards/rejected": -6.812763759068081, + "step": 157 + }, + { + "epoch": 0.05832679617922569, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.952660047123647e-06, + "logits/chosen": 1398910520.8888888, + "logits/rejected": 1652572013.7142856, + "logps/chosen": -277.05620659722223, + "logps/rejected": -458.380859375, + "loss": 0.187, + "rewards/chosen": 1.2973305384318035, + "rewards/margins": 6.729965164547875, + "rewards/rejected": -5.432634626116071, + "step": 158 + }, + { + "epoch": 0.05869595311706889, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.95184866581096e-06, + "logits/chosen": 1755749106.5263157, + "logits/rejected": 964426988.3076923, + "logps/chosen": -292.1627775493421, + "logps/rejected": -317.9696514423077, + "loss": 0.2828, + "rewards/chosen": 0.6497310839201275, + "rewards/margins": 4.934784923970458, + "rewards/rejected": -4.28505384005033, + "step": 159 + }, + { + "epoch": 0.05906511005491209, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.951030423774858e-06, + "logits/chosen": 2644732011.7894735, + "logits/rejected": 1906819859.6923077, + "logps/chosen": -289.92007606907896, + "logps/rejected": -536.0785381610577, + "loss": 0.2074, + "rewards/chosen": 0.7927007173237047, + "rewards/margins": 7.4333603430373465, + "rewards/rejected": -6.640659625713642, + "step": 160 + }, + { + "epoch": 0.059434266992755294, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.950205322149007e-06, + "logits/chosen": 1669616071.1111112, + "logits/rejected": 1819049837.7142856, + "logps/chosen": -280.81385633680554, + "logps/rejected": -491.8168247767857, + "loss": 0.248, + "rewards/chosen": 0.6197273466322157, + "rewards/margins": 7.244415631369939, + "rewards/rejected": -6.624688284737723, + "step": 161 + }, + { + "epoch": 0.059803423930598495, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.949373362076572e-06, + "logits/chosen": 1531173774.2222223, + "logits/rejected": 1973284864.0, + "logps/chosen": -306.318603515625, + "logps/rejected": -511.62479073660717, + "loss": 0.2675, + "rewards/chosen": 0.12365718682607015, + "rewards/margins": 6.651160064197722, + "rewards/rejected": -6.527502877371652, + "step": 162 + }, + { + "epoch": 0.0601725808684417, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.948534544710228e-06, + "logits/chosen": 2057528413.090909, + "logits/rejected": 1475775244.1904762, + "logps/chosen": -353.39450905539775, + "logps/rejected": -348.2158668154762, + "loss": 0.1606, + "rewards/chosen": 1.2516573125665837, + "rewards/margins": 6.118905121113831, + "rewards/rejected": -4.867247808547247, + "step": 163 + }, + { + "epoch": 0.0605417378062849, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.947688871212142e-06, + "logits/chosen": 1549554145.8823528, + "logits/rejected": 1380027869.8666666, + "logps/chosen": -241.73764935661765, + "logps/rejected": -516.6527994791667, + "loss": 0.1799, + "rewards/chosen": 1.4479791977826286, + "rewards/margins": 9.347383087756587, + "rewards/rejected": -7.899403889973958, + "step": 164 + }, + { + "epoch": 0.0609108947441281, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.946836342753982e-06, + "logits/chosen": 1892361485.4736843, + "logits/rejected": 1943659756.3076923, + "logps/chosen": -302.43007298519734, + "logps/rejected": -517.0065730168269, + "loss": 0.2254, + "rewards/chosen": 1.1371040344238281, + "rewards/margins": 6.716646928053636, + "rewards/rejected": -5.5795428936298075, + "step": 165 + }, + { + "epoch": 0.0612800516819713, + "grad_norm": 46.5, + "kl": 0.0, + "learning_rate": 9.945976960516921e-06, + "logits/chosen": 1739663974.4, + "logits/rejected": 1829369002.6666667, + "logps/chosen": -335.09169921875, + "logps/rejected": -617.3716227213541, + "loss": 0.2644, + "rewards/chosen": 0.44480624198913576, + "rewards/margins": 7.539785846074422, + "rewards/rejected": -7.094979604085286, + "step": 166 + }, + { + "epoch": 0.061649208619814495, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.945110725691618e-06, + "logits/chosen": 1619395764.7058823, + "logits/rejected": 1431729902.9333334, + "logps/chosen": -267.6011316636029, + "logps/rejected": -402.2577799479167, + "loss": 0.1864, + "rewards/chosen": 1.2293080722584444, + "rewards/margins": 7.933381151685528, + "rewards/rejected": -6.704073079427084, + "step": 167 + }, + { + "epoch": 0.0620183655576577, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.944237639478232e-06, + "logits/chosen": 1981292916.3636363, + "logits/rejected": 2284940531.8095236, + "logps/chosen": -351.1681463068182, + "logps/rejected": -438.3822079613095, + "loss": 0.2216, + "rewards/chosen": 0.22398909655484286, + "rewards/margins": 5.096196374851904, + "rewards/rejected": -4.872207278297061, + "step": 168 + }, + { + "epoch": 0.0623875224955009, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.943357703086411e-06, + "logits/chosen": 2168239718.4, + "logits/rejected": 2831547934.117647, + "logps/chosen": -262.52356770833336, + "logps/rejected": -499.00080422794116, + "loss": 0.1938, + "rewards/chosen": 1.2805671691894531, + "rewards/margins": 7.524218166575713, + "rewards/rejected": -6.2436509973862595, + "step": 169 + }, + { + "epoch": 0.0627566794333441, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.942470917735299e-06, + "logits/chosen": 1497043041.5238094, + "logits/rejected": 1222771805.090909, + "logps/chosen": -196.6768043154762, + "logps/rejected": -539.0693359375, + "loss": 0.2716, + "rewards/chosen": 0.9215043385823568, + "rewards/margins": 3.808020909627279, + "rewards/rejected": -2.886516571044922, + "step": 170 + }, + { + "epoch": 0.06312583637118731, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.941577284653523e-06, + "logits/chosen": 2424778496.0, + "logits/rejected": 1769192704.0, + "logps/chosen": -352.7112121582031, + "logps/rejected": -445.326171875, + "loss": 0.2518, + "rewards/chosen": 0.4472127854824066, + "rewards/margins": 6.623797506093979, + "rewards/rejected": -6.176584720611572, + "step": 171 + }, + { + "epoch": 0.0634949933090305, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.940676805079201e-06, + "logits/chosen": 1570746496.0, + "logits/rejected": 1724764288.0, + "logps/chosen": -357.9239501953125, + "logps/rejected": -452.7489013671875, + "loss": 0.2256, + "rewards/chosen": 0.6584669351577759, + "rewards/margins": 5.512856602668762, + "rewards/rejected": -4.854389667510986, + "step": 172 + }, + { + "epoch": 0.0638641502468737, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.939769480259937e-06, + "logits/chosen": 1901196580.5714285, + "logits/rejected": 1888412330.6666667, + "logps/chosen": -228.95063127790178, + "logps/rejected": -352.1045735677083, + "loss": 0.1867, + "rewards/chosen": 1.2099661145891463, + "rewards/margins": 6.505925587245397, + "rewards/rejected": -5.29595947265625, + "step": 173 + }, + { + "epoch": 0.0642333071847169, + "grad_norm": 13.9375, + "kl": 0.20199871063232422, + "learning_rate": 9.938855311452818e-06, + "logits/chosen": 1128744374.857143, + "logits/rejected": 1232993652.3636363, + "logps/chosen": -234.67420014880952, + "logps/rejected": -494.70028409090907, + "loss": 0.2564, + "rewards/chosen": 0.9493690672374907, + "rewards/margins": 6.306448313064906, + "rewards/rejected": -5.357079245827415, + "step": 174 + }, + { + "epoch": 0.0646024641225601, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.93793429992441e-06, + "logits/chosen": 2461948586.6666665, + "logits/rejected": 1785004754.8235295, + "logps/chosen": -267.17294921875, + "logps/rejected": -465.64283662683823, + "loss": 0.185, + "rewards/chosen": 0.9911238352457682, + "rewards/margins": 8.411316329357671, + "rewards/rejected": -7.420192494111903, + "step": 175 + }, + { + "epoch": 0.0649716210604033, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.937006446950768e-06, + "logits/chosen": 1653422694.4, + "logits/rejected": 1690656426.6666667, + "logps/chosen": -266.283935546875, + "logps/rejected": -385.8729654947917, + "loss": 0.2755, + "rewards/chosen": 0.7713281154632569, + "rewards/margins": 6.205270910263062, + "rewards/rejected": -5.433942794799805, + "step": 176 + }, + { + "epoch": 0.0653407779982465, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.936071753817416e-06, + "logits/chosen": 2647660182.5882354, + "logits/rejected": 2037809561.6, + "logps/chosen": -324.83498965992646, + "logps/rejected": -528.6254231770833, + "loss": 0.2235, + "rewards/chosen": 0.6302930046530331, + "rewards/margins": 8.15356621835746, + "rewards/rejected": -7.523273213704427, + "step": 177 + }, + { + "epoch": 0.0657099349360897, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.935130221819361e-06, + "logits/chosen": 1682948253.5384614, + "logits/rejected": 2623847585.6842103, + "logps/chosen": -306.7768366887019, + "logps/rejected": -494.61986019736844, + "loss": 0.1497, + "rewards/chosen": 1.2759845440204327, + "rewards/margins": 7.237956027752957, + "rewards/rejected": -5.961971483732524, + "step": 178 + }, + { + "epoch": 0.0660790918739329, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.934181852261084e-06, + "logits/chosen": 1601886617.6, + "logits/rejected": 1737047319.2727273, + "logps/chosen": -289.715087890625, + "logps/rejected": -537.2959428267045, + "loss": 0.0954, + "rewards/chosen": 1.6865406036376953, + "rewards/margins": 9.190039721402254, + "rewards/rejected": -7.50349911776456, + "step": 179 + }, + { + "epoch": 0.0664482488117761, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.93322664645654e-06, + "logits/chosen": 1798584477.5384614, + "logits/rejected": 1927257249.6842105, + "logps/chosen": -245.40478515625, + "logps/rejected": -522.7278988486842, + "loss": 0.1407, + "rewards/chosen": 1.4086092435396635, + "rewards/margins": 9.413722544063923, + "rewards/rejected": -8.00511330052426, + "step": 180 + }, + { + "epoch": 0.06681740574961931, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.932264605729152e-06, + "logits/chosen": 2033656832.0, + "logits/rejected": 2891849113.6, + "logps/chosen": -300.49049886067706, + "logps/rejected": -457.447119140625, + "loss": 0.1939, + "rewards/chosen": 0.5647591749827067, + "rewards/margins": 6.921987263361613, + "rewards/rejected": -6.357228088378906, + "step": 181 + }, + { + "epoch": 0.06718656268746251, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.931295731411819e-06, + "logits/chosen": 1556286403.764706, + "logits/rejected": 1556958412.8, + "logps/chosen": -237.14914119944854, + "logps/rejected": -374.16106770833335, + "loss": 0.2328, + "rewards/chosen": 0.7590243395637063, + "rewards/margins": 6.520092556523341, + "rewards/rejected": -5.761068216959635, + "step": 182 + }, + { + "epoch": 0.06755571962530571, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.930320024846899e-06, + "logits/chosen": 1982816802.1333334, + "logits/rejected": 1769219734.5882354, + "logps/chosen": -258.62159830729166, + "logps/rejected": -375.94450827205884, + "loss": 0.2451, + "rewards/chosen": 0.42381070454915365, + "rewards/margins": 5.4041959799972235, + "rewards/rejected": -4.98038527544807, + "step": 183 + }, + { + "epoch": 0.06792487656314891, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.929337487386225e-06, + "logits/chosen": 1831180151.4666667, + "logits/rejected": 1761786819.764706, + "logps/chosen": -304.87604166666665, + "logps/rejected": -517.9893152573529, + "loss": 0.2191, + "rewards/chosen": 0.47626304626464844, + "rewards/margins": 6.387571334838867, + "rewards/rejected": -5.911308288574219, + "step": 184 + }, + { + "epoch": 0.06829403350099211, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.928348120391087e-06, + "logits/chosen": 2572690500.266667, + "logits/rejected": 2167752101.647059, + "logps/chosen": -261.04521484375, + "logps/rejected": -637.8296760110294, + "loss": 0.1565, + "rewards/chosen": 1.2647260030110676, + "rewards/margins": 9.390314812753715, + "rewards/rejected": -8.125588809742647, + "step": 185 + }, + { + "epoch": 0.06866319043883531, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.927351925232245e-06, + "logits/chosen": 1925735082.6666667, + "logits/rejected": 1461771926.5882354, + "logps/chosen": -311.31324869791666, + "logps/rejected": -433.6963752297794, + "loss": 0.2135, + "rewards/chosen": 0.5511041641235351, + "rewards/margins": 6.438059874141917, + "rewards/rejected": -5.886955710018382, + "step": 186 + }, + { + "epoch": 0.06903234737667852, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.92634890328991e-06, + "logits/chosen": 2131489792.0, + "logits/rejected": 2103618432.0, + "logps/chosen": -301.91326904296875, + "logps/rejected": -339.43524169921875, + "loss": 0.2214, + "rewards/chosen": 0.452073335647583, + "rewards/margins": 5.639359712600708, + "rewards/rejected": -5.187286376953125, + "step": 187 + }, + { + "epoch": 0.06940150431452172, + "grad_norm": 13.1875, + "kl": 0.12229537963867188, + "learning_rate": 9.92533905595376e-06, + "logits/chosen": 1855865675.2941177, + "logits/rejected": 1644342476.8, + "logps/chosen": -312.7223115808824, + "logps/rejected": -438.82958984375, + "loss": 0.2195, + "rewards/chosen": 1.2117050395292395, + "rewards/margins": 6.721602907367782, + "rewards/rejected": -5.509897867838542, + "step": 188 + }, + { + "epoch": 0.06977066125236492, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.924322384622922e-06, + "logits/chosen": 1751344274.2857144, + "logits/rejected": 1878398293.3333333, + "logps/chosen": -321.732666015625, + "logps/rejected": -450.19715711805554, + "loss": 0.1964, + "rewards/chosen": 0.5876798629760742, + "rewards/margins": 5.983964602152507, + "rewards/rejected": -5.396284739176433, + "step": 189 + }, + { + "epoch": 0.0701398181902081, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.923298890705983e-06, + "logits/chosen": 1472814226.2857144, + "logits/rejected": 1493376227.5555556, + "logps/chosen": -320.85654994419644, + "logps/rejected": -483.9733072916667, + "loss": 0.1421, + "rewards/chosen": 1.493065425327846, + "rewards/margins": 8.704915061829581, + "rewards/rejected": -7.211849636501736, + "step": 190 + }, + { + "epoch": 0.07050897512805131, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.922268575620981e-06, + "logits/chosen": 1411139824.9411764, + "logits/rejected": 2175930094.9333334, + "logps/chosen": -259.9036075367647, + "logps/rejected": -476.1249674479167, + "loss": 0.2209, + "rewards/chosen": 0.5080053666058708, + "rewards/margins": 6.565353490792068, + "rewards/rejected": -6.0573481241861975, + "step": 191 + }, + { + "epoch": 0.07087813206589451, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.921231440795404e-06, + "logits/chosen": 1411518621.5384614, + "logits/rejected": 1857303605.8947368, + "logps/chosen": -264.09021935096155, + "logps/rejected": -491.94443873355266, + "loss": 0.1887, + "rewards/chosen": 1.1723593931931715, + "rewards/margins": 8.337286022510606, + "rewards/rejected": -7.164926629317434, + "step": 192 + }, + { + "epoch": 0.07124728900373771, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.92018748766619e-06, + "logits/chosen": 1469235200.0, + "logits/rejected": 1730677418.6666667, + "logps/chosen": -292.8130615234375, + "logps/rejected": -502.1977945963542, + "loss": 0.1947, + "rewards/chosen": 1.605323600769043, + "rewards/margins": 7.637721188863118, + "rewards/rejected": -6.032397588094075, + "step": 193 + }, + { + "epoch": 0.07161644594158091, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.919136717679723e-06, + "logits/chosen": 1489188096.0, + "logits/rejected": 1393334528.0, + "logps/chosen": -254.79714965820312, + "logps/rejected": -382.0877990722656, + "loss": 0.1984, + "rewards/chosen": 0.8846181035041809, + "rewards/margins": 6.8402188420295715, + "rewards/rejected": -5.955600738525391, + "step": 194 + }, + { + "epoch": 0.07198560287942411, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.918079132291828e-06, + "logits/chosen": 1967694787.764706, + "logits/rejected": 1463822199.4666667, + "logps/chosen": -250.33412798713235, + "logps/rejected": -340.72864583333336, + "loss": 0.2111, + "rewards/chosen": 0.7158857233384076, + "rewards/margins": 6.207194317088408, + "rewards/rejected": -5.49130859375, + "step": 195 + }, + { + "epoch": 0.07235475981726731, + "grad_norm": 13.0625, + "kl": 0.6617727279663086, + "learning_rate": 9.917014732967782e-06, + "logits/chosen": 1367304192.0, + "logits/rejected": 1434894677.3333333, + "logps/chosen": -328.46407645089283, + "logps/rejected": -423.6180013020833, + "loss": 0.1493, + "rewards/chosen": 1.8524954659598214, + "rewards/margins": 8.105687519860647, + "rewards/rejected": -6.2531920539008246, + "step": 196 + }, + { + "epoch": 0.07272391675511052, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.915943521182292e-06, + "logits/chosen": 2595679280.7619047, + "logits/rejected": 2466666309.818182, + "logps/chosen": -294.67013113839283, + "logps/rejected": -320.38427734375, + "loss": 0.2736, + "rewards/chosen": 1.0027737390427363, + "rewards/margins": 5.51562059906138, + "rewards/rejected": -4.5128468600186435, + "step": 197 + }, + { + "epoch": 0.07309307369295372, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.91486549841951e-06, + "logits/chosen": 1599045847.5789473, + "logits/rejected": 1839370870.1538463, + "logps/chosen": -195.8356291118421, + "logps/rejected": -439.2619816706731, + "loss": 0.1594, + "rewards/chosen": 1.6159053601716693, + "rewards/margins": 8.587747025586333, + "rewards/rejected": -6.971841665414663, + "step": 198 + }, + { + "epoch": 0.07346223063079692, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.913780666173022e-06, + "logits/chosen": 1744811008.0, + "logits/rejected": 1738365696.0, + "logps/chosen": -280.6553039550781, + "logps/rejected": -496.47833251953125, + "loss": 0.2129, + "rewards/chosen": 0.6269842386245728, + "rewards/margins": 6.745821595191956, + "rewards/rejected": -6.118837356567383, + "step": 199 + }, + { + "epoch": 0.07383138756864012, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.912689025945851e-06, + "logits/chosen": 1731220772.5714285, + "logits/rejected": 1485328270.2222223, + "logps/chosen": -295.63528878348217, + "logps/rejected": -421.65370008680554, + "loss": 0.2287, + "rewards/chosen": 0.35897983823503765, + "rewards/margins": 5.639596598488944, + "rewards/rejected": -5.280616760253906, + "step": 200 + }, + { + "epoch": 0.07420054450648332, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.91159057925045e-06, + "logits/chosen": 1223482481.7777777, + "logits/rejected": 1530489709.7142856, + "logps/chosen": -262.52924262152777, + "logps/rejected": -473.86781529017856, + "loss": 0.2419, + "rewards/chosen": 0.4861944516499837, + "rewards/margins": 8.065860589345297, + "rewards/rejected": -7.5796661376953125, + "step": 201 + }, + { + "epoch": 0.07456970144432652, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.910485327608702e-06, + "logits/chosen": 2591656251.076923, + "logits/rejected": 2065203846.7368422, + "logps/chosen": -229.14823091947116, + "logps/rejected": -489.9150390625, + "loss": 0.157, + "rewards/chosen": 0.9946117401123047, + "rewards/margins": 6.7041474392539575, + "rewards/rejected": -5.709535699141653, + "step": 202 + }, + { + "epoch": 0.07493885838216972, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.909373272551919e-06, + "logits/chosen": 1617803904.0, + "logits/rejected": 2005531008.0, + "logps/chosen": -297.0911865234375, + "logps/rejected": -520.767822265625, + "loss": 0.2081, + "rewards/chosen": 0.7357272505760193, + "rewards/margins": 6.25931590795517, + "rewards/rejected": -5.52358865737915, + "step": 203 + }, + { + "epoch": 0.07530801532001292, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.90825441562084e-06, + "logits/chosen": 1821062197.8947368, + "logits/rejected": 1995083460.9230769, + "logps/chosen": -340.32421875, + "logps/rejected": -447.14400540865387, + "loss": 0.2485, + "rewards/chosen": 0.5422391891479492, + "rewards/margins": 6.957692586458647, + "rewards/rejected": -6.4154533973106975, + "step": 204 + }, + { + "epoch": 0.07567717225785613, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.907128758365627e-06, + "logits/chosen": 1669836492.8, + "logits/rejected": 1947845973.3333333, + "logps/chosen": -264.425830078125, + "logps/rejected": -527.2880452473959, + "loss": 0.2248, + "rewards/chosen": 1.1608144760131835, + "rewards/margins": 6.972369702657064, + "rewards/rejected": -5.81155522664388, + "step": 205 + }, + { + "epoch": 0.07604632919569933, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.905996302345863e-06, + "logits/chosen": 1922174020.2666667, + "logits/rejected": 2222210831.0588236, + "logps/chosen": -267.587939453125, + "logps/rejected": -508.81858915441177, + "loss": 0.155, + "rewards/chosen": 1.1608455657958985, + "rewards/margins": 9.207422570621267, + "rewards/rejected": -8.046577004825368, + "step": 206 + }, + { + "epoch": 0.07641548613354253, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.904857049130553e-06, + "logits/chosen": 1536168263.68, + "logits/rejected": 1545219218.2857144, + "logps/chosen": -278.0503515625, + "logps/rejected": -670.1974051339286, + "loss": 0.2941, + "rewards/chosen": 0.8895104217529297, + "rewards/margins": 11.655464575631278, + "rewards/rejected": -10.765954153878349, + "step": 207 + }, + { + "epoch": 0.07678464307138572, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.903711000298118e-06, + "logits/chosen": 1557965952.0, + "logits/rejected": 1031605952.0, + "logps/chosen": -342.5038146972656, + "logps/rejected": -415.4825439453125, + "loss": 0.2119, + "rewards/chosen": 0.7806885838508606, + "rewards/margins": 7.948217689990997, + "rewards/rejected": -7.167529106140137, + "step": 208 + }, + { + "epoch": 0.07715380000922892, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.902558157436392e-06, + "logits/chosen": 2105853952.0, + "logits/rejected": 1438852778.6666667, + "logps/chosen": -318.1811767578125, + "logps/rejected": -373.058349609375, + "loss": 0.2836, + "rewards/chosen": 0.5601299285888672, + "rewards/margins": 5.551716613769531, + "rewards/rejected": -4.991586685180664, + "step": 209 + }, + { + "epoch": 0.07752295694707212, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.901398522142624e-06, + "logits/chosen": 1771273420.8, + "logits/rejected": 2357349677.1764708, + "logps/chosen": -367.66998697916665, + "logps/rejected": -511.3906824448529, + "loss": 0.2003, + "rewards/chosen": 0.5893281936645508, + "rewards/margins": 7.0993271883796245, + "rewards/rejected": -6.509998994715073, + "step": 210 + }, + { + "epoch": 0.07789211388491532, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.900232096023478e-06, + "logits/chosen": 1714014754.1333334, + "logits/rejected": 1482416007.5294118, + "logps/chosen": -197.12638346354166, + "logps/rejected": -477.98928653492646, + "loss": 0.1687, + "rewards/chosen": 1.0620689392089844, + "rewards/margins": 8.536500594195196, + "rewards/rejected": -7.474431654986213, + "step": 211 + }, + { + "epoch": 0.07826127082275852, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.899058880695019e-06, + "logits/chosen": 1908046506.6666667, + "logits/rejected": 3238210379.2941175, + "logps/chosen": -291.6744140625, + "logps/rejected": -380.1985294117647, + "loss": 0.1727, + "rewards/chosen": 1.3485973358154297, + "rewards/margins": 6.66027784908519, + "rewards/rejected": -5.311680513269761, + "step": 212 + }, + { + "epoch": 0.07863042776060172, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.89787887778272e-06, + "logits/chosen": 1433902421.3333333, + "logits/rejected": 1510644073.4117646, + "logps/chosen": -282.03053385416666, + "logps/rejected": -396.88220932904414, + "loss": 0.1759, + "rewards/chosen": 1.2443155924479166, + "rewards/margins": 6.28171937231924, + "rewards/rejected": -5.037403779871323, + "step": 213 + }, + { + "epoch": 0.07899958469844492, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.896692088921466e-06, + "logits/chosen": 2571602147.5555553, + "logits/rejected": 2169690843.428571, + "logps/chosen": -283.45494249131946, + "logps/rejected": -565.0706263950893, + "loss": 0.187, + "rewards/chosen": 1.0868806838989258, + "rewards/margins": 8.722324235098704, + "rewards/rejected": -7.635443551199777, + "step": 214 + }, + { + "epoch": 0.07936874163628813, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.895498515755532e-06, + "logits/chosen": 2180546087.3846154, + "logits/rejected": 1674937290.1052632, + "logps/chosen": -240.181640625, + "logps/rejected": -494.97291324013156, + "loss": 0.1915, + "rewards/chosen": 0.6344446769127479, + "rewards/margins": 7.282570576378209, + "rewards/rejected": -6.648125899465461, + "step": 215 + }, + { + "epoch": 0.07973789857413133, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.894298159938605e-06, + "logits/chosen": 2025556992.0, + "logits/rejected": 1648880469.3333333, + "logps/chosen": -306.396533203125, + "logps/rejected": -387.446044921875, + "loss": 0.2603, + "rewards/chosen": 0.7764100551605224, + "rewards/margins": 6.467584880193074, + "rewards/rejected": -5.691174825032552, + "step": 216 + }, + { + "epoch": 0.08010705551197453, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.893091023133756e-06, + "logits/chosen": 2599144886.857143, + "logits/rejected": 1580353536.0, + "logps/chosen": -284.59225027901783, + "logps/rejected": -435.24305555555554, + "loss": 0.2221, + "rewards/chosen": 0.38711816923958914, + "rewards/margins": 6.742737387853956, + "rewards/rejected": -6.3556192186143665, + "step": 217 + }, + { + "epoch": 0.08047621244981773, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.891877107013461e-06, + "logits/chosen": 1952478412.8, + "logits/rejected": 2008730925.1764705, + "logps/chosen": -313.202734375, + "logps/rejected": -541.3700597426471, + "loss": 0.1945, + "rewards/chosen": 0.6368071873982747, + "rewards/margins": 6.487811395233753, + "rewards/rejected": -5.851004207835478, + "step": 218 + }, + { + "epoch": 0.08084536938766093, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.890656413259585e-06, + "logits/chosen": 1868918647.4666667, + "logits/rejected": 2070145385.4117646, + "logps/chosen": -339.6949869791667, + "logps/rejected": -409.09719669117646, + "loss": 0.219, + "rewards/chosen": 0.47541306813557943, + "rewards/margins": 6.362307294209798, + "rewards/rejected": -5.886894226074219, + "step": 219 + }, + { + "epoch": 0.08121452632550413, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.889428943563382e-06, + "logits/chosen": 2613116050.285714, + "logits/rejected": 1694930375.1111112, + "logps/chosen": -291.2520228794643, + "logps/rejected": -371.1877170138889, + "loss": 0.1896, + "rewards/chosen": 1.125910827091762, + "rewards/margins": 6.569520276690286, + "rewards/rejected": -5.443609449598524, + "step": 220 + }, + { + "epoch": 0.08158368326334733, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.888194699625499e-06, + "logits/chosen": 2747854392.888889, + "logits/rejected": 1755361426.2857144, + "logps/chosen": -320.61073133680554, + "logps/rejected": -453.8185337611607, + "loss": 0.2227, + "rewards/chosen": 0.7362472746107314, + "rewards/margins": 7.750842851305765, + "rewards/rejected": -7.014595576695034, + "step": 221 + }, + { + "epoch": 0.08195284020119054, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.886953683155964e-06, + "logits/chosen": 1992880128.0, + "logits/rejected": 1669637120.0, + "logps/chosen": -332.87245008680554, + "logps/rejected": -551.91552734375, + "loss": 0.2056, + "rewards/chosen": 1.164188067118327, + "rewards/margins": 7.371108690897624, + "rewards/rejected": -6.206920623779297, + "step": 222 + }, + { + "epoch": 0.08232199713903374, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.885705895874188e-06, + "logits/chosen": 2185397816.888889, + "logits/rejected": 2089003008.0, + "logps/chosen": -322.3407931857639, + "logps/rejected": -458.33119419642856, + "loss": 0.2248, + "rewards/chosen": 0.7197759946187338, + "rewards/margins": 7.195091633569627, + "rewards/rejected": -6.475315638950893, + "step": 223 + }, + { + "epoch": 0.08269115407687694, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.884451339508967e-06, + "logits/chosen": 2107596526.9333334, + "logits/rejected": 2767462159.0588236, + "logps/chosen": -279.2965494791667, + "logps/rejected": -449.5297277113971, + "loss": 0.1588, + "rewards/chosen": 1.315366236368815, + "rewards/margins": 7.897823834886738, + "rewards/rejected": -6.582457598517923, + "step": 224 + }, + { + "epoch": 0.08306031101472014, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.883190015798469e-06, + "logits/chosen": 1882893390.7692308, + "logits/rejected": 2124771112.4210527, + "logps/chosen": -216.56268780048077, + "logps/rejected": -520.6907894736842, + "loss": 0.154, + "rewards/chosen": 1.7223923022930439, + "rewards/margins": 7.329244783532764, + "rewards/rejected": -5.60685248123972, + "step": 225 + }, + { + "epoch": 0.08342946795256333, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.881921926490245e-06, + "logits/chosen": 1618491392.0, + "logits/rejected": 1681094016.0, + "logps/chosen": -321.0919189453125, + "logps/rejected": -514.27099609375, + "loss": 0.2249, + "rewards/chosen": 0.5107834935188293, + "rewards/margins": 6.200155079364777, + "rewards/rejected": -5.689371585845947, + "step": 226 + }, + { + "epoch": 0.08379862489040653, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.880647073341219e-06, + "logits/chosen": 1637272576.0, + "logits/rejected": 1694106331.4285715, + "logps/chosen": -310.8017306857639, + "logps/rejected": -432.19932338169644, + "loss": 0.2295, + "rewards/chosen": 1.2232177522447374, + "rewards/margins": 7.278936310419961, + "rewards/rejected": -6.055718558175223, + "step": 227 + }, + { + "epoch": 0.08416778182824973, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.879365458117678e-06, + "logits/chosen": 2029691611.4285715, + "logits/rejected": 1335902776.8888888, + "logps/chosen": -322.3362513950893, + "logps/rejected": -414.431884765625, + "loss": 0.216, + "rewards/chosen": 0.31933518818446566, + "rewards/margins": 4.937679813021705, + "rewards/rejected": -4.618344624837239, + "step": 228 + }, + { + "epoch": 0.08453693876609293, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.878077082595287e-06, + "logits/chosen": 1970508706.909091, + "logits/rejected": 2426740736.0, + "logps/chosen": -280.591796875, + "logps/rejected": -514.5986328125, + "loss": 0.123, + "rewards/chosen": 1.1186904907226562, + "rewards/margins": 8.958988371349516, + "rewards/rejected": -7.84029788062686, + "step": 229 + }, + { + "epoch": 0.08490609570393613, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.876781948559073e-06, + "logits/chosen": 2207755806.117647, + "logits/rejected": 1596277555.2, + "logps/chosen": -279.7059972426471, + "logps/rejected": -469.0556640625, + "loss": 0.1721, + "rewards/chosen": 1.3132118898279526, + "rewards/margins": 7.817259537939932, + "rewards/rejected": -6.504047648111979, + "step": 230 + }, + { + "epoch": 0.08527525264177933, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.87548005780343e-06, + "logits/chosen": 1703355245.7142856, + "logits/rejected": 2056368583.1111112, + "logps/chosen": -299.69859095982144, + "logps/rejected": -474.4347330729167, + "loss": 0.163, + "rewards/chosen": 1.37884521484375, + "rewards/margins": 8.054289923773872, + "rewards/rejected": -6.675444708930121, + "step": 231 + }, + { + "epoch": 0.08564440957962254, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.874171412132107e-06, + "logits/chosen": 1956650097.7777777, + "logits/rejected": 1761728219.4285715, + "logps/chosen": -324.7473415798611, + "logps/rejected": -478.4915248325893, + "loss": 0.2467, + "rewards/chosen": 0.3728671073913574, + "rewards/margins": 7.447789941515241, + "rewards/rejected": -7.074922834123884, + "step": 232 + }, + { + "epoch": 0.08601356651746574, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.872856013358219e-06, + "logits/chosen": 2656566923.6363635, + "logits/rejected": 1751575795.8095238, + "logps/chosen": -257.83780184659093, + "logps/rejected": -421.71405319940476, + "loss": 0.1178, + "rewards/chosen": 1.272094813260165, + "rewards/margins": 7.441989077118052, + "rewards/rejected": -6.169894263857887, + "step": 233 + }, + { + "epoch": 0.08638272345530894, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.871533863304234e-06, + "logits/chosen": 1777459768.8888888, + "logits/rejected": 1737914806.857143, + "logps/chosen": -281.3489583333333, + "logps/rejected": -475.93739536830356, + "loss": 0.1926, + "rewards/chosen": 1.9016390906439886, + "rewards/margins": 9.053498919048007, + "rewards/rejected": -7.151859828404018, + "step": 234 + }, + { + "epoch": 0.08675188039315214, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.870204963801974e-06, + "logits/chosen": 1908824502.857143, + "logits/rejected": 2459854848.0, + "logps/chosen": -274.90096609933033, + "logps/rejected": -539.6013454861111, + "loss": 0.2095, + "rewards/chosen": 0.3777961390359061, + "rewards/margins": 6.885999433578007, + "rewards/rejected": -6.508203294542101, + "step": 235 + }, + { + "epoch": 0.08712103733099534, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.86886931669261e-06, + "logits/chosen": 1987539968.0, + "logits/rejected": 1767886592.0, + "logps/chosen": -285.40362548828125, + "logps/rejected": -402.8817138671875, + "loss": 0.1882, + "rewards/chosen": 0.8615530729293823, + "rewards/margins": 7.9712032079696655, + "rewards/rejected": -7.109650135040283, + "step": 236 + }, + { + "epoch": 0.08749019426883854, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.867526923826668e-06, + "logits/chosen": 1903570397.8666666, + "logits/rejected": 1530725556.7058823, + "logps/chosen": -330.64361979166665, + "logps/rejected": -511.91796875, + "loss": 0.165, + "rewards/chosen": 1.117876942952474, + "rewards/margins": 8.44961378808115, + "rewards/rejected": -7.331736845128677, + "step": 237 + }, + { + "epoch": 0.08785935120668174, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.866177787064013e-06, + "logits/chosen": 2198547275.2941175, + "logits/rejected": 1873264093.8666666, + "logps/chosen": -288.7004825367647, + "logps/rejected": -511.0263671875, + "loss": 0.1863, + "rewards/chosen": 1.0603486229391659, + "rewards/margins": 7.821169138889687, + "rewards/rejected": -6.760820515950521, + "step": 238 + }, + { + "epoch": 0.08822850814452494, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.864821908273861e-06, + "logits/chosen": 1772254939.4285715, + "logits/rejected": 1570908615.1111112, + "logps/chosen": -258.10878208705356, + "logps/rejected": -511.54291449652777, + "loss": 0.186, + "rewards/chosen": 0.627357006072998, + "rewards/margins": 9.116165532006157, + "rewards/rejected": -8.488808525933159, + "step": 239 + }, + { + "epoch": 0.08859766508236815, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.863459289334758e-06, + "logits/chosen": 2675938099.2, + "logits/rejected": 1956669560.4705882, + "logps/chosen": -298.59560546875, + "logps/rejected": -542.7853285845588, + "loss": 0.1963, + "rewards/chosen": 0.7494485219319661, + "rewards/margins": 7.094848715090285, + "rewards/rejected": -6.345400193158318, + "step": 240 + }, + { + "epoch": 0.08896682202021135, + "grad_norm": 11.8125, + "kl": 0.9455204010009766, + "learning_rate": 9.862089932134601e-06, + "logits/chosen": 1215964774.4, + "logits/rejected": 1623221187.764706, + "logps/chosen": -215.22179361979167, + "logps/rejected": -376.16417738970586, + "loss": 0.1947, + "rewards/chosen": 1.438476816813151, + "rewards/margins": 6.256367657231349, + "rewards/rejected": -4.817890840418198, + "step": 241 + }, + { + "epoch": 0.08933597895805455, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.860713838570616e-06, + "logits/chosen": 2768228592.9411764, + "logits/rejected": 2219111901.866667, + "logps/chosen": -219.66696346507354, + "logps/rejected": -476.97369791666665, + "loss": 0.1999, + "rewards/chosen": 0.9496830210966223, + "rewards/margins": 7.9543725556018305, + "rewards/rejected": -7.004689534505208, + "step": 242 + }, + { + "epoch": 0.08970513589589775, + "grad_norm": 12.5625, + "kl": 0.3970003128051758, + "learning_rate": 9.859331010549362e-06, + "logits/chosen": 2242105051.428571, + "logits/rejected": 1974337536.0, + "logps/chosen": -245.67314801897322, + "logps/rejected": -436.80360243055554, + "loss": 0.1929, + "rewards/chosen": 0.5280186789376395, + "rewards/margins": 6.862357820783343, + "rewards/rejected": -6.334339141845703, + "step": 243 + }, + { + "epoch": 0.09007429283374094, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.85794144998673e-06, + "logits/chosen": 1540820081.7777777, + "logits/rejected": 1823759506.2857144, + "logps/chosen": -221.62223307291666, + "logps/rejected": -467.1923130580357, + "loss": 0.192, + "rewards/chosen": 1.3148587544759114, + "rewards/margins": 7.734915778750465, + "rewards/rejected": -6.420057024274554, + "step": 244 + }, + { + "epoch": 0.09044344977158414, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.856545158807938e-06, + "logits/chosen": 1524417194.6666667, + "logits/rejected": 1303430445.1764705, + "logps/chosen": -246.04095052083332, + "logps/rejected": -368.9867302389706, + "loss": 0.2207, + "rewards/chosen": 0.3987483342488607, + "rewards/margins": 5.639583636265176, + "rewards/rejected": -5.240835302016315, + "step": 245 + }, + { + "epoch": 0.09081260670942734, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.855142138947532e-06, + "logits/chosen": 1980737945.6, + "logits/rejected": 1801239552.0, + "logps/chosen": -243.0710205078125, + "logps/rejected": -360.29296875, + "loss": 0.1918, + "rewards/chosen": 1.7831703186035157, + "rewards/margins": 7.3502250671386715, + "rewards/rejected": -5.567054748535156, + "step": 246 + }, + { + "epoch": 0.09118176364727054, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.853732392349376e-06, + "logits/chosen": 2048307200.0, + "logits/rejected": 1606512399.0588236, + "logps/chosen": -254.74156901041667, + "logps/rejected": -449.9626034007353, + "loss": 0.1953, + "rewards/chosen": 1.150223159790039, + "rewards/margins": 7.08902733746697, + "rewards/rejected": -5.93880417767693, + "step": 247 + }, + { + "epoch": 0.09155092058511374, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.852315920966653e-06, + "logits/chosen": 2144834323.6923077, + "logits/rejected": 1339052786.5263157, + "logps/chosen": -312.3495342548077, + "logps/rejected": -418.9804944490132, + "loss": 0.2206, + "rewards/chosen": 0.19657709048344538, + "rewards/margins": 5.523775319821438, + "rewards/rejected": -5.327198229337993, + "step": 248 + }, + { + "epoch": 0.09192007752295694, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.850892726761874e-06, + "logits/chosen": 1366033347.764706, + "logits/rejected": 1395864507.7333333, + "logps/chosen": -241.67718864889707, + "logps/rejected": -455.53831380208334, + "loss": 0.1852, + "rewards/chosen": 1.348412457634421, + "rewards/margins": 7.927757652133119, + "rewards/rejected": -6.579345194498698, + "step": 249 + }, + { + "epoch": 0.09228923446080015, + "grad_norm": 12.125, + "kl": 0.993544340133667, + "learning_rate": 9.84946281170685e-06, + "logits/chosen": 2226978084.571429, + "logits/rejected": 2548566698.6666665, + "logps/chosen": -301.3502720424107, + "logps/rejected": -424.406982421875, + "loss": 0.1512, + "rewards/chosen": 1.3940033231462752, + "rewards/margins": 6.916240041218106, + "rewards/rejected": -5.522236718071832, + "step": 250 + }, + { + "epoch": 0.09265839139864335, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.848026177782713e-06, + "logits/chosen": 2367221113.263158, + "logits/rejected": 2140729974.1538463, + "logps/chosen": -260.51870888157896, + "logps/rejected": -555.0961162860577, + "loss": 0.1969, + "rewards/chosen": 1.2923580972771895, + "rewards/margins": 9.751017342694857, + "rewards/rejected": -8.458659245417667, + "step": 251 + }, + { + "epoch": 0.09302754833648655, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.846582826979899e-06, + "logits/chosen": 1431866368.0, + "logits/rejected": 1509573973.3333333, + "logps/chosen": -305.49595424107144, + "logps/rejected": -499.3811848958333, + "loss": 0.1722, + "rewards/chosen": 0.7817914145333427, + "rewards/margins": 7.3967040833972755, + "rewards/rejected": -6.614912668863933, + "step": 252 + }, + { + "epoch": 0.09339670527432975, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.845132761298154e-06, + "logits/chosen": 1868903693.4736843, + "logits/rejected": 1952465053.5384614, + "logps/chosen": -303.1103515625, + "logps/rejected": -490.50060096153845, + "loss": 0.1811, + "rewards/chosen": 1.359496467991879, + "rewards/margins": 8.57860798391736, + "rewards/rejected": -7.219111515925481, + "step": 253 + }, + { + "epoch": 0.09376586221217295, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.843675982746526e-06, + "logits/chosen": 2268049723.076923, + "logits/rejected": 2117635449.2631578, + "logps/chosen": -303.2747145432692, + "logps/rejected": -470.2289782072368, + "loss": 0.1674, + "rewards/chosen": 1.4120873671311598, + "rewards/margins": 8.027750594413233, + "rewards/rejected": -6.615663227282073, + "step": 254 + }, + { + "epoch": 0.09413501915001615, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.84221249334336e-06, + "logits/chosen": 1693802496.0, + "logits/rejected": 2199852732.631579, + "logps/chosen": -232.17328350360577, + "logps/rejected": -396.2252775493421, + "loss": 0.1011, + "rewards/chosen": 2.0058023012601414, + "rewards/margins": 8.063178158964705, + "rewards/rejected": -6.057375857704564, + "step": 255 + }, + { + "epoch": 0.09450417608785935, + "grad_norm": 14.1875, + "kl": 0.4275493621826172, + "learning_rate": 9.840742295116306e-06, + "logits/chosen": 1124594748.235294, + "logits/rejected": 1288799300.2666667, + "logps/chosen": -273.73161764705884, + "logps/rejected": -364.12021484375, + "loss": 0.2001, + "rewards/chosen": 1.1605049582088696, + "rewards/margins": 5.961187026079964, + "rewards/rejected": -4.800682067871094, + "step": 256 + }, + { + "epoch": 0.09487333302570256, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.839265390102301e-06, + "logits/chosen": 1764619264.0, + "logits/rejected": 1703910297.6, + "logps/chosen": -251.419921875, + "logps/rejected": -416.073974609375, + "loss": 0.1462, + "rewards/chosen": 1.4828344980875652, + "rewards/margins": 8.041036097208659, + "rewards/rejected": -6.558201599121094, + "step": 257 + }, + { + "epoch": 0.09524248996354576, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.837781780347584e-06, + "logits/chosen": 1664917661.5384614, + "logits/rejected": 1781539570.5263157, + "logps/chosen": -295.6209247295673, + "logps/rejected": -516.7000925164474, + "loss": 0.173, + "rewards/chosen": 0.6074054791377141, + "rewards/margins": 8.232161338512714, + "rewards/rejected": -7.624755859375, + "step": 258 + }, + { + "epoch": 0.09561164690138896, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.836291467907671e-06, + "logits/chosen": 1723986688.0, + "logits/rejected": 2695691264.0, + "logps/chosen": -255.94284057617188, + "logps/rejected": -430.0029602050781, + "loss": 0.2381, + "rewards/chosen": 0.3577594459056854, + "rewards/margins": 5.885409325361252, + "rewards/rejected": -5.527649879455566, + "step": 259 + }, + { + "epoch": 0.09598080383923216, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.834794454847373e-06, + "logits/chosen": 2283068484.266667, + "logits/rejected": 2248847841.882353, + "logps/chosen": -241.690234375, + "logps/rejected": -524.4182559742648, + "loss": 0.1234, + "rewards/chosen": 1.905566151936849, + "rewards/margins": 10.329769463632621, + "rewards/rejected": -8.424203311695772, + "step": 260 + }, + { + "epoch": 0.09634996077707536, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.833290743240785e-06, + "logits/chosen": 1639894667.6363637, + "logits/rejected": 1420454960.7619047, + "logps/chosen": -329.5035511363636, + "logps/rejected": -456.81715029761904, + "loss": 0.1125, + "rewards/chosen": 1.0936532454057173, + "rewards/margins": 8.699429351013976, + "rewards/rejected": -7.605776105608259, + "step": 261 + }, + { + "epoch": 0.09671911771491855, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.83178033517128e-06, + "logits/chosen": 1432696978.2857144, + "logits/rejected": 1841446684.4444444, + "logps/chosen": -273.81663295200894, + "logps/rejected": -421.1220974392361, + "loss": 0.1934, + "rewards/chosen": 0.8194854600088937, + "rewards/margins": 7.565872517843095, + "rewards/rejected": -6.746387057834202, + "step": 262 + }, + { + "epoch": 0.09708827465276175, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.830263232731505e-06, + "logits/chosen": 2048985673.142857, + "logits/rejected": 1535940266.6666667, + "logps/chosen": -267.709228515625, + "logps/rejected": -486.6457248263889, + "loss": 0.145, + "rewards/chosen": 1.7929661614554269, + "rewards/margins": 9.245319744897268, + "rewards/rejected": -7.45235358344184, + "step": 263 + }, + { + "epoch": 0.09745743159060495, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.82873943802339e-06, + "logits/chosen": 1753461940.7058823, + "logits/rejected": 1516693367.4666667, + "logps/chosen": -344.5571863511029, + "logps/rejected": -535.6671875, + "loss": 0.2285, + "rewards/chosen": 0.352659590104047, + "rewards/margins": 8.420944705663942, + "rewards/rejected": -8.068285115559895, + "step": 264 + }, + { + "epoch": 0.09782658852844815, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.827208953158132e-06, + "logits/chosen": 1843572495.0588236, + "logits/rejected": 2058218700.8, + "logps/chosen": -294.9776826746324, + "logps/rejected": -391.48251953125, + "loss": 0.2896, + "rewards/chosen": 0.24041147793040557, + "rewards/margins": 6.456691079981186, + "rewards/rejected": -6.216279602050781, + "step": 265 + }, + { + "epoch": 0.09819574546629135, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.825671780256197e-06, + "logits/chosen": 1811588189.090909, + "logits/rejected": 1754528621.7142856, + "logps/chosen": -280.61849698153407, + "logps/rejected": -429.42987351190476, + "loss": 0.1567, + "rewards/chosen": 0.6178337443958629, + "rewards/margins": 7.383796617582247, + "rewards/rejected": -6.765962873186384, + "step": 266 + }, + { + "epoch": 0.09856490240413456, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.824127921447321e-06, + "logits/chosen": 2296939776.0, + "logits/rejected": 2201672448.0, + "logps/chosen": -262.82366943359375, + "logps/rejected": -359.08526611328125, + "loss": 0.2166, + "rewards/chosen": 0.47066012024879456, + "rewards/margins": 6.092517286539078, + "rewards/rejected": -5.621857166290283, + "step": 267 + }, + { + "epoch": 0.09893405934197776, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.822577378870502e-06, + "logits/chosen": 2328476262.4, + "logits/rejected": 1703655765.3333333, + "logps/chosen": -218.2671630859375, + "logps/rejected": -515.38330078125, + "loss": 0.2266, + "rewards/chosen": 0.9317961692810058, + "rewards/margins": 8.50515661239624, + "rewards/rejected": -7.573360443115234, + "step": 268 + }, + { + "epoch": 0.09930321627982096, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.82102015467399e-06, + "logits/chosen": 1978378624.0, + "logits/rejected": 1710202880.0, + "logps/chosen": -322.7784118652344, + "logps/rejected": -468.9908752441406, + "loss": 0.2167, + "rewards/chosen": 0.6909960508346558, + "rewards/margins": 6.601669192314148, + "rewards/rejected": -5.910673141479492, + "step": 269 + }, + { + "epoch": 0.09967237321766416, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.81945625101531e-06, + "logits/chosen": 3050243998.4761906, + "logits/rejected": 2361141992.7272725, + "logps/chosen": -345.8775111607143, + "logps/rejected": -397.1536310369318, + "loss": 0.2298, + "rewards/chosen": 0.94533447992234, + "rewards/margins": 6.897675733029584, + "rewards/rejected": -5.952341253107244, + "step": 270 + }, + { + "epoch": 0.10004153015550736, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.81788567006122e-06, + "logits/chosen": 1528746120.5333333, + "logits/rejected": 1531104677.6470587, + "logps/chosen": -241.090234375, + "logps/rejected": -482.3291590073529, + "loss": 0.2172, + "rewards/chosen": 0.39494520823160806, + "rewards/margins": 6.7609236735923615, + "rewards/rejected": -6.365978465360754, + "step": 271 + }, + { + "epoch": 0.10041068709335056, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.816308413987747e-06, + "logits/chosen": 2232662339.368421, + "logits/rejected": 2012502173.5384614, + "logps/chosen": -285.95972964638156, + "logps/rejected": -404.50672325721155, + "loss": 0.2036, + "rewards/chosen": 1.2899169921875, + "rewards/margins": 7.265409616323618, + "rewards/rejected": -5.975492624136118, + "step": 272 + }, + { + "epoch": 0.10077984403119376, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.814724484980156e-06, + "logits/chosen": 1728581085.8666666, + "logits/rejected": 2453942512.9411764, + "logps/chosen": -244.68253580729166, + "logps/rejected": -412.92945772058823, + "loss": 0.1745, + "rewards/chosen": 1.0816758473714192, + "rewards/margins": 7.44792179032868, + "rewards/rejected": -6.366245942957261, + "step": 273 + }, + { + "epoch": 0.10114900096903696, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.813133885232962e-06, + "logits/chosen": 1383893219.5555556, + "logits/rejected": 1365664182.857143, + "logps/chosen": -265.19639756944446, + "logps/rejected": -401.4805385044643, + "loss": 0.2205, + "rewards/chosen": 0.8240726788838705, + "rewards/margins": 6.715718927837553, + "rewards/rejected": -5.891646248953683, + "step": 274 + }, + { + "epoch": 0.10151815790688017, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.811536616949921e-06, + "logits/chosen": 2127971564.3076923, + "logits/rejected": 2102075392.0, + "logps/chosen": -309.9605243389423, + "logps/rejected": -411.66270045230266, + "loss": 0.2082, + "rewards/chosen": 0.3268109835111178, + "rewards/margins": 5.845621054954375, + "rewards/rejected": -5.518810071443257, + "step": 275 + }, + { + "epoch": 0.10188731484472337, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.809932682344026e-06, + "logits/chosen": 1758097954.1333334, + "logits/rejected": 1285664045.1764705, + "logps/chosen": -214.67169596354168, + "logps/rejected": -421.70751953125, + "loss": 0.1434, + "rewards/chosen": 1.2558099110921224, + "rewards/margins": 8.197803175683115, + "rewards/rejected": -6.941993264590993, + "step": 276 + }, + { + "epoch": 0.10225647178256657, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.80832208363751e-06, + "logits/chosen": 2048963206.7368422, + "logits/rejected": 1745305284.9230769, + "logps/chosen": -330.1112510279605, + "logps/rejected": -380.83901742788464, + "loss": 0.2671, + "rewards/chosen": 0.39155254865947525, + "rewards/margins": 6.559855818265845, + "rewards/rejected": -6.16830326960637, + "step": 277 + }, + { + "epoch": 0.10262562872040977, + "grad_norm": 14.75, + "kl": 0.11701345443725586, + "learning_rate": 9.806704823061837e-06, + "logits/chosen": 1958317933.7142856, + "logits/rejected": 2712888661.3333335, + "logps/chosen": -371.164794921875, + "logps/rejected": -465.4070638020833, + "loss": 0.1909, + "rewards/chosen": 0.5397615432739258, + "rewards/margins": 7.27199141184489, + "rewards/rejected": -6.732229868570964, + "step": 278 + }, + { + "epoch": 0.10299478565825297, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.8050809028577e-06, + "logits/chosen": 1918053034.6666667, + "logits/rejected": 1417137814.5882354, + "logps/chosen": -386.6728515625, + "logps/rejected": -474.89401424632354, + "loss": 0.1921, + "rewards/chosen": 0.8444217681884766, + "rewards/margins": 5.563980484008789, + "rewards/rejected": -4.7195587158203125, + "step": 279 + }, + { + "epoch": 0.10336394259609616, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.803450325275018e-06, + "logits/chosen": 1564940151.4666667, + "logits/rejected": 1606698164.7058823, + "logps/chosen": -332.02177734375, + "logps/rejected": -423.7408088235294, + "loss": 0.1932, + "rewards/chosen": 0.6410570780436198, + "rewards/margins": 6.578573234408509, + "rewards/rejected": -5.9375161563648895, + "step": 280 + }, + { + "epoch": 0.10373309953393936, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.801813092572938e-06, + "logits/chosen": 2022453521.0666666, + "logits/rejected": 1927797699.764706, + "logps/chosen": -316.55735677083334, + "logps/rejected": -602.2984834558823, + "loss": 0.2096, + "rewards/chosen": 0.5625272115071615, + "rewards/margins": 6.531296060599533, + "rewards/rejected": -5.968768849092371, + "step": 281 + }, + { + "epoch": 0.10410225647178256, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.800169207019826e-06, + "logits/chosen": 1590730638.2222223, + "logits/rejected": 2300578084.571429, + "logps/chosen": -261.65985785590277, + "logps/rejected": -388.1918247767857, + "loss": 0.2176, + "rewards/chosen": 1.3194899029201932, + "rewards/margins": 6.649861759609646, + "rewards/rejected": -5.330371856689453, + "step": 282 + }, + { + "epoch": 0.10447141340962576, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.798518670893263e-06, + "logits/chosen": 1997319633.4545455, + "logits/rejected": 1809895628.8, + "logps/chosen": -300.9884588068182, + "logps/rejected": -305.700537109375, + "loss": 0.257, + "rewards/chosen": 1.1478494297374378, + "rewards/margins": 6.705653537403453, + "rewards/rejected": -5.557804107666016, + "step": 283 + }, + { + "epoch": 0.10484057034746896, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.796861486480045e-06, + "logits/chosen": 2167737074.5263157, + "logits/rejected": 1999945097.8461537, + "logps/chosen": -355.8480674342105, + "logps/rejected": -462.99789663461536, + "loss": 0.2363, + "rewards/chosen": 0.7633656451576635, + "rewards/margins": 7.962476409881221, + "rewards/rejected": -7.1991107647235575, + "step": 284 + }, + { + "epoch": 0.10520972728531217, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.795197656076182e-06, + "logits/chosen": 1632783360.0, + "logits/rejected": 1622156083.2, + "logps/chosen": -365.6959635416667, + "logps/rejected": -389.415771484375, + "loss": 0.2035, + "rewards/chosen": 0.1614638070265452, + "rewards/margins": 5.783546326557795, + "rewards/rejected": -5.62208251953125, + "step": 285 + }, + { + "epoch": 0.10557888422315537, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.793527181986888e-06, + "logits/chosen": 1597025621.3333333, + "logits/rejected": 1995834368.0, + "logps/chosen": -288.8605550130208, + "logps/rejected": -583.08427734375, + "loss": 0.1599, + "rewards/chosen": 0.588905135790507, + "rewards/margins": 7.20903529326121, + "rewards/rejected": -6.620130157470703, + "step": 286 + }, + { + "epoch": 0.10594804116099857, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.791850066526584e-06, + "logits/chosen": 3136440661.3333335, + "logits/rejected": 1721279078.4, + "logps/chosen": -299.9694010416667, + "logps/rejected": -455.729541015625, + "loss": 0.1563, + "rewards/chosen": 0.8755815029144287, + "rewards/margins": 6.879615545272827, + "rewards/rejected": -6.004034042358398, + "step": 287 + }, + { + "epoch": 0.10631719809884177, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.790166312018894e-06, + "logits/chosen": 1912333516.8, + "logits/rejected": 1403121749.3333333, + "logps/chosen": -269.272802734375, + "logps/rejected": -585.6920572916666, + "loss": 0.1895, + "rewards/chosen": 1.3292658805847168, + "rewards/margins": 8.927832571665446, + "rewards/rejected": -7.5985666910807295, + "step": 288 + }, + { + "epoch": 0.10668635503668497, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.788475920796638e-06, + "logits/chosen": 1915768149.3333333, + "logits/rejected": 2182330368.0, + "logps/chosen": -219.4649658203125, + "logps/rejected": -606.148486328125, + "loss": 0.1433, + "rewards/chosen": 1.0323289235432942, + "rewards/margins": 8.743295415242514, + "rewards/rejected": -7.710966491699219, + "step": 289 + }, + { + "epoch": 0.10705551197452817, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.78677889520183e-06, + "logits/chosen": 1379898254.2222223, + "logits/rejected": 1465265737.142857, + "logps/chosen": -278.90703667534723, + "logps/rejected": -607.9679129464286, + "loss": 0.1724, + "rewards/chosen": 1.3309804068671331, + "rewards/margins": 10.745853802514453, + "rewards/rejected": -9.414873395647321, + "step": 290 + }, + { + "epoch": 0.10742466891237137, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.785075237585678e-06, + "logits/chosen": 1743149738.6666667, + "logits/rejected": 2367486771.2, + "logps/chosen": -246.84893798828125, + "logps/rejected": -442.88212890625, + "loss": 0.1815, + "rewards/chosen": 0.6989297866821289, + "rewards/margins": 6.813461112976074, + "rewards/rejected": -6.114531326293945, + "step": 291 + }, + { + "epoch": 0.10779382585021458, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.783364950308578e-06, + "logits/chosen": 2242759626.105263, + "logits/rejected": 1857791054.7692308, + "logps/chosen": -222.3205694901316, + "logps/rejected": -491.5148362379808, + "loss": 0.2273, + "rewards/chosen": 1.0162039305034436, + "rewards/margins": 7.69055378871408, + "rewards/rejected": -6.674349858210637, + "step": 292 + }, + { + "epoch": 0.10816298278805778, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.78164803574011e-06, + "logits/chosen": 1661178831.2380953, + "logits/rejected": 1743329280.0, + "logps/chosen": -267.8080357142857, + "logps/rejected": -461.7286931818182, + "loss": 0.237, + "rewards/chosen": 1.0526900518508184, + "rewards/margins": 7.210590775394852, + "rewards/rejected": -6.157900723544034, + "step": 293 + }, + { + "epoch": 0.10853213972590098, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.77992449625904e-06, + "logits/chosen": 2574269553.7777777, + "logits/rejected": 1951157737.7391305, + "logps/chosen": -338.5415310329861, + "logps/rejected": -520.6604110054348, + "loss": 0.1202, + "rewards/chosen": 0.9983696407741971, + "rewards/margins": 7.912341914891044, + "rewards/rejected": -6.913972274116848, + "step": 294 + }, + { + "epoch": 0.10890129666374418, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.778194334253308e-06, + "logits/chosen": 1710206976.0, + "logits/rejected": 1996958515.2, + "logps/chosen": -320.86170151654414, + "logps/rejected": -440.87568359375, + "loss": 0.185, + "rewards/chosen": 1.0007241192985983, + "rewards/margins": 9.306014341466566, + "rewards/rejected": -8.305290222167969, + "step": 295 + }, + { + "epoch": 0.10927045360158738, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.776457552120034e-06, + "logits/chosen": 1621165738.6666667, + "logits/rejected": 2276909465.6, + "logps/chosen": -332.0484212239583, + "logps/rejected": -559.043603515625, + "loss": 0.1573, + "rewards/chosen": 0.8084496657053629, + "rewards/margins": 7.511607472101848, + "rewards/rejected": -6.703157806396485, + "step": 296 + }, + { + "epoch": 0.10963961053943058, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.774714152265504e-06, + "logits/chosen": 1846064878.9333334, + "logits/rejected": 1589098977.8823528, + "logps/chosen": -370.32848307291664, + "logps/rejected": -522.0892693014706, + "loss": 0.1944, + "rewards/chosen": 0.820379638671875, + "rewards/margins": 7.711372195973116, + "rewards/rejected": -6.8909925573012405, + "step": 297 + }, + { + "epoch": 0.11000876747727377, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.772964137105179e-06, + "logits/chosen": 2103658752.0, + "logits/rejected": 1707513600.0, + "logps/chosen": -293.0218505859375, + "logps/rejected": -517.441162109375, + "loss": 0.2262, + "rewards/chosen": 0.7265486121177673, + "rewards/margins": 7.379050195217133, + "rewards/rejected": -6.652501583099365, + "step": 298 + }, + { + "epoch": 0.11037792441511697, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.771207509063682e-06, + "logits/chosen": 2663499190.857143, + "logits/rejected": 2131321287.1111112, + "logps/chosen": -314.42539760044644, + "logps/rejected": -698.0623914930555, + "loss": 0.1401, + "rewards/chosen": 1.2839648383004325, + "rewards/margins": 9.750318360707118, + "rewards/rejected": -8.466353522406685, + "step": 299 + }, + { + "epoch": 0.11074708135296017, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.769444270574799e-06, + "logits/chosen": 1830025489.0666666, + "logits/rejected": 2108509485.1764705, + "logps/chosen": -279.42522786458335, + "logps/rejected": -379.85486557904414, + "loss": 0.1707, + "rewards/chosen": 1.3648675282796223, + "rewards/margins": 7.256301244099935, + "rewards/rejected": -5.8914337158203125, + "step": 300 + }, + { + "epoch": 0.11111623829080337, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.767674424081472e-06, + "logits/chosen": 2045057647.3043478, + "logits/rejected": 1836874638.2222223, + "logps/chosen": -272.3753821331522, + "logps/rejected": -629.0846896701389, + "loss": 0.2531, + "rewards/chosen": 1.2450005904487942, + "rewards/margins": 6.804306417271711, + "rewards/rejected": -5.559305826822917, + "step": 301 + }, + { + "epoch": 0.11148539522864657, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.765897972035806e-06, + "logits/chosen": 1641214520.8888888, + "logits/rejected": 2251253760.0, + "logps/chosen": -216.07187228732639, + "logps/rejected": -446.01681082589283, + "loss": 0.2016, + "rewards/chosen": 1.267933315700955, + "rewards/margins": 6.7012731158544145, + "rewards/rejected": -5.43333980015346, + "step": 302 + }, + { + "epoch": 0.11185455216648978, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.764114916899049e-06, + "logits/chosen": 1728858670.5454545, + "logits/rejected": 1600480012.1904762, + "logps/chosen": -306.89450905539775, + "logps/rejected": -547.4903738839286, + "loss": 0.122, + "rewards/chosen": 1.2787579623135654, + "rewards/margins": 8.23571219175925, + "rewards/rejected": -6.956954229445684, + "step": 303 + }, + { + "epoch": 0.11222370910433298, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.762325261141602e-06, + "logits/chosen": 1973804032.0, + "logits/rejected": 1802498420.3636363, + "logps/chosen": -259.93291015625, + "logps/rejected": -454.11501242897725, + "loss": 0.1344, + "rewards/chosen": 0.8137189865112304, + "rewards/margins": 7.040212821960449, + "rewards/rejected": -6.226493835449219, + "step": 304 + }, + { + "epoch": 0.11259286604217618, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.760529007243011e-06, + "logits/chosen": 1233308672.0, + "logits/rejected": 1955375427.368421, + "logps/chosen": -256.69649564302887, + "logps/rejected": -445.79183799342104, + "loss": 0.1076, + "rewards/chosen": 1.6934173290546124, + "rewards/margins": 8.444660518816125, + "rewards/rejected": -6.751243189761513, + "step": 305 + }, + { + "epoch": 0.11296202298001938, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.758726157691961e-06, + "logits/chosen": 1571382681.6, + "logits/rejected": 1838468778.6666667, + "logps/chosen": -286.5646240234375, + "logps/rejected": -386.9343668619792, + "loss": 0.2698, + "rewards/chosen": 0.43589019775390625, + "rewards/margins": 7.736979802449544, + "rewards/rejected": -7.301089604695638, + "step": 306 + }, + { + "epoch": 0.11333117991786258, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.75691671498628e-06, + "logits/chosen": 1985576277.3333333, + "logits/rejected": 1881284900.5714285, + "logps/chosen": -274.95830620659723, + "logps/rejected": -349.3916713169643, + "loss": 0.225, + "rewards/chosen": 1.0141366322835286, + "rewards/margins": 6.746578125726609, + "rewards/rejected": -5.732441493443081, + "step": 307 + }, + { + "epoch": 0.11370033685570578, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.755100681632924e-06, + "logits/chosen": 2392320602.352941, + "logits/rejected": 1872387959.4666667, + "logps/chosen": -367.8361385569853, + "logps/rejected": -436.31256510416665, + "loss": 0.1868, + "rewards/chosen": 1.0853500366210938, + "rewards/margins": 7.814613850911458, + "rewards/rejected": -6.7292638142903645, + "step": 308 + }, + { + "epoch": 0.11406949379354898, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.75327806014799e-06, + "logits/chosen": 1578351160.8888888, + "logits/rejected": 1900455204.5714285, + "logps/chosen": -245.11911349826389, + "logps/rejected": -531.3168247767857, + "loss": 0.2329, + "rewards/chosen": 0.5866188473171658, + "rewards/margins": 8.406115002102322, + "rewards/rejected": -7.819496154785156, + "step": 309 + }, + { + "epoch": 0.11443865073139219, + "grad_norm": 13.6875, + "kl": 0.30956125259399414, + "learning_rate": 9.75144885305669e-06, + "logits/chosen": 2038926677.3333333, + "logits/rejected": 2394494537.142857, + "logps/chosen": -314.65980360243054, + "logps/rejected": -412.06295340401783, + "loss": 0.2237, + "rewards/chosen": 1.2123469246758356, + "rewards/margins": 5.655106302291628, + "rewards/rejected": -4.4427593776157925, + "step": 310 + }, + { + "epoch": 0.11480780766923539, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.749613062893373e-06, + "logits/chosen": 1840382739.6923077, + "logits/rejected": 2134430127.1578948, + "logps/chosen": -234.12056790865384, + "logps/rejected": -493.5501644736842, + "loss": 0.1754, + "rewards/chosen": 0.6219647480891302, + "rewards/margins": 7.399088450288966, + "rewards/rejected": -6.777123702199836, + "step": 311 + }, + { + "epoch": 0.11517696460707859, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.7477706922015e-06, + "logits/chosen": 2764598125.714286, + "logits/rejected": 3046856704.0, + "logps/chosen": -257.80653599330356, + "logps/rejected": -560.7020399305555, + "loss": 0.1703, + "rewards/chosen": 1.3177879878452845, + "rewards/margins": 9.57884891449459, + "rewards/rejected": -8.261060926649305, + "step": 312 + }, + { + "epoch": 0.11554612154492179, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.745921743533653e-06, + "logits/chosen": 1473042311.5294118, + "logits/rejected": 2416495820.8, + "logps/chosen": -293.54865579044116, + "logps/rejected": -524.4080078125, + "loss": 0.1692, + "rewards/chosen": 1.6227103962617762, + "rewards/margins": 8.117225878846412, + "rewards/rejected": -6.494515482584635, + "step": 313 + }, + { + "epoch": 0.11591527848276499, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.744066219451526e-06, + "logits/chosen": 1809048629.8947368, + "logits/rejected": 1631027515.0769231, + "logps/chosen": -259.85580283717104, + "logps/rejected": -495.7014723557692, + "loss": 0.2551, + "rewards/chosen": 0.9207908228824013, + "rewards/margins": 6.081833843277534, + "rewards/rejected": -5.161043020395132, + "step": 314 + }, + { + "epoch": 0.11628443542060819, + "grad_norm": 12.625, + "kl": 0.8117055892944336, + "learning_rate": 9.742204122525925e-06, + "logits/chosen": 1466590208.0, + "logits/rejected": 2062477482.6666667, + "logps/chosen": -240.2080078125, + "logps/rejected": -498.1715901692708, + "loss": 0.1909, + "rewards/chosen": 1.6859901428222657, + "rewards/margins": 9.303480911254884, + "rewards/rejected": -7.617490768432617, + "step": 315 + }, + { + "epoch": 0.11665359235845138, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.740335455336762e-06, + "logits/chosen": 1330146872.8888888, + "logits/rejected": 1089062326.857143, + "logps/chosen": -257.4879557291667, + "logps/rejected": -413.96895926339283, + "loss": 0.1654, + "rewards/chosen": 1.4695846769544814, + "rewards/margins": 8.430021906655933, + "rewards/rejected": -6.9604372297014505, + "step": 316 + }, + { + "epoch": 0.11702274929629458, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.73846022047305e-06, + "logits/chosen": 1498745446.4, + "logits/rejected": 2244384954.181818, + "logps/chosen": -263.574169921875, + "logps/rejected": -554.7884410511364, + "loss": 0.1055, + "rewards/chosen": 1.3974539756774902, + "rewards/margins": 8.149565930800005, + "rewards/rejected": -6.752111955122515, + "step": 317 + }, + { + "epoch": 0.11739190623413778, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.736578420532904e-06, + "logits/chosen": 2426956746.105263, + "logits/rejected": 1393608388.9230769, + "logps/chosen": -329.1658357319079, + "logps/rejected": -409.3966721754808, + "loss": 0.2252, + "rewards/chosen": 1.0377998352050781, + "rewards/margins": 7.1182242173414965, + "rewards/rejected": -6.080424382136418, + "step": 318 + }, + { + "epoch": 0.11776106317198098, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.734690058123534e-06, + "logits/chosen": 1712908928.0, + "logits/rejected": 1759267328.0, + "logps/chosen": -218.2560577392578, + "logps/rejected": -392.0929870605469, + "loss": 0.1897, + "rewards/chosen": 0.8327862024307251, + "rewards/margins": 6.859002709388733, + "rewards/rejected": -6.026216506958008, + "step": 319 + }, + { + "epoch": 0.11813022010982419, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.732795135861245e-06, + "logits/chosen": 2732491629.714286, + "logits/rejected": 2010875221.3333333, + "logps/chosen": -347.94185965401783, + "logps/rejected": -581.3672417534722, + "loss": 0.1928, + "rewards/chosen": 0.5334914071219308, + "rewards/margins": 9.250272175622365, + "rewards/rejected": -8.716780768500435, + "step": 320 + }, + { + "epoch": 0.11849937704766739, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.73089365637142e-06, + "logits/chosen": 1505722488.4705882, + "logits/rejected": 1816177186.1333334, + "logps/chosen": -304.00896139705884, + "logps/rejected": -391.8509765625, + "loss": 0.194, + "rewards/chosen": 1.3245123694924748, + "rewards/margins": 7.886983377793255, + "rewards/rejected": -6.562471008300781, + "step": 321 + }, + { + "epoch": 0.11886853398551059, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.728985622288542e-06, + "logits/chosen": 2038449058.909091, + "logits/rejected": 1983236973.7142856, + "logps/chosen": -262.05118075284093, + "logps/rejected": -489.99232700892856, + "loss": 0.1054, + "rewards/chosen": 1.5703854994340376, + "rewards/margins": 8.481939745155763, + "rewards/rejected": -6.911554245721726, + "step": 322 + }, + { + "epoch": 0.11923769092335379, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.727071036256166e-06, + "logits/chosen": 1860948608.0, + "logits/rejected": 1410873216.0, + "logps/chosen": -269.7547912597656, + "logps/rejected": -541.3761596679688, + "loss": 0.211, + "rewards/chosen": 0.6384012699127197, + "rewards/margins": 7.099425554275513, + "rewards/rejected": -6.461024284362793, + "step": 323 + }, + { + "epoch": 0.11960684786119699, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.725149900926925e-06, + "logits/chosen": 1793793638.4, + "logits/rejected": 1772903484.235294, + "logps/chosen": -240.184130859375, + "logps/rejected": -397.7808191636029, + "loss": 0.1603, + "rewards/chosen": 1.5827176411946615, + "rewards/margins": 7.00188824522729, + "rewards/rejected": -5.419170604032629, + "step": 324 + }, + { + "epoch": 0.11997600479904019, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.723222218962529e-06, + "logits/chosen": 1886661911.2727273, + "logits/rejected": 1809809408.0, + "logps/chosen": -277.7151988636364, + "logps/rejected": -460.1244419642857, + "loss": 0.1644, + "rewards/chosen": 0.5133852091702548, + "rewards/margins": 7.712075714425091, + "rewards/rejected": -7.198690505254836, + "step": 325 + }, + { + "epoch": 0.1203451617368834, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.721287993033757e-06, + "logits/chosen": 1701160116.7058823, + "logits/rejected": 2235247547.733333, + "logps/chosen": -241.08389820772058, + "logps/rejected": -506.75305989583336, + "loss": 0.1638, + "rewards/chosen": 1.340056812061983, + "rewards/margins": 8.68946273654115, + "rewards/rejected": -7.349405924479167, + "step": 326 + }, + { + "epoch": 0.1207143186747266, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.719347225820453e-06, + "logits/chosen": 1275668224.0, + "logits/rejected": 1838634240.0, + "logps/chosen": -300.2945251464844, + "logps/rejected": -420.5171203613281, + "loss": 0.2081, + "rewards/chosen": 0.9389196634292603, + "rewards/margins": 7.048129200935364, + "rewards/rejected": -6.1092095375061035, + "step": 327 + }, + { + "epoch": 0.1210834756125698, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.717399920011527e-06, + "logits/chosen": 2098533888.0, + "logits/rejected": 2152257536.0, + "logps/chosen": -291.8856608072917, + "logps/rejected": -451.26796875, + "loss": 0.1319, + "rewards/chosen": 1.4894905090332031, + "rewards/margins": 7.712226486206054, + "rewards/rejected": -6.222735977172851, + "step": 328 + }, + { + "epoch": 0.121452632550413, + "grad_norm": 13.5, + "kl": 0.36493349075317383, + "learning_rate": 9.715446078304946e-06, + "logits/chosen": 1826282961.4545455, + "logits/rejected": 2429105688.3809524, + "logps/chosen": -294.46493252840907, + "logps/rejected": -470.8598865327381, + "loss": 0.1512, + "rewards/chosen": 0.8230190277099609, + "rewards/margins": 7.52409063066755, + "rewards/rejected": -6.701071602957589, + "step": 329 + }, + { + "epoch": 0.1218217894882562, + "grad_norm": 16.75, + "kl": 0.20928645133972168, + "learning_rate": 9.713485703407732e-06, + "logits/chosen": 2193311623.529412, + "logits/rejected": 2448197222.4, + "logps/chosen": -385.33898207720586, + "logps/rejected": -484.82776692708336, + "loss": 0.219, + "rewards/chosen": 0.7785900901345646, + "rewards/margins": 7.287018027960086, + "rewards/rejected": -6.5084279378255205, + "step": 330 + }, + { + "epoch": 0.1221909464260994, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.71151879803596e-06, + "logits/chosen": 2241329259.7894735, + "logits/rejected": 1130339485.5384614, + "logps/chosen": -260.50791529605266, + "logps/rejected": -332.5345928485577, + "loss": 0.248, + "rewards/chosen": 0.8129700610512182, + "rewards/margins": 5.653278910679373, + "rewards/rejected": -4.840308849628155, + "step": 331 + }, + { + "epoch": 0.1225601033639426, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.709545364914754e-06, + "logits/chosen": 2046657536.0, + "logits/rejected": 1764187704.8888888, + "logps/chosen": -333.5389927455357, + "logps/rejected": -372.90869140625, + "loss": 0.1923, + "rewards/chosen": 0.5461257525852748, + "rewards/margins": 7.192624962519086, + "rewards/rejected": -6.646499209933811, + "step": 332 + }, + { + "epoch": 0.1229292603017858, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.707565406778277e-06, + "logits/chosen": 1631706112.0, + "logits/rejected": 1564445440.0, + "logps/chosen": -312.7308349609375, + "logps/rejected": -476.3707275390625, + "loss": 0.186, + "rewards/chosen": 0.8617057204246521, + "rewards/margins": 7.816310822963715, + "rewards/rejected": -6.9546051025390625, + "step": 333 + }, + { + "epoch": 0.12329841723962899, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.70557892636974e-06, + "logits/chosen": 1563645405.8666666, + "logits/rejected": 2005603388.235294, + "logps/chosen": -325.72259114583335, + "logps/rejected": -540.3509880514706, + "loss": 0.1756, + "rewards/chosen": 1.1565523783365885, + "rewards/margins": 8.76632231170056, + "rewards/rejected": -7.609769933363971, + "step": 334 + }, + { + "epoch": 0.12366757417747219, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.703585926441383e-06, + "logits/chosen": 1484886893.7142856, + "logits/rejected": 2080737280.0, + "logps/chosen": -170.3687744140625, + "logps/rejected": -483.0564236111111, + "loss": 0.1411, + "rewards/chosen": 2.1464057649884904, + "rewards/margins": 8.668394103882804, + "rewards/rejected": -6.521988338894314, + "step": 335 + }, + { + "epoch": 0.1240367311153154, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.701586409754484e-06, + "logits/chosen": 1545110797.4736843, + "logits/rejected": 1759782596.9230769, + "logps/chosen": -304.8999280427632, + "logps/rejected": -524.0590444711538, + "loss": 0.2321, + "rewards/chosen": 1.1281626851935136, + "rewards/margins": 8.60429207033474, + "rewards/rejected": -7.476129385141226, + "step": 336 + }, + { + "epoch": 0.1244058880531586, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.699580379079348e-06, + "logits/chosen": 1697938688.0, + "logits/rejected": 1860971520.0, + "logps/chosen": -261.2745361328125, + "logps/rejected": -562.5830078125, + "loss": 0.2311, + "rewards/chosen": 0.3474116325378418, + "rewards/margins": 7.845529556274414, + "rewards/rejected": -7.498117923736572, + "step": 337 + }, + { + "epoch": 0.1247750449910018, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.697567837195303e-06, + "logits/chosen": 2704874556.2352943, + "logits/rejected": 1485390097.0666666, + "logps/chosen": -337.89728860294116, + "logps/rejected": -408.2734375, + "loss": 0.186, + "rewards/chosen": 1.374041950001436, + "rewards/margins": 7.340348000619925, + "rewards/rejected": -5.966306050618489, + "step": 338 + }, + { + "epoch": 0.125144201928845, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.695548786890701e-06, + "logits/chosen": 1532585642.6666667, + "logits/rejected": 1877556955.4285715, + "logps/chosen": -239.21115451388889, + "logps/rejected": -441.5912388392857, + "loss": 0.1551, + "rewards/chosen": 2.1020791795518665, + "rewards/margins": 9.117091103205606, + "rewards/rejected": -7.015011923653739, + "step": 339 + }, + { + "epoch": 0.125144201928845, + "eval_kl": 0.0, + "eval_logits/chosen": 3507292306.985646, + "eval_logits/rejected": 3533149028.848485, + "eval_logps/chosen": -295.8917090311005, + "eval_logps/rejected": -468.6000405844156, + "eval_loss": 0.16918018460273743, + "eval_rewards/chosen": 1.1123073158081638, + "eval_rewards/margins": 8.199268409727265, + "eval_rewards/rejected": -7.0869610939191015, + "eval_runtime": 109.5846, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 0.502, + "step": 339 + }, + { + "epoch": 0.1255133588666882, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.693523230962914e-06, + "logits/chosen": 1651297109.3333333, + "logits/rejected": 1667680563.2, + "logps/chosen": -226.25838216145834, + "logps/rejected": -351.815673828125, + "loss": 0.1159, + "rewards/chosen": 1.7283385594685872, + "rewards/margins": 6.712252744038899, + "rewards/rejected": -4.983914184570312, + "step": 340 + }, + { + "epoch": 0.12588251580453141, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.691491172218318e-06, + "logits/chosen": 2188598218.105263, + "logits/rejected": 2458437474.4615383, + "logps/chosen": -262.4993832236842, + "logps/rejected": -522.5113431490385, + "loss": 0.2481, + "rewards/chosen": 0.8335970828407689, + "rewards/margins": 11.080357524547498, + "rewards/rejected": -10.24676044170673, + "step": 341 + }, + { + "epoch": 0.12625167274237462, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.689452613472309e-06, + "logits/chosen": 1638323712.0, + "logits/rejected": 2167182540.8, + "logps/chosen": -275.1349283854167, + "logps/rejected": -418.9759765625, + "loss": 0.1506, + "rewards/chosen": 1.4113213221232097, + "rewards/margins": 7.963069979349773, + "rewards/rejected": -6.551748657226563, + "step": 342 + }, + { + "epoch": 0.1266208296802178, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.687407557549282e-06, + "logits/chosen": 1754517288.4210527, + "logits/rejected": 1831039448.6153846, + "logps/chosen": -292.2557308799342, + "logps/rejected": -441.79871544471155, + "loss": 0.1941, + "rewards/chosen": 1.4750100185996609, + "rewards/margins": 8.412591007556992, + "rewards/rejected": -6.937580988957332, + "step": 343 + }, + { + "epoch": 0.126989986618061, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.685356007282639e-06, + "logits/chosen": 1718597339.4285715, + "logits/rejected": 1479137373.090909, + "logps/chosen": -250.99583798363096, + "logps/rejected": -394.78280362215907, + "loss": 0.227, + "rewards/chosen": 1.3655876886276972, + "rewards/margins": 7.31252316891889, + "rewards/rejected": -5.946935480291193, + "step": 344 + }, + { + "epoch": 0.1273591435559042, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.683297965514774e-06, + "logits/chosen": 1946383938.7826087, + "logits/rejected": 1930582243.5555556, + "logps/chosen": -288.39451002038044, + "logps/rejected": -341.96275499131946, + "loss": 0.3316, + "rewards/chosen": 0.5754285480665124, + "rewards/margins": 5.109883741480141, + "rewards/rejected": -4.534455193413629, + "step": 345 + }, + { + "epoch": 0.1277283004937474, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.681233435097078e-06, + "logits/chosen": 1143471360.0, + "logits/rejected": 1413154560.0, + "logps/chosen": -261.47674560546875, + "logps/rejected": -494.34368896484375, + "loss": 0.1635, + "rewards/chosen": 1.2011501789093018, + "rewards/margins": 9.124915361404419, + "rewards/rejected": -7.923765182495117, + "step": 346 + }, + { + "epoch": 0.1280974574315906, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.679162418889932e-06, + "logits/chosen": 1108339029.3333333, + "logits/rejected": 1607356416.0, + "logps/chosen": -256.07411024305554, + "logps/rejected": -452.017333984375, + "loss": 0.193, + "rewards/chosen": 1.5214390224880643, + "rewards/margins": 7.945553794739739, + "rewards/rejected": -6.4241147722516745, + "step": 347 + }, + { + "epoch": 0.1284666143694338, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.677084919762703e-06, + "logits/chosen": 1343635937.8823528, + "logits/rejected": 2056398711.4666667, + "logps/chosen": -260.0425379136029, + "logps/rejected": -497.38255208333334, + "loss": 0.1698, + "rewards/chosen": 1.5038629419663374, + "rewards/margins": 8.75862927904316, + "rewards/rejected": -7.2547663370768225, + "step": 348 + }, + { + "epoch": 0.128835771307277, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.675000940593738e-06, + "logits/chosen": 1967014249.4117646, + "logits/rejected": 1834801834.6666667, + "logps/chosen": -295.69651884191177, + "logps/rejected": -461.6944986979167, + "loss": 0.1516, + "rewards/chosen": 1.4664307762594784, + "rewards/margins": 8.267338165582395, + "rewards/rejected": -6.800907389322917, + "step": 349 + }, + { + "epoch": 0.1292049282451202, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.672910484270367e-06, + "logits/chosen": 2704886784.0, + "logits/rejected": 2213251328.0, + "logps/chosen": -344.67547607421875, + "logps/rejected": -511.8169250488281, + "loss": 0.1856, + "rewards/chosen": 1.0536870956420898, + "rewards/margins": 9.18661880493164, + "rewards/rejected": -8.13293170928955, + "step": 350 + }, + { + "epoch": 0.1295740851829634, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.670813553688888e-06, + "logits/chosen": 2081259008.0, + "logits/rejected": 2255987968.0, + "logps/chosen": -303.13360595703125, + "logps/rejected": -460.738037109375, + "loss": 0.1976, + "rewards/chosen": 0.7987386584281921, + "rewards/margins": 6.779437243938446, + "rewards/rejected": -5.980698585510254, + "step": 351 + }, + { + "epoch": 0.1299432421208066, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.668710151754572e-06, + "logits/chosen": 2571791521.6842103, + "logits/rejected": 1672823256.6153846, + "logps/chosen": -229.90560752467104, + "logps/rejected": -448.85535606971155, + "loss": 0.2012, + "rewards/chosen": 1.4017832906622636, + "rewards/margins": 7.323555000397841, + "rewards/rejected": -5.921771709735577, + "step": 352 + }, + { + "epoch": 0.1303123990586498, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.666600281381657e-06, + "logits/chosen": 1628455227.0769231, + "logits/rejected": 2085885628.631579, + "logps/chosen": -323.06847205528845, + "logps/rejected": -530.048828125, + "loss": 0.1701, + "rewards/chosen": 0.5004661266620343, + "rewards/margins": 9.518498802957264, + "rewards/rejected": -9.01803267629523, + "step": 353 + }, + { + "epoch": 0.130681555996493, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.664483945493342e-06, + "logits/chosen": 1895652420.2666667, + "logits/rejected": 1853399040.0, + "logps/chosen": -326.6705729166667, + "logps/rejected": -428.90866268382354, + "loss": 0.1936, + "rewards/chosen": 0.7678691864013671, + "rewards/margins": 7.024685377233169, + "rewards/rejected": -6.256816190831802, + "step": 354 + }, + { + "epoch": 0.1310507129343362, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.66236114702178e-06, + "logits/chosen": 2007081915.7333333, + "logits/rejected": 1816121103.0588236, + "logps/chosen": -269.1964518229167, + "logps/rejected": -401.4315544577206, + "loss": 0.1985, + "rewards/chosen": 0.5368651072184245, + "rewards/margins": 6.8147743299895644, + "rewards/rejected": -6.2779092227711395, + "step": 355 + }, + { + "epoch": 0.1314198698721794, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.660231888908085e-06, + "logits/chosen": 1526057984.0, + "logits/rejected": 1883630592.0, + "logps/chosen": -237.85587565104166, + "logps/rejected": -416.067919921875, + "loss": 0.1383, + "rewards/chosen": 1.2615716457366943, + "rewards/margins": 6.7582234859466555, + "rewards/rejected": -5.496651840209961, + "step": 356 + }, + { + "epoch": 0.1317890268100226, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.658096174102314e-06, + "logits/chosen": 2144994872.8888888, + "logits/rejected": 1681049014.857143, + "logps/chosen": -291.23741319444446, + "logps/rejected": -578.5938197544643, + "loss": 0.2063, + "rewards/chosen": 1.0190391540527344, + "rewards/margins": 9.165148598807198, + "rewards/rejected": -8.146109444754464, + "step": 357 + }, + { + "epoch": 0.1321581837478658, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.655954005563475e-06, + "logits/chosen": 2134261613.7142856, + "logits/rejected": 2313300878.2222223, + "logps/chosen": -261.28731863839283, + "logps/rejected": -567.4992947048611, + "loss": 0.1841, + "rewards/chosen": 0.5613866533551898, + "rewards/margins": 8.280863141256665, + "rewards/rejected": -7.719476487901476, + "step": 358 + }, + { + "epoch": 0.132527340685709, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.653805386259514e-06, + "logits/chosen": 1798780807.5294118, + "logits/rejected": 2186673493.3333335, + "logps/chosen": -204.4872328814338, + "logps/rejected": -393.27692057291665, + "loss": 0.2058, + "rewards/chosen": 1.0325897441190832, + "rewards/margins": 7.006276471007104, + "rewards/rejected": -5.973686726888021, + "step": 359 + }, + { + "epoch": 0.1328964976235522, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.651650319167318e-06, + "logits/chosen": 2094073309.8666666, + "logits/rejected": 2171559213.1764708, + "logps/chosen": -330.50556640625, + "logps/rejected": -503.8912568933824, + "loss": 0.2035, + "rewards/chosen": 0.5402606328328451, + "rewards/margins": 7.653179710986567, + "rewards/rejected": -7.112919078153722, + "step": 360 + }, + { + "epoch": 0.1332656545613954, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.649488807272703e-06, + "logits/chosen": 2180775480.888889, + "logits/rejected": 1574576420.5714285, + "logps/chosen": -318.3544108072917, + "logps/rejected": -387.89100864955356, + "loss": 0.2227, + "rewards/chosen": 0.6724677615695529, + "rewards/margins": 7.089004910181439, + "rewards/rejected": -6.416537148611886, + "step": 361 + }, + { + "epoch": 0.13363481149923861, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.647320853570415e-06, + "logits/chosen": 1386188390.4, + "logits/rejected": 1224655811.764706, + "logps/chosen": -259.0849609375, + "logps/rejected": -412.5089326746324, + "loss": 0.1662, + "rewards/chosen": 1.3073441823323568, + "rewards/margins": 6.889398769303864, + "rewards/rejected": -5.582054586971507, + "step": 362 + }, + { + "epoch": 0.13400396843708182, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.645146461064129e-06, + "logits/chosen": 1579306368.0, + "logits/rejected": 2029641344.0, + "logps/chosen": -258.07177734375, + "logps/rejected": -465.22698974609375, + "loss": 0.1438, + "rewards/chosen": 1.7284153699874878, + "rewards/margins": 8.150326609611511, + "rewards/rejected": -6.421911239624023, + "step": 363 + }, + { + "epoch": 0.13437312537492502, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.642965632766437e-06, + "logits/chosen": 1939270542.2222223, + "logits/rejected": 2043072804.5714285, + "logps/chosen": -297.59784613715277, + "logps/rejected": -480.90774972098217, + "loss": 0.2402, + "rewards/chosen": 0.47220929463704425, + "rewards/margins": 6.6547501881917315, + "rewards/rejected": -6.1825408935546875, + "step": 364 + }, + { + "epoch": 0.13474228231276822, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 9.64077837169885e-06, + "logits/chosen": 2184852102.736842, + "logits/rejected": 1433711064.6153846, + "logps/chosen": -340.2958727384868, + "logps/rejected": -438.2702073317308, + "loss": 0.2597, + "rewards/chosen": 0.4157489224484092, + "rewards/margins": 6.1706458199844665, + "rewards/rejected": -5.7548968975360575, + "step": 365 + }, + { + "epoch": 0.13511143925061142, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.638584680891787e-06, + "logits/chosen": 1377009408.0, + "logits/rejected": 1552623744.0, + "logps/chosen": -264.9659423828125, + "logps/rejected": -515.0516967773438, + "loss": 0.1546, + "rewards/chosen": 1.8077906370162964, + "rewards/margins": 8.66408383846283, + "rewards/rejected": -6.856293201446533, + "step": 366 + }, + { + "epoch": 0.13548059618845462, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.636384563384584e-06, + "logits/chosen": 2714350250.6666665, + "logits/rejected": 1742696857.6, + "logps/chosen": -232.76542154947916, + "logps/rejected": -429.0328125, + "loss": 0.1596, + "rewards/chosen": 0.8740178743998209, + "rewards/margins": 7.745548025767008, + "rewards/rejected": -6.871530151367187, + "step": 367 + }, + { + "epoch": 0.13584975312629782, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.634178022225474e-06, + "logits/chosen": 1541684792.8888888, + "logits/rejected": 1660612900.5714285, + "logps/chosen": -310.5464138454861, + "logps/rejected": -466.06333705357144, + "loss": 0.1992, + "rewards/chosen": 1.0209729936387804, + "rewards/margins": 8.210181433057027, + "rewards/rejected": -7.189208439418247, + "step": 368 + }, + { + "epoch": 0.13621891006414102, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.631965060471587e-06, + "logits/chosen": 1774833117.8666666, + "logits/rejected": 1743265430.5882354, + "logps/chosen": -350.38310546875, + "logps/rejected": -518.9946001838235, + "loss": 0.1355, + "rewards/chosen": 1.6168473561604817, + "rewards/margins": 9.287025937847062, + "rewards/rejected": -7.670178581686581, + "step": 369 + }, + { + "epoch": 0.13658806700198423, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.62974568118896e-06, + "logits/chosen": 1333180054.5882354, + "logits/rejected": 1949014562.1333334, + "logps/chosen": -313.18537454044116, + "logps/rejected": -438.5080078125, + "loss": 0.2563, + "rewards/chosen": 0.395992166855756, + "rewards/margins": 8.279177998561485, + "rewards/rejected": -7.883185831705729, + "step": 370 + }, + { + "epoch": 0.13695722393982743, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.627519887452512e-06, + "logits/chosen": 1461645312.0, + "logits/rejected": 1523881252.5714285, + "logps/chosen": -232.86726888020834, + "logps/rejected": -404.53438895089283, + "loss": 0.2131, + "rewards/chosen": 0.7098377015855577, + "rewards/margins": 7.758998795161172, + "rewards/rejected": -7.049161093575614, + "step": 371 + }, + { + "epoch": 0.13732638087767063, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.625287682346051e-06, + "logits/chosen": 1684139446.857143, + "logits/rejected": 1731419249.7777777, + "logps/chosen": -312.4532993861607, + "logps/rejected": -485.69325086805554, + "loss": 0.1471, + "rewards/chosen": 1.1038735253470284, + "rewards/margins": 8.035731588091169, + "rewards/rejected": -6.931858062744141, + "step": 372 + }, + { + "epoch": 0.13769553781551383, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.62304906896227e-06, + "logits/chosen": 2146312021.3333333, + "logits/rejected": 1677266329.6, + "logps/chosen": -295.2914225260417, + "logps/rejected": -571.55419921875, + "loss": 0.1506, + "rewards/chosen": 0.7845592498779297, + "rewards/margins": 8.31790657043457, + "rewards/rejected": -7.533347320556641, + "step": 373 + }, + { + "epoch": 0.13806469475335703, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.620804050402738e-06, + "logits/chosen": 2354609083.733333, + "logits/rejected": 1766122797.1764705, + "logps/chosen": -231.64274088541666, + "logps/rejected": -583.2599379595588, + "loss": 0.1549, + "rewards/chosen": 1.2507829030354818, + "rewards/margins": 9.06287899391324, + "rewards/rejected": -7.812096090877757, + "step": 374 + }, + { + "epoch": 0.13843385169120023, + "grad_norm": 11.9375, + "kl": 0.1382436752319336, + "learning_rate": 9.618552629777904e-06, + "logits/chosen": 1584362732.3076923, + "logits/rejected": 1476279026.5263157, + "logps/chosen": -240.361572265625, + "logps/rejected": -382.98758737664474, + "loss": 0.1666, + "rewards/chosen": 1.0091890188363881, + "rewards/margins": 6.631763674469612, + "rewards/rejected": -5.6225746556332235, + "step": 375 + }, + { + "epoch": 0.13880300862904343, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.616294810207077e-06, + "logits/chosen": 1225417334.1538463, + "logits/rejected": 1242104993.6842105, + "logps/chosen": -283.8134577824519, + "logps/rejected": -516.4565172697369, + "loss": 0.1208, + "rewards/chosen": 1.4325991410475512, + "rewards/margins": 9.382924554801663, + "rewards/rejected": -7.950325413754112, + "step": 376 + }, + { + "epoch": 0.13917216556688664, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.61403059481844e-06, + "logits/chosen": 2342250359.4666667, + "logits/rejected": 1338388239.0588236, + "logps/chosen": -264.5478515625, + "logps/rejected": -502.7103630514706, + "loss": 0.1313, + "rewards/chosen": 1.7053250630696615, + "rewards/margins": 8.98542526843501, + "rewards/rejected": -7.28010020536535, + "step": 377 + }, + { + "epoch": 0.13954132250472984, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.611759986749036e-06, + "logits/chosen": 1735746560.0, + "logits/rejected": 2172561035.6363635, + "logps/chosen": -279.0372802734375, + "logps/rejected": -553.6524325284091, + "loss": 0.106, + "rewards/chosen": 1.6629119873046876, + "rewards/margins": 9.801081015846945, + "rewards/rejected": -8.138169028542258, + "step": 378 + }, + { + "epoch": 0.139910479442573, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.60948298914476e-06, + "logits/chosen": 1631841099.2941177, + "logits/rejected": 2150236979.2, + "logps/chosen": -319.9461454503676, + "logps/rejected": -540.7320963541666, + "loss": 0.2001, + "rewards/chosen": 0.868106393253102, + "rewards/margins": 8.632727364932789, + "rewards/rejected": -7.764620971679688, + "step": 379 + }, + { + "epoch": 0.1402796363804162, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.607199605160367e-06, + "logits/chosen": 2015413376.0, + "logits/rejected": 1986445824.0, + "logps/chosen": -297.494140625, + "logps/rejected": -541.105712890625, + "loss": 0.2012, + "rewards/chosen": 0.6245735883712769, + "rewards/margins": 8.115182280540466, + "rewards/rejected": -7.4906086921691895, + "step": 380 + }, + { + "epoch": 0.1406487933182594, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.604909837959456e-06, + "logits/chosen": 1235722778.9473684, + "logits/rejected": 1658123972.9230769, + "logps/chosen": -245.80111533717104, + "logps/rejected": -544.52783203125, + "loss": 0.1952, + "rewards/chosen": 1.3158021224172491, + "rewards/margins": 9.832599701669052, + "rewards/rejected": -8.516797579251802, + "step": 381 + }, + { + "epoch": 0.14101795025610261, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.602613690714468e-06, + "logits/chosen": 1798145228.8, + "logits/rejected": 1696284431.0588236, + "logps/chosen": -246.39615885416666, + "logps/rejected": -535.2096737132352, + "loss": 0.1557, + "rewards/chosen": 1.6111700693766275, + "rewards/margins": 7.949309771668677, + "rewards/rejected": -6.338139702292049, + "step": 382 + }, + { + "epoch": 0.14138710719394582, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.600311166606687e-06, + "logits/chosen": 1723616000.0, + "logits/rejected": 1721759872.0, + "logps/chosen": -247.85740661621094, + "logps/rejected": -430.78985595703125, + "loss": 0.1906, + "rewards/chosen": 1.1673980951309204, + "rewards/margins": 7.764888405799866, + "rewards/rejected": -6.597490310668945, + "step": 383 + }, + { + "epoch": 0.14175626413178902, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.59800226882623e-06, + "logits/chosen": 1383921800.5333333, + "logits/rejected": 1733275045.6470587, + "logps/chosen": -268.0473307291667, + "logps/rejected": -462.4274471507353, + "loss": 0.1725, + "rewards/chosen": 1.16984011332194, + "rewards/margins": 8.810279315125708, + "rewards/rejected": -7.640439201803768, + "step": 384 + }, + { + "epoch": 0.14212542106963222, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.595687000572049e-06, + "logits/chosen": 1998519374.7692308, + "logits/rejected": 2749480528.8421054, + "logps/chosen": -321.11733774038464, + "logps/rejected": -401.9541015625, + "loss": 0.1853, + "rewards/chosen": 0.8211579689612756, + "rewards/margins": 6.721532532078053, + "rewards/rejected": -5.9003745631167765, + "step": 385 + }, + { + "epoch": 0.14249457800747542, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.593365365051915e-06, + "logits/chosen": 1360691830.1538463, + "logits/rejected": 1720151417.2631578, + "logps/chosen": -177.74866661658655, + "logps/rejected": -441.29702919407896, + "loss": 0.1702, + "rewards/chosen": 0.8808885721059946, + "rewards/margins": 7.712759203273757, + "rewards/rejected": -6.831870631167763, + "step": 386 + }, + { + "epoch": 0.14286373494531862, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.591037365482424e-06, + "logits/chosen": 2256725196.8, + "logits/rejected": 2085053952.0, + "logps/chosen": -305.390380859375, + "logps/rejected": -459.892333984375, + "loss": 0.2566, + "rewards/chosen": 0.7829440116882325, + "rewards/margins": 9.776160208384196, + "rewards/rejected": -8.993216196695963, + "step": 387 + }, + { + "epoch": 0.14323289188316182, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.588703005088994e-06, + "logits/chosen": 1741253563.7333333, + "logits/rejected": 1799208960.0, + "logps/chosen": -257.41346028645836, + "logps/rejected": -508.30003446691177, + "loss": 0.1939, + "rewards/chosen": 0.6550120671590169, + "rewards/margins": 8.80546282973944, + "rewards/rejected": -8.150450762580423, + "step": 388 + }, + { + "epoch": 0.14360204882100502, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.58636228710585e-06, + "logits/chosen": 1820005797.6470587, + "logits/rejected": 1911634329.6, + "logps/chosen": -342.0751378676471, + "logps/rejected": -481.21692708333336, + "loss": 0.2505, + "rewards/chosen": 0.30485038196339326, + "rewards/margins": 9.440752278122247, + "rewards/rejected": -9.135901896158854, + "step": 389 + }, + { + "epoch": 0.14397120575884823, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.584015214776025e-06, + "logits/chosen": 1938715554.909091, + "logits/rejected": 1540675788.8, + "logps/chosen": -287.2528631036932, + "logps/rejected": -324.00595703125, + "loss": 0.2331, + "rewards/chosen": 1.344713644547896, + "rewards/margins": 6.631439642472701, + "rewards/rejected": -5.286725997924805, + "step": 390 + }, + { + "epoch": 0.14434036269669143, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.58166179135136e-06, + "logits/chosen": 1857606948.5714285, + "logits/rejected": 1266557383.1111112, + "logps/chosen": -299.09517996651783, + "logps/rejected": -426.6337619357639, + "loss": 0.1607, + "rewards/chosen": 1.1080269813537598, + "rewards/margins": 7.754366821712917, + "rewards/rejected": -6.646339840359158, + "step": 391 + }, + { + "epoch": 0.14470951963453463, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.579302020092491e-06, + "logits/chosen": 1742936883.2, + "logits/rejected": 1950550186.6666667, + "logps/chosen": -289.5045654296875, + "logps/rejected": -446.995361328125, + "loss": 0.2415, + "rewards/chosen": 0.8267616271972656, + "rewards/margins": 8.21956164042155, + "rewards/rejected": -7.392800013224284, + "step": 392 + }, + { + "epoch": 0.14507867657237783, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.576935904268853e-06, + "logits/chosen": 1455717668.5714285, + "logits/rejected": 1913486904.8888888, + "logps/chosen": -308.18258231026783, + "logps/rejected": -437.35047743055554, + "loss": 0.1589, + "rewards/chosen": 1.2528223310198103, + "rewards/margins": 7.9809246971493675, + "rewards/rejected": -6.728102366129558, + "step": 393 + }, + { + "epoch": 0.14544783351022103, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.574563447158671e-06, + "logits/chosen": 2003695495.5294118, + "logits/rejected": 2082482312.5333333, + "logps/chosen": -256.9423253676471, + "logps/rejected": -444.6559244791667, + "loss": 0.1812, + "rewards/chosen": 1.133278678445255, + "rewards/margins": 8.316498588113223, + "rewards/rejected": -7.183219909667969, + "step": 394 + }, + { + "epoch": 0.14581699044806423, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.57218465204895e-06, + "logits/chosen": 1726928759.4666667, + "logits/rejected": 1980301071.0588236, + "logps/chosen": -273.89326171875, + "logps/rejected": -412.39636948529414, + "loss": 0.1712, + "rewards/chosen": 0.9172369639078776, + "rewards/margins": 7.958772734099743, + "rewards/rejected": -7.0415357701918655, + "step": 395 + }, + { + "epoch": 0.14618614738590743, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.569799522235484e-06, + "logits/chosen": 1709594624.0, + "logits/rejected": 1723621376.0, + "logps/chosen": -258.77618815104165, + "logps/rejected": -381.2661994485294, + "loss": 0.1734, + "rewards/chosen": 0.8910394032796224, + "rewards/margins": 7.070839100258023, + "rewards/rejected": -6.1797996969784, + "step": 396 + }, + { + "epoch": 0.14655530432375063, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.567408061022838e-06, + "logits/chosen": 3571920896.0, + "logits/rejected": 2737334954.6666665, + "logps/chosen": -239.96843610491072, + "logps/rejected": -519.90673828125, + "loss": 0.1324, + "rewards/chosen": 1.3232063565935408, + "rewards/margins": 7.757024326021709, + "rewards/rejected": -6.433817969428168, + "step": 397 + }, + { + "epoch": 0.14692446126159384, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.565010271724353e-06, + "logits/chosen": 1958242759.1111112, + "logits/rejected": 2901280182.857143, + "logps/chosen": -344.83238389756946, + "logps/rejected": -403.4195033482143, + "loss": 0.2265, + "rewards/chosen": 0.8392838372124566, + "rewards/margins": 6.468618423219711, + "rewards/rejected": -5.629334586007254, + "step": 398 + }, + { + "epoch": 0.14729361819943704, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.562606157662132e-06, + "logits/chosen": 1543794560.0, + "logits/rejected": 1796631808.0, + "logps/chosen": -332.0307922363281, + "logps/rejected": -409.88067626953125, + "loss": 0.1721, + "rewards/chosen": 1.094642996788025, + "rewards/margins": 7.752733588218689, + "rewards/rejected": -6.658090591430664, + "step": 399 + }, + { + "epoch": 0.14766277513728024, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.56019572216705e-06, + "logits/chosen": 1905797347.5555556, + "logits/rejected": 1789911478.857143, + "logps/chosen": -329.1207682291667, + "logps/rejected": -488.9536830357143, + "loss": 0.172, + "rewards/chosen": 1.2870570288764105, + "rewards/margins": 8.21430039784265, + "rewards/rejected": -6.927243368966239, + "step": 400 + }, + { + "epoch": 0.14803193207512344, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.557778968578728e-06, + "logits/chosen": 1614072627.2, + "logits/rejected": 1632268288.0, + "logps/chosen": -265.12169596354164, + "logps/rejected": -416.1413143382353, + "loss": 0.1676, + "rewards/chosen": 1.638983662923177, + "rewards/margins": 8.142224809235218, + "rewards/rejected": -6.503241146312041, + "step": 401 + }, + { + "epoch": 0.14840108901296664, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.555355900245553e-06, + "logits/chosen": 2180031022.5454545, + "logits/rejected": 1780363878.4, + "logps/chosen": -311.76285067471593, + "logps/rejected": -382.5916015625, + "loss": 0.2326, + "rewards/chosen": 1.2490649656815962, + "rewards/margins": 7.760044427351518, + "rewards/rejected": -6.510979461669922, + "step": 402 + }, + { + "epoch": 0.14877024595080984, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.552926520524654e-06, + "logits/chosen": 1896104538.3529413, + "logits/rejected": 1815742327.4666667, + "logps/chosen": -274.66345932904414, + "logps/rejected": -575.7646484375, + "loss": 0.2028, + "rewards/chosen": 0.9009261411779067, + "rewards/margins": 7.843930495019053, + "rewards/rejected": -6.943004353841146, + "step": 403 + }, + { + "epoch": 0.14913940288865304, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.550490832781905e-06, + "logits/chosen": 1683728263.5294118, + "logits/rejected": 2133273122.1333334, + "logps/chosen": -283.2207892922794, + "logps/rejected": -461.8162434895833, + "loss": 0.2061, + "rewards/chosen": 1.3944896249210132, + "rewards/margins": 8.208423951092888, + "rewards/rejected": -6.813934326171875, + "step": 404 + }, + { + "epoch": 0.14950855982649625, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.54804884039192e-06, + "logits/chosen": 1576172150.1538463, + "logits/rejected": 1565305478.7368422, + "logps/chosen": -256.34557166466345, + "logps/rejected": -413.0727025082237, + "loss": 0.1811, + "rewards/chosen": 0.5149996097271259, + "rewards/margins": 6.631817686412981, + "rewards/rejected": -6.116818076685855, + "step": 405 + }, + { + "epoch": 0.14987771676433945, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.545600546738047e-06, + "logits/chosen": 2661444039.111111, + "logits/rejected": 2792508269.714286, + "logps/chosen": -267.06049262152777, + "logps/rejected": -504.3657924107143, + "loss": 0.2014, + "rewards/chosen": 0.9517600801255968, + "rewards/margins": 8.4590728547838, + "rewards/rejected": -7.507312774658203, + "step": 406 + }, + { + "epoch": 0.15024687370218265, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.54314595521237e-06, + "logits/chosen": 1397186816.0, + "logits/rejected": 1663922944.0, + "logps/chosen": -299.0145263671875, + "logps/rejected": -339.6547546386719, + "loss": 0.1692, + "rewards/chosen": 1.0805680751800537, + "rewards/margins": 6.122377157211304, + "rewards/rejected": -5.04180908203125, + "step": 407 + }, + { + "epoch": 0.15061603064002585, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.540685069215693e-06, + "logits/chosen": 1831780036.9230769, + "logits/rejected": 1716071262.3157895, + "logps/chosen": -246.77422626201923, + "logps/rejected": -422.6981136924342, + "loss": 0.187, + "rewards/chosen": 0.9568292177640475, + "rewards/margins": 6.36310772760677, + "rewards/rejected": -5.406278509842722, + "step": 408 + }, + { + "epoch": 0.15098518757786905, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.53821789215754e-06, + "logits/chosen": 1877976192.0, + "logits/rejected": 1774899072.0, + "logps/chosen": -287.7886962890625, + "logps/rejected": -454.25482177734375, + "loss": 0.2496, + "rewards/chosen": 0.13521817326545715, + "rewards/margins": 6.945976287126541, + "rewards/rejected": -6.810758113861084, + "step": 409 + }, + { + "epoch": 0.15135434451571225, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.535744427456156e-06, + "logits/chosen": 1482375395.5555556, + "logits/rejected": 1468752310.857143, + "logps/chosen": -300.3171115451389, + "logps/rejected": -499.8369140625, + "loss": 0.2392, + "rewards/chosen": 0.488131841023763, + "rewards/margins": 7.519225983392625, + "rewards/rejected": -7.031094142368862, + "step": 410 + }, + { + "epoch": 0.15172350145355545, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.533264678538493e-06, + "logits/chosen": 2152330581.3333335, + "logits/rejected": 1885027147.2941177, + "logps/chosen": -209.37931315104166, + "logps/rejected": -471.7879423253676, + "loss": 0.1853, + "rewards/chosen": 1.0260955810546875, + "rewards/margins": 7.536616964901195, + "rewards/rejected": -6.510521383846507, + "step": 411 + }, + { + "epoch": 0.15209265839139866, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.530778648840213e-06, + "logits/chosen": 2040008476.4444444, + "logits/rejected": 1421106761.142857, + "logps/chosen": -298.0353732638889, + "logps/rejected": -468.2005092075893, + "loss": 0.1862, + "rewards/chosen": 1.4321862326727972, + "rewards/margins": 8.714710886516269, + "rewards/rejected": -7.282524653843471, + "step": 412 + }, + { + "epoch": 0.15246181532924186, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.528286341805675e-06, + "logits/chosen": 1386916864.0, + "logits/rejected": 1678343168.0, + "logps/chosen": -312.471728515625, + "logps/rejected": -413.3424479166667, + "loss": 0.2002, + "rewards/chosen": 1.1034334182739258, + "rewards/margins": 8.104323514302571, + "rewards/rejected": -7.0008900960286455, + "step": 413 + }, + { + "epoch": 0.15283097226708506, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.525787760887945e-06, + "logits/chosen": 2307841566.117647, + "logits/rejected": 2275797674.6666665, + "logps/chosen": -364.07571231617646, + "logps/rejected": -416.757421875, + "loss": 0.1865, + "rewards/chosen": 1.2652756186092602, + "rewards/margins": 7.000926253374885, + "rewards/rejected": -5.735650634765625, + "step": 414 + }, + { + "epoch": 0.15320012920492823, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.523282909548773e-06, + "logits/chosen": 1289296099.5555556, + "logits/rejected": 1995946276.5714285, + "logps/chosen": -299.05986870659723, + "logps/rejected": -408.19810267857144, + "loss": 0.2144, + "rewards/chosen": 1.0565063688490126, + "rewards/margins": 8.21663755083841, + "rewards/rejected": -7.160131181989398, + "step": 415 + }, + { + "epoch": 0.15356928614277143, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.520771791258593e-06, + "logits/chosen": 1849796198.4, + "logits/rejected": 1834064715.2941177, + "logps/chosen": -340.3735026041667, + "logps/rejected": -407.8080480238971, + "loss": 0.1908, + "rewards/chosen": 0.8948452631632487, + "rewards/margins": 6.197336843901989, + "rewards/rejected": -5.3024915807387405, + "step": 416 + }, + { + "epoch": 0.15393844308061463, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.518254409496536e-06, + "logits/chosen": 2121342012.235294, + "logits/rejected": 1217952563.2, + "logps/chosen": -306.6924689797794, + "logps/rejected": -430.39033203125, + "loss": 0.1601, + "rewards/chosen": 1.6364097595214844, + "rewards/margins": 8.554572296142577, + "rewards/rejected": -6.918162536621094, + "step": 417 + }, + { + "epoch": 0.15430760001845784, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.515730767750397e-06, + "logits/chosen": 1601389616.7619047, + "logits/rejected": 1655772439.2727273, + "logps/chosen": -249.9218982514881, + "logps/rejected": -344.3494762073864, + "loss": 0.2229, + "rewards/chosen": 1.1368172963460286, + "rewards/margins": 7.215015815966057, + "rewards/rejected": -6.078198519620028, + "step": 418 + }, + { + "epoch": 0.15467675695630104, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.513200869516651e-06, + "logits/chosen": 1381346397.090909, + "logits/rejected": 2113726951.6190476, + "logps/chosen": -339.06289950284093, + "logps/rejected": -388.5776134672619, + "loss": 0.1918, + "rewards/chosen": 0.5060956261374734, + "rewards/margins": 6.6043382496028755, + "rewards/rejected": -6.098242623465402, + "step": 419 + }, + { + "epoch": 0.15504591389414424, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.51066471830044e-06, + "logits/chosen": 1852270336.0, + "logits/rejected": 1473811968.0, + "logps/chosen": -266.4080505371094, + "logps/rejected": -403.548095703125, + "loss": 0.193, + "rewards/chosen": 0.7962771058082581, + "rewards/margins": 7.894688665866852, + "rewards/rejected": -7.098411560058594, + "step": 420 + }, + { + "epoch": 0.15541507083198744, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.50812231761557e-06, + "logits/chosen": 1747788406.1538463, + "logits/rejected": 2023733571.368421, + "logps/chosen": -290.64847506009613, + "logps/rejected": -439.35567434210526, + "loss": 0.1465, + "rewards/chosen": 1.2459685985858624, + "rewards/margins": 7.018829075431051, + "rewards/rejected": -5.772860476845189, + "step": 421 + }, + { + "epoch": 0.15578422776983064, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.505573670984502e-06, + "logits/chosen": 2004292230.7368422, + "logits/rejected": 2002882402.4615386, + "logps/chosen": -184.99994860197367, + "logps/rejected": -483.9354717548077, + "loss": 0.1853, + "rewards/chosen": 1.5995942165977077, + "rewards/margins": 7.2835841082368304, + "rewards/rejected": -5.683989891639123, + "step": 422 + }, + { + "epoch": 0.15615338470767384, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.503018781938358e-06, + "logits/chosen": 1236752128.0, + "logits/rejected": 2007274752.0, + "logps/chosen": -271.8293762207031, + "logps/rejected": -445.6889343261719, + "loss": 0.1992, + "rewards/chosen": 0.8957881927490234, + "rewards/margins": 7.318389892578125, + "rewards/rejected": -6.422601699829102, + "step": 423 + }, + { + "epoch": 0.15652254164551704, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.5004576540169e-06, + "logits/chosen": 2672929645.714286, + "logits/rejected": 2408996408.888889, + "logps/chosen": -259.61373465401783, + "logps/rejected": -549.6809895833334, + "loss": 0.1808, + "rewards/chosen": 0.7344390324183873, + "rewards/margins": 7.449676120091998, + "rewards/rejected": -6.715237087673611, + "step": 424 + }, + { + "epoch": 0.15689169858336025, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.49789029076854e-06, + "logits/chosen": 1468558563.5555556, + "logits/rejected": 1477639753.142857, + "logps/chosen": -224.86748589409723, + "logps/rejected": -318.23440987723217, + "loss": 0.1835, + "rewards/chosen": 1.3313792546590169, + "rewards/margins": 7.426517804463704, + "rewards/rejected": -6.0951385498046875, + "step": 425 + }, + { + "epoch": 0.15726085552120345, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.49531669575033e-06, + "logits/chosen": 2148819080.5333333, + "logits/rejected": 2368397312.0, + "logps/chosen": -297.6759440104167, + "logps/rejected": -509.75212545955884, + "loss": 0.1917, + "rewards/chosen": 0.8936670303344727, + "rewards/margins": 8.30569517472211, + "rewards/rejected": -7.412028144387638, + "step": 426 + }, + { + "epoch": 0.15763001245904665, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.492736872527948e-06, + "logits/chosen": 1491219200.0, + "logits/rejected": 1513951488.0, + "logps/chosen": -287.7457580566406, + "logps/rejected": -357.4420471191406, + "loss": 0.2049, + "rewards/chosen": 1.028751015663147, + "rewards/margins": 7.207629323005676, + "rewards/rejected": -6.178878307342529, + "step": 427 + }, + { + "epoch": 0.15799916939688985, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.49015082467571e-06, + "logits/chosen": 1451139192.4705882, + "logits/rejected": 1425122781.8666666, + "logps/chosen": -360.09426700367646, + "logps/rejected": -481.3603515625, + "loss": 0.1819, + "rewards/chosen": 1.2073700848747702, + "rewards/margins": 8.74751901065602, + "rewards/rejected": -7.54014892578125, + "step": 428 + }, + { + "epoch": 0.15836832633473305, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.48755855577655e-06, + "logits/chosen": 1648751130.9473684, + "logits/rejected": 1315853705.8461537, + "logps/chosen": -325.03238075657896, + "logps/rejected": -571.3363131009615, + "loss": 0.2126, + "rewards/chosen": 1.0955026526200144, + "rewards/margins": 8.588325739872118, + "rewards/rejected": -7.492823087252104, + "step": 429 + }, + { + "epoch": 0.15873748327257625, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.484960069422026e-06, + "logits/chosen": 1799661568.0, + "logits/rejected": 2723344156.4444447, + "logps/chosen": -341.4420689174107, + "logps/rejected": -609.2888454861111, + "loss": 0.1067, + "rewards/chosen": 1.6041766575404577, + "rewards/margins": 10.343394627646795, + "rewards/rejected": -8.739217970106337, + "step": 430 + }, + { + "epoch": 0.15910664021041945, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.482355369212307e-06, + "logits/chosen": 2755885641.142857, + "logits/rejected": 1985955157.3333333, + "logps/chosen": -259.13295200892856, + "logps/rejected": -564.4095052083334, + "loss": 0.1446, + "rewards/chosen": 1.4627829960414342, + "rewards/margins": 9.625913014487615, + "rewards/rejected": -8.16313001844618, + "step": 431 + }, + { + "epoch": 0.15947579714826265, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.47974445875617e-06, + "logits/chosen": 1956324040.347826, + "logits/rejected": 1521241201.7777777, + "logps/chosen": -342.52373471467394, + "logps/rejected": -434.6310763888889, + "loss": 0.2382, + "rewards/chosen": 1.4340296206266985, + "rewards/margins": 8.555200512282514, + "rewards/rejected": -7.121170891655816, + "step": 432 + }, + { + "epoch": 0.15984495408610586, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.477127341671e-06, + "logits/chosen": 1398344557.7142856, + "logits/rejected": 1425347697.7777777, + "logps/chosen": -306.09486607142856, + "logps/rejected": -429.64643012152777, + "loss": 0.1417, + "rewards/chosen": 1.1269269670758928, + "rewards/margins": 8.88234383719308, + "rewards/rejected": -7.7554168701171875, + "step": 433 + }, + { + "epoch": 0.16021411102394906, + "grad_norm": 13.8125, + "kl": 0.9919271469116211, + "learning_rate": 9.47450402158278e-06, + "logits/chosen": 1627436373.3333333, + "logits/rejected": 1671521426.2857144, + "logps/chosen": -268.61469184027777, + "logps/rejected": -419.15488978794644, + "loss": 0.2059, + "rewards/chosen": 1.1425614886813693, + "rewards/margins": 9.435201629759774, + "rewards/rejected": -8.292640141078405, + "step": 434 + }, + { + "epoch": 0.16058326796179226, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.471874502126087e-06, + "logits/chosen": 1883517366.857143, + "logits/rejected": 2270909326.2222223, + "logps/chosen": -298.3540736607143, + "logps/rejected": -640.4641927083334, + "loss": 0.1484, + "rewards/chosen": 1.019315242767334, + "rewards/margins": 11.426795270707872, + "rewards/rejected": -10.407480027940538, + "step": 435 + }, + { + "epoch": 0.16095242489963546, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.469238786944086e-06, + "logits/chosen": 1889522073.6, + "logits/rejected": 1955934208.0, + "logps/chosen": -265.3699951171875, + "logps/rejected": -436.8512369791667, + "loss": 0.2776, + "rewards/chosen": 0.5707391738891602, + "rewards/margins": 7.696090126037598, + "rewards/rejected": -7.1253509521484375, + "step": 436 + }, + { + "epoch": 0.16132158183747866, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.466596879688525e-06, + "logits/chosen": 1524982306.1333334, + "logits/rejected": 2101468220.235294, + "logps/chosen": -243.32716471354166, + "logps/rejected": -347.1457088694853, + "loss": 0.2071, + "rewards/chosen": 0.7067002614339193, + "rewards/margins": 6.42231637543323, + "rewards/rejected": -5.71561611399931, + "step": 437 + }, + { + "epoch": 0.16169073877532186, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.463948784019736e-06, + "logits/chosen": 2382068736.0, + "logits/rejected": 1903634432.0, + "logps/chosen": -302.1851318359375, + "logps/rejected": -429.0208629261364, + "loss": 0.1255, + "rewards/chosen": 1.1733879089355468, + "rewards/margins": 7.524258908358487, + "rewards/rejected": -6.35087099942294, + "step": 438 + }, + { + "epoch": 0.16205989571316506, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.461294503606621e-06, + "logits/chosen": 1423123456.0, + "logits/rejected": 1668021361.7777777, + "logps/chosen": -274.70179966517856, + "logps/rejected": -358.2701009114583, + "loss": 0.189, + "rewards/chosen": 0.8811074665614537, + "rewards/margins": 6.934006721254379, + "rewards/rejected": -6.0528992546929254, + "step": 439 + }, + { + "epoch": 0.16242905265100827, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.458634042126651e-06, + "logits/chosen": 1840356773.6470587, + "logits/rejected": 1948377088.0, + "logps/chosen": -283.4951746323529, + "logps/rejected": -492.25970052083335, + "loss": 0.1799, + "rewards/chosen": 0.9979782104492188, + "rewards/margins": 7.6272425333658855, + "rewards/rejected": -6.629264322916667, + "step": 440 + }, + { + "epoch": 0.16279820958885147, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.455967403265861e-06, + "logits/chosen": 1764379776.0, + "logits/rejected": 2547751424.0, + "logps/chosen": -279.8078308105469, + "logps/rejected": -405.1233825683594, + "loss": 0.1761, + "rewards/chosen": 1.0658788681030273, + "rewards/margins": 7.703524589538574, + "rewards/rejected": -6.637645721435547, + "step": 441 + }, + { + "epoch": 0.16316736652669467, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.453294590718846e-06, + "logits/chosen": 2153193267.2, + "logits/rejected": 2300990464.0, + "logps/chosen": -302.87109375, + "logps/rejected": -680.265625, + "loss": 0.262, + "rewards/chosen": 0.6101669788360595, + "rewards/margins": 9.794128211339315, + "rewards/rejected": -9.183961232503256, + "step": 442 + }, + { + "epoch": 0.16353652346453787, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.450615608188755e-06, + "logits/chosen": 2295278170.352941, + "logits/rejected": 1724589397.3333333, + "logps/chosen": -293.3492647058824, + "logps/rejected": -446.7376953125, + "loss": 0.2028, + "rewards/chosen": 1.1460014792049633, + "rewards/margins": 6.843126723345588, + "rewards/rejected": -5.697125244140625, + "step": 443 + }, + { + "epoch": 0.16390568040238107, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.447930459387284e-06, + "logits/chosen": 1544387820.3076923, + "logits/rejected": 1523285800.4210527, + "logps/chosen": -176.35471754807693, + "logps/rejected": -418.89170435855266, + "loss": 0.1422, + "rewards/chosen": 1.3574663308950572, + "rewards/margins": 7.512983909020058, + "rewards/rejected": -6.155517578125, + "step": 444 + }, + { + "epoch": 0.16427483734022427, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.445239148034673e-06, + "logits/chosen": 2157634267.428571, + "logits/rejected": 1811817472.0, + "logps/chosen": -260.056396484375, + "logps/rejected": -435.5497233072917, + "loss": 0.1568, + "rewards/chosen": 1.190401213509696, + "rewards/margins": 7.259217080615816, + "rewards/rejected": -6.06881586710612, + "step": 445 + }, + { + "epoch": 0.16464399427806747, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.442541677859695e-06, + "logits/chosen": 1888418032.9411764, + "logits/rejected": 1410723976.5333333, + "logps/chosen": -279.02326516544116, + "logps/rejected": -406.93017578125, + "loss": 0.1867, + "rewards/chosen": 1.0029088188620174, + "rewards/margins": 7.282194504083371, + "rewards/rejected": -6.279285685221354, + "step": 446 + }, + { + "epoch": 0.16501315121591068, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.439838052599668e-06, + "logits/chosen": 2164433211.076923, + "logits/rejected": 1802583093.8947368, + "logps/chosen": -314.37691556490387, + "logps/rejected": -576.5401418585526, + "loss": 0.1103, + "rewards/chosen": 1.3447289100060096, + "rewards/margins": 10.43544160788841, + "rewards/rejected": -9.0907126978824, + "step": 447 + }, + { + "epoch": 0.16538230815375388, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.437128276000424e-06, + "logits/chosen": 1466425659.0769231, + "logits/rejected": 1750580601.2631578, + "logps/chosen": -290.70688100961536, + "logps/rejected": -557.6423725328947, + "loss": 0.1811, + "rewards/chosen": 0.3252264903141902, + "rewards/margins": 7.791055636849963, + "rewards/rejected": -7.465829146535773, + "step": 448 + }, + { + "epoch": 0.16575146509159708, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.434412351816329e-06, + "logits/chosen": 1857439428.9230769, + "logits/rejected": 2492173904.8421054, + "logps/chosen": -220.86519681490384, + "logps/rejected": -358.22049753289474, + "loss": 0.1425, + "rewards/chosen": 1.5148622072660005, + "rewards/margins": 8.31200222833919, + "rewards/rejected": -6.797140021073191, + "step": 449 + }, + { + "epoch": 0.16612062202944028, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.431690283810257e-06, + "logits/chosen": 1876025457.7777777, + "logits/rejected": 1827281042.2857144, + "logps/chosen": -311.16832139756946, + "logps/rejected": -584.4556361607143, + "loss": 0.188, + "rewards/chosen": 1.1974159876505535, + "rewards/margins": 7.976609865824382, + "rewards/rejected": -6.779193878173828, + "step": 450 + }, + { + "epoch": 0.16648977896728345, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.428962075753602e-06, + "logits/chosen": 1794954854.4, + "logits/rejected": 1651961976.4705882, + "logps/chosen": -219.48981119791668, + "logps/rejected": -511.2558019301471, + "loss": 0.1202, + "rewards/chosen": 1.911020533243815, + "rewards/margins": 8.24248182259354, + "rewards/rejected": -6.331461289349725, + "step": 451 + }, + { + "epoch": 0.16685893590512665, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.42622773142626e-06, + "logits/chosen": 2014075172.5714285, + "logits/rejected": 2313355036.4444447, + "logps/chosen": -296.16514369419644, + "logps/rejected": -478.00151909722223, + "loss": 0.1854, + "rewards/chosen": 0.7648314748491559, + "rewards/margins": 8.013518371279277, + "rewards/rejected": -7.248686896430121, + "step": 452 + }, + { + "epoch": 0.16722809284296986, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.423487254616632e-06, + "logits/chosen": 1507809664.0, + "logits/rejected": 1533274624.0, + "logps/chosen": -340.4511413574219, + "logps/rejected": -493.132568359375, + "loss": 0.1657, + "rewards/chosen": 1.3845552206039429, + "rewards/margins": 8.302598357200623, + "rewards/rejected": -6.91804313659668, + "step": 453 + }, + { + "epoch": 0.16759724978081306, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.420740649121611e-06, + "logits/chosen": 1880290417.7777777, + "logits/rejected": 1329632402.2857144, + "logps/chosen": -237.93340386284723, + "logps/rejected": -404.05698939732144, + "loss": 0.2316, + "rewards/chosen": 0.5798261430528429, + "rewards/margins": 7.000474445403569, + "rewards/rejected": -6.420648302350726, + "step": 454 + }, + { + "epoch": 0.16796640671865626, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.417987918746587e-06, + "logits/chosen": 1998635885.7142856, + "logits/rejected": 2313715712.0, + "logps/chosen": -388.2079380580357, + "logps/rejected": -539.5053168402778, + "loss": 0.1664, + "rewards/chosen": 0.744182995387486, + "rewards/margins": 9.420208386012487, + "rewards/rejected": -8.676025390625, + "step": 455 + }, + { + "epoch": 0.16833556365649946, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.41522906730543e-06, + "logits/chosen": 1481927017.4117646, + "logits/rejected": 1647460215.4666667, + "logps/chosen": -258.1116153492647, + "logps/rejected": -398.8003255208333, + "loss": 0.1879, + "rewards/chosen": 1.0994893242331112, + "rewards/margins": 7.681139104506549, + "rewards/rejected": -6.5816497802734375, + "step": 456 + }, + { + "epoch": 0.16870472059434266, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.412464098620495e-06, + "logits/chosen": 1388586780.4444444, + "logits/rejected": 1447887433.142857, + "logps/chosen": -380.5871310763889, + "logps/rejected": -404.9984654017857, + "loss": 0.219, + "rewards/chosen": 0.9128994411892362, + "rewards/margins": 6.943295493958489, + "rewards/rejected": -6.030396052769253, + "step": 457 + }, + { + "epoch": 0.16907387753218586, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.409693016522613e-06, + "logits/chosen": 1475822933.3333333, + "logits/rejected": 1880910994.2857144, + "logps/chosen": -357.24956597222223, + "logps/rejected": -345.016845703125, + "loss": 0.21, + "rewards/chosen": 1.5323240492078993, + "rewards/margins": 6.078353003850059, + "rewards/rejected": -4.54602895464216, + "step": 458 + }, + { + "epoch": 0.16944303447002906, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.40691582485108e-06, + "logits/chosen": 1580634697.142857, + "logits/rejected": 1367673628.4444444, + "logps/chosen": -349.52113560267856, + "logps/rejected": -476.46533203125, + "loss": 0.1783, + "rewards/chosen": 0.6020426068987165, + "rewards/margins": 7.91927028837658, + "rewards/rejected": -7.317227681477864, + "step": 459 + }, + { + "epoch": 0.16981219140787226, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.404132527453662e-06, + "logits/chosen": 2117280153.6, + "logits/rejected": 1796125515.2941177, + "logps/chosen": -302.4202473958333, + "logps/rejected": -554.7742417279412, + "loss": 0.1619, + "rewards/chosen": 0.9770669937133789, + "rewards/margins": 9.58953863031724, + "rewards/rejected": -8.61247163660386, + "step": 460 + }, + { + "epoch": 0.17018134834571547, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.40134312818658e-06, + "logits/chosen": 1579234560.0, + "logits/rejected": 2085654528.0, + "logps/chosen": -275.31854248046875, + "logps/rejected": -544.705810546875, + "loss": 0.1647, + "rewards/chosen": 1.4008495807647705, + "rewards/margins": 9.715579271316528, + "rewards/rejected": -8.314729690551758, + "step": 461 + }, + { + "epoch": 0.17055050528355867, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.398547630914512e-06, + "logits/chosen": 1836288097.5238094, + "logits/rejected": 1681981626.1818182, + "logps/chosen": -323.7904110863095, + "logps/rejected": -471.47860440340907, + "loss": 0.2442, + "rewards/chosen": 0.9647959754580543, + "rewards/margins": 7.631681351434617, + "rewards/rejected": -6.6668853759765625, + "step": 462 + }, + { + "epoch": 0.17091966222140187, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.395746039510585e-06, + "logits/chosen": 1478776393.142857, + "logits/rejected": 1432818119.1111112, + "logps/chosen": -231.71540178571428, + "logps/rejected": -430.23505316840277, + "loss": 0.1202, + "rewards/chosen": 1.7084026336669922, + "rewards/margins": 9.498465432061089, + "rewards/rejected": -7.790062798394097, + "step": 463 + }, + { + "epoch": 0.17128881915924507, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.392938357856367e-06, + "logits/chosen": 1837313877.3333333, + "logits/rejected": 1759940198.4, + "logps/chosen": -306.7340901692708, + "logps/rejected": -326.896484375, + "loss": 0.1661, + "rewards/chosen": 0.9637904167175293, + "rewards/margins": 6.768612957000732, + "rewards/rejected": -5.804822540283203, + "step": 464 + }, + { + "epoch": 0.17165797609708827, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.390124589841866e-06, + "logits/chosen": 1832702080.0, + "logits/rejected": 2140270592.0, + "logps/chosen": -260.9884338378906, + "logps/rejected": -644.94921875, + "loss": 0.1466, + "rewards/chosen": 1.4802207946777344, + "rewards/margins": 10.892483711242676, + "rewards/rejected": -9.412262916564941, + "step": 465 + }, + { + "epoch": 0.17202713303493147, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.387304739365524e-06, + "logits/chosen": 1832524980.7058823, + "logits/rejected": 1617883955.2, + "logps/chosen": -251.7400333180147, + "logps/rejected": -468.75738932291665, + "loss": 0.232, + "rewards/chosen": 0.5938537261065315, + "rewards/margins": 8.656721462922938, + "rewards/rejected": -8.062867736816406, + "step": 466 + }, + { + "epoch": 0.17239628997277467, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.384478810334202e-06, + "logits/chosen": 2719824715.2941175, + "logits/rejected": 2452078592.0, + "logps/chosen": -246.74488740808823, + "logps/rejected": -493.70875651041666, + "loss": 0.2345, + "rewards/chosen": 0.3798382702995749, + "rewards/margins": 8.062478345983168, + "rewards/rejected": -7.682640075683594, + "step": 467 + }, + { + "epoch": 0.17276544691061788, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.381646806663195e-06, + "logits/chosen": 1188046301.8666666, + "logits/rejected": 1470587602.8235295, + "logps/chosen": -234.28460286458332, + "logps/rejected": -427.8580537683824, + "loss": 0.1875, + "rewards/chosen": 0.8390771230061849, + "rewards/margins": 7.5509620142918, + "rewards/rejected": -6.7118848912856155, + "step": 468 + }, + { + "epoch": 0.17313460384846108, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.378808732276206e-06, + "logits/chosen": 1693333276.4444444, + "logits/rejected": 1718176182.857143, + "logps/chosen": -297.3453776041667, + "logps/rejected": -375.6416713169643, + "loss": 0.219, + "rewards/chosen": 1.1055640114678278, + "rewards/margins": 6.6584836717635865, + "rewards/rejected": -5.552919660295759, + "step": 469 + }, + { + "epoch": 0.17350376078630428, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.37596459110535e-06, + "logits/chosen": 1928345600.0, + "logits/rejected": 2211051861.3333335, + "logps/chosen": -301.3369140625, + "logps/rejected": -394.3219807942708, + "loss": 0.1633, + "rewards/chosen": 1.9237049102783204, + "rewards/margins": 8.652132797241212, + "rewards/rejected": -6.728427886962891, + "step": 470 + }, + { + "epoch": 0.17387291772414748, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.373114387091148e-06, + "logits/chosen": 1599539053.7142856, + "logits/rejected": 1522536903.1111112, + "logps/chosen": -228.62154715401786, + "logps/rejected": -406.00599500868054, + "loss": 0.1381, + "rewards/chosen": 1.2966367176600866, + "rewards/margins": 7.772359136551145, + "rewards/rejected": -6.475722418891059, + "step": 471 + }, + { + "epoch": 0.17424207466199068, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.370258124182525e-06, + "logits/chosen": 1871466086.4, + "logits/rejected": 2209802240.0, + "logps/chosen": -280.2326171875, + "logps/rejected": -600.40966796875, + "loss": 0.2434, + "rewards/chosen": 0.7489444255828858, + "rewards/margins": 9.939755582809449, + "rewards/rejected": -9.190811157226562, + "step": 472 + }, + { + "epoch": 0.17461123159983388, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.367395806336793e-06, + "logits/chosen": 2080404626.2857144, + "logits/rejected": 1564791714.909091, + "logps/chosen": -270.8211960565476, + "logps/rejected": -423.20321377840907, + "loss": 0.2198, + "rewards/chosen": 1.1135547274634952, + "rewards/margins": 7.375220071701777, + "rewards/rejected": -6.261665344238281, + "step": 473 + }, + { + "epoch": 0.17498038853767708, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.364527437519658e-06, + "logits/chosen": 1680217216.0, + "logits/rejected": 2128451456.0, + "logps/chosen": -271.66949462890625, + "logps/rejected": -426.18597412109375, + "loss": 0.2014, + "rewards/chosen": 1.0132856369018555, + "rewards/margins": 7.316222667694092, + "rewards/rejected": -6.302937030792236, + "step": 474 + }, + { + "epoch": 0.17534954547552029, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.361653021705211e-06, + "logits/chosen": 2177999592.7272725, + "logits/rejected": 2288232652.8, + "logps/chosen": -320.27978515625, + "logps/rejected": -808.631201171875, + "loss": 0.2696, + "rewards/chosen": 0.6454575712030585, + "rewards/margins": 11.774324148351496, + "rewards/rejected": -11.128866577148438, + "step": 475 + }, + { + "epoch": 0.1757187024133635, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 9.358772562875914e-06, + "logits/chosen": 1779464647.1111112, + "logits/rejected": 2097166189.7142856, + "logps/chosen": -276.9006076388889, + "logps/rejected": -389.137939453125, + "loss": 0.1909, + "rewards/chosen": 1.0232006708780925, + "rewards/margins": 7.866908709208171, + "rewards/rejected": -6.843708038330078, + "step": 476 + }, + { + "epoch": 0.1760878593512067, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.355886065022611e-06, + "logits/chosen": 1595497708.3076923, + "logits/rejected": 2594478187.7894735, + "logps/chosen": -229.4954552283654, + "logps/rejected": -412.3012952302632, + "loss": 0.1806, + "rewards/chosen": 0.6330189338097205, + "rewards/margins": 7.243601787428142, + "rewards/rejected": -6.610582853618421, + "step": 477 + }, + { + "epoch": 0.1764570162890499, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.352993532144505e-06, + "logits/chosen": 1333980160.0, + "logits/rejected": 1994839210.6666667, + "logps/chosen": -181.42242431640625, + "logps/rejected": -518.6885579427084, + "loss": 0.0867, + "rewards/chosen": 1.152406096458435, + "rewards/margins": 9.0311998128891, + "rewards/rejected": -7.878793716430664, + "step": 478 + }, + { + "epoch": 0.1768261732268931, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.350094968249163e-06, + "logits/chosen": 2194639320.6153846, + "logits/rejected": 2490766174.3157897, + "logps/chosen": -266.01705228365387, + "logps/rejected": -380.25917454769734, + "loss": 0.1785, + "rewards/chosen": 0.7653886354886569, + "rewards/margins": 6.724045749617974, + "rewards/rejected": -5.958657114129317, + "step": 479 + }, + { + "epoch": 0.1771953301647363, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.347190377352512e-06, + "logits/chosen": 1560867354.9473684, + "logits/rejected": 1299225048.6153846, + "logps/chosen": -388.36366673519734, + "logps/rejected": -436.19869290865387, + "loss": 0.223, + "rewards/chosen": 0.8749334435713919, + "rewards/margins": 6.251118934106247, + "rewards/rejected": -5.376185490534856, + "step": 480 + }, + { + "epoch": 0.1775644871025795, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.344279763478823e-06, + "logits/chosen": 2076912981.3333333, + "logits/rejected": 1763164641.8823528, + "logps/chosen": -259.7587890625, + "logps/rejected": -507.12258731617646, + "loss": 0.1529, + "rewards/chosen": 1.4007269541422527, + "rewards/margins": 8.68908799863329, + "rewards/rejected": -7.2883610444910385, + "step": 481 + }, + { + "epoch": 0.1779336440404227, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.341363130660714e-06, + "logits/chosen": 2137540608.0, + "logits/rejected": 2237207347.2, + "logps/chosen": -259.72446695963544, + "logps/rejected": -461.426708984375, + "loss": 0.1589, + "rewards/chosen": 0.6215546925862631, + "rewards/margins": 7.199357732137044, + "rewards/rejected": -6.577803039550782, + "step": 482 + }, + { + "epoch": 0.1783028009782659, + "grad_norm": 15.4375, + "kl": 0.10217571258544922, + "learning_rate": 9.338440482939146e-06, + "logits/chosen": 1521586068.2105262, + "logits/rejected": 1922938092.3076923, + "logps/chosen": -300.8343955592105, + "logps/rejected": -322.84326171875, + "loss": 0.2111, + "rewards/chosen": 0.9895760385613692, + "rewards/margins": 6.97859499232489, + "rewards/rejected": -5.989018953763521, + "step": 483 + }, + { + "epoch": 0.1786719579161091, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.335511824363407e-06, + "logits/chosen": 1551830084.2666667, + "logits/rejected": 2047619312.9411764, + "logps/chosen": -225.37298177083332, + "logps/rejected": -491.97409237132354, + "loss": 0.1481, + "rewards/chosen": 1.2445401509602865, + "rewards/margins": 7.407126138724533, + "rewards/rejected": -6.162585987764246, + "step": 484 + }, + { + "epoch": 0.1790411148539523, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.332577158991118e-06, + "logits/chosen": 1995962368.0, + "logits/rejected": 2586446409.142857, + "logps/chosen": -234.26925998263889, + "logps/rejected": -497.11959402901783, + "loss": 0.2209, + "rewards/chosen": 0.7873786290486654, + "rewards/margins": 7.336417652311779, + "rewards/rejected": -6.549039023263114, + "step": 485 + }, + { + "epoch": 0.1794102717917955, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.32963649088822e-06, + "logits/chosen": 1795679027.2, + "logits/rejected": 1622218752.0, + "logps/chosen": -297.19228515625, + "logps/rejected": -484.64171645220586, + "loss": 0.16, + "rewards/chosen": 1.0754651387532552, + "rewards/margins": 8.482411657595167, + "rewards/rejected": -7.406946518841912, + "step": 486 + }, + { + "epoch": 0.17977942872963867, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.326689824128971e-06, + "logits/chosen": 1621383168.0, + "logits/rejected": 1497072103.6190476, + "logps/chosen": -339.66914506392044, + "logps/rejected": -566.118908110119, + "loss": 0.1232, + "rewards/chosen": 1.1408760764382102, + "rewards/margins": 10.624549172141336, + "rewards/rejected": -9.483673095703125, + "step": 487 + }, + { + "epoch": 0.18014858566748188, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.323737162795941e-06, + "logits/chosen": 2212555190.857143, + "logits/rejected": 2225090787.5555553, + "logps/chosen": -234.53543526785714, + "logps/rejected": -566.3039279513889, + "loss": 0.1672, + "rewards/chosen": 0.9922229221888951, + "rewards/margins": 7.503315214126829, + "rewards/rejected": -6.511092291937934, + "step": 488 + }, + { + "epoch": 0.18051774260532508, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.320778510980004e-06, + "logits/chosen": 2285489265.7777777, + "logits/rejected": 1902621562.4347825, + "logps/chosen": -336.9425998263889, + "logps/rejected": -515.7549252717391, + "loss": 0.1061, + "rewards/chosen": 0.8516276147630479, + "rewards/margins": 8.702518728044298, + "rewards/rejected": -7.85089111328125, + "step": 489 + }, + { + "epoch": 0.18088689954316828, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.317813872780336e-06, + "logits/chosen": 1913687478.857143, + "logits/rejected": 1657847239.1111112, + "logps/chosen": -269.0014125279018, + "logps/rejected": -447.3102756076389, + "loss": 0.1313, + "rewards/chosen": 1.319936888558524, + "rewards/margins": 8.26653494153704, + "rewards/rejected": -6.946598052978516, + "step": 490 + }, + { + "epoch": 0.18125605648101148, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.314843252304405e-06, + "logits/chosen": 1760165614.9333334, + "logits/rejected": 1962293127.5294118, + "logps/chosen": -291.0530598958333, + "logps/rejected": -376.41931870404414, + "loss": 0.1784, + "rewards/chosen": 1.0503790537516275, + "rewards/margins": 6.396765233956131, + "rewards/rejected": -5.346386180204504, + "step": 491 + }, + { + "epoch": 0.18162521341885468, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.311866653667967e-06, + "logits/chosen": 1244888795.4285715, + "logits/rejected": 1571566478.2222223, + "logps/chosen": -184.14887346540178, + "logps/rejected": -386.0273708767361, + "loss": 0.1341, + "rewards/chosen": 1.8228936876569475, + "rewards/margins": 7.820430240933858, + "rewards/rejected": -5.99753655327691, + "step": 492 + }, + { + "epoch": 0.18199437035669788, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.30888408099506e-06, + "logits/chosen": 1501434733.7142856, + "logits/rejected": 2069093262.2222223, + "logps/chosen": -246.63685825892858, + "logps/rejected": -476.80946180555554, + "loss": 0.1564, + "rewards/chosen": 0.8960362161908831, + "rewards/margins": 8.771150657108851, + "rewards/rejected": -7.875114440917969, + "step": 493 + }, + { + "epoch": 0.18236352729454108, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.305895538418004e-06, + "logits/chosen": 2132647073.6842105, + "logits/rejected": 2157983271.3846154, + "logps/chosen": -325.94017269736844, + "logps/rejected": -592.6210186298077, + "loss": 0.1793, + "rewards/chosen": 1.2058826245759662, + "rewards/margins": 8.917423433620437, + "rewards/rejected": -7.711540809044471, + "step": 494 + }, + { + "epoch": 0.18273268423238428, + "grad_norm": 10.625, + "kl": 0.7403898239135742, + "learning_rate": 9.302901030077384e-06, + "logits/chosen": 1953897773.1764705, + "logits/rejected": 1632884189.8666666, + "logps/chosen": -236.97508329503677, + "logps/rejected": -434.2770182291667, + "loss": 0.1789, + "rewards/chosen": 1.3950559952679802, + "rewards/margins": 9.621203964831782, + "rewards/rejected": -8.226147969563803, + "step": 495 + }, + { + "epoch": 0.1831018411702275, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.299900560122057e-06, + "logits/chosen": 1951862393.9047618, + "logits/rejected": 2139443200.0, + "logps/chosen": -287.9765159970238, + "logps/rejected": -598.4104225852273, + "loss": 0.2039, + "rewards/chosen": 1.463614781697591, + "rewards/margins": 8.55523548704205, + "rewards/rejected": -7.09162070534446, + "step": 496 + }, + { + "epoch": 0.1834709981080707, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.296894132709134e-06, + "logits/chosen": 2160421225.4117646, + "logits/rejected": 2569863714.133333, + "logps/chosen": -259.53369140625, + "logps/rejected": -390.4531575520833, + "loss": 0.1536, + "rewards/chosen": 1.621340358958525, + "rewards/margins": 6.94680524339863, + "rewards/rejected": -5.3254648844401045, + "step": 497 + }, + { + "epoch": 0.1838401550459139, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.29388175200398e-06, + "logits/chosen": 1766330514.2857144, + "logits/rejected": 1565979534.2222223, + "logps/chosen": -284.0482177734375, + "logps/rejected": -489.21685112847223, + "loss": 0.1402, + "rewards/chosen": 1.3489065170288086, + "rewards/margins": 8.749697261386448, + "rewards/rejected": -7.400790744357639, + "step": 498 + }, + { + "epoch": 0.1842093119837571, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.290863422180211e-06, + "logits/chosen": 1795120261.5652175, + "logits/rejected": 1485407118.2222223, + "logps/chosen": -314.0444972826087, + "logps/rejected": -440.51402452256946, + "loss": 0.2479, + "rewards/chosen": 0.9329056947127633, + "rewards/margins": 7.176038815779387, + "rewards/rejected": -6.243133121066624, + "step": 499 + }, + { + "epoch": 0.1845784689216003, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.287839147419685e-06, + "logits/chosen": 1734314880.0, + "logits/rejected": 1805313280.0, + "logps/chosen": -232.840576171875, + "logps/rejected": -454.1831970214844, + "loss": 0.1561, + "rewards/chosen": 1.5997205972671509, + "rewards/margins": 9.560705304145813, + "rewards/rejected": -7.960984706878662, + "step": 500 + }, + { + "epoch": 0.1849476258594435, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.284808931912501e-06, + "logits/chosen": 2343862272.0, + "logits/rejected": 1620667520.0, + "logps/chosen": -327.5703125, + "logps/rejected": -442.1397705078125, + "loss": 0.1956, + "rewards/chosen": 0.9307717680931091, + "rewards/margins": 8.823698937892914, + "rewards/rejected": -7.892927169799805, + "step": 501 + }, + { + "epoch": 0.1853167827972867, + "grad_norm": 16.0, + "kl": 0.5581798553466797, + "learning_rate": 9.281772779856977e-06, + "logits/chosen": 2163830198.857143, + "logits/rejected": 2000974196.3636363, + "logps/chosen": -332.61358351934524, + "logps/rejected": -476.36647727272725, + "loss": 0.2553, + "rewards/chosen": 0.7307724271501813, + "rewards/margins": 8.296268698457, + "rewards/rejected": -7.565496271306818, + "step": 502 + }, + { + "epoch": 0.1856859397351299, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.278730695459664e-06, + "logits/chosen": 2037516180.2105262, + "logits/rejected": 2681786998.1538463, + "logps/chosen": -268.5090974506579, + "logps/rejected": -392.14734825721155, + "loss": 0.1554, + "rewards/chosen": 1.622494245830335, + "rewards/margins": 7.87285928301483, + "rewards/rejected": -6.250365037184495, + "step": 503 + }, + { + "epoch": 0.1860550966729731, + "grad_norm": 18.5, + "kl": 0.9719223976135254, + "learning_rate": 9.275682682935336e-06, + "logits/chosen": 1748425386.6666667, + "logits/rejected": 1918790475.2941177, + "logps/chosen": -375.046484375, + "logps/rejected": -453.5421357996324, + "loss": 0.193, + "rewards/chosen": 1.345151138305664, + "rewards/margins": 6.440502862369313, + "rewards/rejected": -5.095351724063649, + "step": 504 + }, + { + "epoch": 0.1864242536108163, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.27262874650697e-06, + "logits/chosen": 1772750506.6666667, + "logits/rejected": 1787728486.4, + "logps/chosen": -242.2025349934896, + "logps/rejected": -495.9966796875, + "loss": 0.1052, + "rewards/chosen": 1.7768511772155762, + "rewards/margins": 8.226907634735108, + "rewards/rejected": -6.450056457519532, + "step": 505 + }, + { + "epoch": 0.1867934105486595, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.269568890405762e-06, + "logits/chosen": 1557479962.9473684, + "logits/rejected": 1926980214.1538463, + "logps/chosen": -275.08760793585526, + "logps/rejected": -402.48922025240387, + "loss": 0.176, + "rewards/chosen": 1.565025530363384, + "rewards/margins": 7.244478279762422, + "rewards/rejected": -5.679452749399038, + "step": 506 + }, + { + "epoch": 0.1871625674865027, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.2665031188711e-06, + "logits/chosen": 1559975296.0, + "logits/rejected": 2149469184.0, + "logps/chosen": -211.3551025390625, + "logps/rejected": -463.6027526855469, + "loss": 0.1895, + "rewards/chosen": 0.9574509263038635, + "rewards/margins": 8.133961260318756, + "rewards/rejected": -7.176510334014893, + "step": 507 + }, + { + "epoch": 0.1875317244243459, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.263431436150571e-06, + "logits/chosen": 1693622710.857143, + "logits/rejected": 1439700423.1111112, + "logps/chosen": -296.3124302455357, + "logps/rejected": -428.69371202256946, + "loss": 0.155, + "rewards/chosen": 1.498307773045131, + "rewards/margins": 7.987213361830939, + "rewards/rejected": -6.488905588785808, + "step": 508 + }, + { + "epoch": 0.1879008813621891, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.260353846499954e-06, + "logits/chosen": 1579855631.0588236, + "logits/rejected": 1536109772.8, + "logps/chosen": -287.30221737132354, + "logps/rejected": -432.1041666666667, + "loss": 0.1974, + "rewards/chosen": 0.8192945368149701, + "rewards/margins": 8.274596966014188, + "rewards/rejected": -7.455302429199219, + "step": 509 + }, + { + "epoch": 0.1882700383000323, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.257270354183212e-06, + "logits/chosen": 1602002189.4736843, + "logits/rejected": 1682813085.5384614, + "logps/chosen": -360.8655941611842, + "logps/rejected": -430.3850661057692, + "loss": 0.1526, + "rewards/chosen": 2.0253319991262337, + "rewards/margins": 10.72539937254871, + "rewards/rejected": -8.700067373422476, + "step": 510 + }, + { + "epoch": 0.1886391952378755, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.254180963472478e-06, + "logits/chosen": 1536171212.8, + "logits/rejected": 2071653436.235294, + "logps/chosen": -224.76726888020832, + "logps/rejected": -490.26941636029414, + "loss": 0.1261, + "rewards/chosen": 1.5379498799641926, + "rewards/margins": 8.906932531618605, + "rewards/rejected": -7.368982651654412, + "step": 511 + }, + { + "epoch": 0.1890083521757187, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.251085678648072e-06, + "logits/chosen": 1429697877.3333333, + "logits/rejected": 2144477476.5714285, + "logps/chosen": -338.524658203125, + "logps/rejected": -478.64208984375, + "loss": 0.1882, + "rewards/chosen": 1.0394471486409504, + "rewards/margins": 8.476882298787435, + "rewards/rejected": -7.437435150146484, + "step": 512 + }, + { + "epoch": 0.1893775091135619, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.247984503998466e-06, + "logits/chosen": 2323327096.470588, + "logits/rejected": 1814560358.4, + "logps/chosen": -203.08531996783088, + "logps/rejected": -510.0996419270833, + "loss": 0.1886, + "rewards/chosen": 1.1141486448400162, + "rewards/margins": 8.385692020491057, + "rewards/rejected": -7.271543375651041, + "step": 513 + }, + { + "epoch": 0.1897466660514051, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.2448774438203e-06, + "logits/chosen": 1847028480.0, + "logits/rejected": 2033866752.0, + "logps/chosen": -270.27276611328125, + "logps/rejected": -645.2341918945312, + "loss": 0.176, + "rewards/chosen": 1.5080769062042236, + "rewards/margins": 9.613614320755005, + "rewards/rejected": -8.105537414550781, + "step": 514 + }, + { + "epoch": 0.1901158229892483, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.241764502418365e-06, + "logits/chosen": 1933643776.0, + "logits/rejected": 2672773443.368421, + "logps/chosen": -319.96585787259613, + "logps/rejected": -610.021638569079, + "loss": 0.1894, + "rewards/chosen": 0.6091128496023325, + "rewards/margins": 6.803653174566353, + "rewards/rejected": -6.194540324964021, + "step": 515 + }, + { + "epoch": 0.1904849799270915, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.238645684105606e-06, + "logits/chosen": 1463426779.4285715, + "logits/rejected": 1637662947.5555556, + "logps/chosen": -187.28548758370536, + "logps/rejected": -489.7932400173611, + "loss": 0.1173, + "rewards/chosen": 2.0201211656842912, + "rewards/margins": 9.265878374614413, + "rewards/rejected": -7.245757208930121, + "step": 516 + }, + { + "epoch": 0.19085413686493471, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.2355209932031e-06, + "logits/chosen": 2076992170.6666667, + "logits/rejected": 1505105920.0, + "logps/chosen": -251.32462565104166, + "logps/rejected": -365.79561941964283, + "loss": 0.1951, + "rewards/chosen": 1.523885515001085, + "rewards/margins": 7.000006600031777, + "rewards/rejected": -5.476121085030692, + "step": 517 + }, + { + "epoch": 0.19122329380277792, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.232390434040071e-06, + "logits/chosen": 1723503411.2, + "logits/rejected": 1271525376.0, + "logps/chosen": -257.61630859375, + "logps/rejected": -454.79833984375, + "loss": 0.2204, + "rewards/chosen": 1.1533077239990235, + "rewards/margins": 9.002512105305989, + "rewards/rejected": -7.849204381306966, + "step": 518 + }, + { + "epoch": 0.19159245074062112, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.229254010953868e-06, + "logits/chosen": 2019313904.9411764, + "logits/rejected": 1638672247.4666667, + "logps/chosen": -271.79541015625, + "logps/rejected": -538.7889973958333, + "loss": 0.1815, + "rewards/chosen": 1.1583520103903377, + "rewards/margins": 9.723732869765339, + "rewards/rejected": -8.565380859375, + "step": 519 + }, + { + "epoch": 0.19196160767846432, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.226111728289963e-06, + "logits/chosen": 2070273365.3333333, + "logits/rejected": 2225796505.6, + "logps/chosen": -237.7210489908854, + "logps/rejected": -393.187060546875, + "loss": 0.1627, + "rewards/chosen": 0.5267066955566406, + "rewards/margins": 7.491919708251953, + "rewards/rejected": -6.965213012695313, + "step": 520 + }, + { + "epoch": 0.19233076461630752, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.222963590401953e-06, + "logits/chosen": 1506792999.3846154, + "logits/rejected": 1365210812.631579, + "logps/chosen": -337.86328125, + "logps/rejected": -517.5747327302631, + "loss": 0.1248, + "rewards/chosen": 1.7331085205078125, + "rewards/margins": 8.621051989103618, + "rewards/rejected": -6.887943468595806, + "step": 521 + }, + { + "epoch": 0.19269992155415072, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.21980960165154e-06, + "logits/chosen": 1434353408.0, + "logits/rejected": 1940198144.0, + "logps/chosen": -328.9886779785156, + "logps/rejected": -441.9497375488281, + "loss": 0.1721, + "rewards/chosen": 1.191384196281433, + "rewards/margins": 8.249430537223816, + "rewards/rejected": -7.058046340942383, + "step": 522 + }, + { + "epoch": 0.1930690784919939, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.216649766408536e-06, + "logits/chosen": 1948139929.6, + "logits/rejected": 2137723843.764706, + "logps/chosen": -283.740625, + "logps/rejected": -461.90935202205884, + "loss": 0.1831, + "rewards/chosen": 1.0980261484781901, + "rewards/margins": 8.45220553080241, + "rewards/rejected": -7.354179382324219, + "step": 523 + }, + { + "epoch": 0.1934382354298371, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.213484089050853e-06, + "logits/chosen": 1227787324.235294, + "logits/rejected": 1396557141.3333333, + "logps/chosen": -222.26766429227942, + "logps/rejected": -410.88671875, + "loss": 0.1888, + "rewards/chosen": 1.3342289644129135, + "rewards/margins": 10.131850627824372, + "rewards/rejected": -8.797621663411459, + "step": 524 + }, + { + "epoch": 0.1938073923676803, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.210312573964496e-06, + "logits/chosen": 1975815976.4210527, + "logits/rejected": 2138181947.0769231, + "logps/chosen": -286.6119449013158, + "logps/rejected": -434.5, + "loss": 0.2473, + "rewards/chosen": 0.774317741394043, + "rewards/margins": 7.7571216729971075, + "rewards/rejected": -6.982803931603065, + "step": 525 + }, + { + "epoch": 0.1941765493055235, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.207135225543557e-06, + "logits/chosen": 2058776094.1176472, + "logits/rejected": 1696930065.0666666, + "logps/chosen": -272.95211971507354, + "logps/rejected": -418.25413411458334, + "loss": 0.1907, + "rewards/chosen": 1.1297698301427506, + "rewards/margins": 7.756622247134938, + "rewards/rejected": -6.626852416992188, + "step": 526 + }, + { + "epoch": 0.1945457062433667, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.203952048190217e-06, + "logits/chosen": 1456261802.6666667, + "logits/rejected": 1508054308.5714285, + "logps/chosen": -273.5607638888889, + "logps/rejected": -459.2677525111607, + "loss": 0.1944, + "rewards/chosen": 0.9496177037556967, + "rewards/margins": 8.545520282927013, + "rewards/rejected": -7.595902579171317, + "step": 527 + }, + { + "epoch": 0.1949148631812099, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.200763046314725e-06, + "logits/chosen": 1726218480.9411764, + "logits/rejected": 2044894276.2666667, + "logps/chosen": -251.31692325367646, + "logps/rejected": -491.96669921875, + "loss": 0.1647, + "rewards/chosen": 1.305674721212948, + "rewards/margins": 8.819770281922583, + "rewards/rejected": -7.514095560709635, + "step": 528 + }, + { + "epoch": 0.1952840201190531, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.197568224335401e-06, + "logits/chosen": 2031852604.235294, + "logits/rejected": 2154188253.866667, + "logps/chosen": -327.15558938419116, + "logps/rejected": -386.21259765625, + "loss": 0.156, + "rewards/chosen": 1.687422696281882, + "rewards/margins": 8.830508938957664, + "rewards/rejected": -7.143086242675781, + "step": 529 + }, + { + "epoch": 0.1956531770568963, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.194367586678634e-06, + "logits/chosen": 1813981440.0, + "logits/rejected": 1483889152.0, + "logps/chosen": -263.4017333984375, + "logps/rejected": -545.5291748046875, + "loss": 0.1329, + "rewards/chosen": 1.5658973455429077, + "rewards/margins": 10.041785597801208, + "rewards/rejected": -8.4758882522583, + "step": 530 + }, + { + "epoch": 0.1960223339947395, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.191161137778861e-06, + "logits/chosen": 1460036461.7142856, + "logits/rejected": 2147167345.7777777, + "logps/chosen": -230.32803780691964, + "logps/rejected": -490.8362087673611, + "loss": 0.1394, + "rewards/chosen": 1.4165964126586914, + "rewards/margins": 9.720474349127876, + "rewards/rejected": -8.303877936469185, + "step": 531 + }, + { + "epoch": 0.1963914909325827, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.187948882078582e-06, + "logits/chosen": 2209746688.0, + "logits/rejected": 2317312256.0, + "logps/chosen": -290.333984375, + "logps/rejected": -435.89013671875, + "loss": 0.1986, + "rewards/chosen": 0.5478121638298035, + "rewards/margins": 8.063519179821014, + "rewards/rejected": -7.515707015991211, + "step": 532 + }, + { + "epoch": 0.1967606478704259, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.184730824028334e-06, + "logits/chosen": 1651792256.0, + "logits/rejected": 1720465536.0, + "logps/chosen": -273.79901123046875, + "logps/rejected": -401.3411865234375, + "loss": 0.1557, + "rewards/chosen": 1.341324806213379, + "rewards/margins": 9.602961540222168, + "rewards/rejected": -8.261636734008789, + "step": 533 + }, + { + "epoch": 0.1971298048082691, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.181506968086696e-06, + "logits/chosen": 2016805683.2, + "logits/rejected": 1754403498.6666667, + "logps/chosen": -191.06611328125, + "logps/rejected": -412.4502360026042, + "loss": 0.2163, + "rewards/chosen": 1.2251964569091798, + "rewards/margins": 7.654104487101238, + "rewards/rejected": -6.428908030192058, + "step": 534 + }, + { + "epoch": 0.1974989617461123, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.178277318720279e-06, + "logits/chosen": 1795798766.9333334, + "logits/rejected": 1837004559.0588236, + "logps/chosen": -263.09851888020836, + "logps/rejected": -435.7174287683824, + "loss": 0.1544, + "rewards/chosen": 1.2812094370524088, + "rewards/margins": 7.581780654308843, + "rewards/rejected": -6.300571217256434, + "step": 535 + }, + { + "epoch": 0.1978681186839555, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.175041880403721e-06, + "logits/chosen": 2105398452.7058823, + "logits/rejected": 1484027630.9333334, + "logps/chosen": -196.0568129595588, + "logps/rejected": -516.4480143229167, + "loss": 0.1913, + "rewards/chosen": 0.9719672483556411, + "rewards/margins": 9.985456017886891, + "rewards/rejected": -9.01348876953125, + "step": 536 + }, + { + "epoch": 0.19823727562179871, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.171800657619683e-06, + "logits/chosen": 2277996701.5384617, + "logits/rejected": 2253424316.631579, + "logps/chosen": -312.363037109375, + "logps/rejected": -451.5133120888158, + "loss": 0.1669, + "rewards/chosen": 0.6535064990703876, + "rewards/margins": 8.331756460521868, + "rewards/rejected": -7.67824996145148, + "step": 537 + }, + { + "epoch": 0.19860643255964192, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.168553654858834e-06, + "logits/chosen": 1963455360.0, + "logits/rejected": 1795472256.0, + "logps/chosen": -280.07733154296875, + "logps/rejected": -462.04681396484375, + "loss": 0.157, + "rewards/chosen": 1.5113461017608643, + "rewards/margins": 8.006688833236694, + "rewards/rejected": -6.49534273147583, + "step": 538 + }, + { + "epoch": 0.19897558949748512, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.165300876619857e-06, + "logits/chosen": 2262503619.047619, + "logits/rejected": 1666511778.909091, + "logps/chosen": -296.01922898065476, + "logps/rejected": -477.92751242897725, + "loss": 0.2336, + "rewards/chosen": 0.9710443587530226, + "rewards/margins": 7.999099029607071, + "rewards/rejected": -7.028054670854048, + "step": 539 + }, + { + "epoch": 0.19934474643532832, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.162042327409437e-06, + "logits/chosen": 2047733037.1764705, + "logits/rejected": 1576008635.7333333, + "logps/chosen": -258.44054457720586, + "logps/rejected": -452.567578125, + "loss": 0.1145, + "rewards/chosen": 1.7968803854549633, + "rewards/margins": 9.497731825884651, + "rewards/rejected": -7.7008514404296875, + "step": 540 + }, + { + "epoch": 0.19971390337317152, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.15877801174225e-06, + "logits/chosen": 2478298180.266667, + "logits/rejected": 2671404935.529412, + "logps/chosen": -315.15989583333334, + "logps/rejected": -444.2377355238971, + "loss": 0.1427, + "rewards/chosen": 1.4257303873697917, + "rewards/margins": 7.939315197514553, + "rewards/rejected": -6.513584810144761, + "step": 541 + }, + { + "epoch": 0.20008306031101472, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.155507934140962e-06, + "logits/chosen": 1719702905.2631578, + "logits/rejected": 1912967168.0, + "logps/chosen": -236.93313116776315, + "logps/rejected": -471.5680964543269, + "loss": 0.1785, + "rewards/chosen": 1.284210205078125, + "rewards/margins": 10.377217806302584, + "rewards/rejected": -9.093007601224459, + "step": 542 + }, + { + "epoch": 0.20045221724885792, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.152232099136227e-06, + "logits/chosen": 1952822613.3333333, + "logits/rejected": 1566212096.0, + "logps/chosen": -322.4392578125, + "logps/rejected": -393.16745174632354, + "loss": 0.1792, + "rewards/chosen": 1.3511577606201173, + "rewards/margins": 7.851636168536018, + "rewards/rejected": -6.5004784079159, + "step": 543 + }, + { + "epoch": 0.20082137418670112, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.148950511266674e-06, + "logits/chosen": 1664007509.3333333, + "logits/rejected": 1545862348.8, + "logps/chosen": -291.77972412109375, + "logps/rejected": -448.739453125, + "loss": 0.1312, + "rewards/chosen": 1.9782946904500325, + "rewards/margins": 7.589177640279134, + "rewards/rejected": -5.610882949829102, + "step": 544 + }, + { + "epoch": 0.20119053112454433, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.1456631750789e-06, + "logits/chosen": 2050768640.0, + "logits/rejected": 1730386688.0, + "logps/chosen": -316.289794921875, + "logps/rejected": -456.49554443359375, + "loss": 0.2026, + "rewards/chosen": 0.947542130947113, + "rewards/margins": 6.61425107717514, + "rewards/rejected": -5.666708946228027, + "step": 545 + }, + { + "epoch": 0.20155968806238753, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.142370095127465e-06, + "logits/chosen": 1590782361.6, + "logits/rejected": 2142522187.2941177, + "logps/chosen": -316.83128255208334, + "logps/rejected": -556.9126263786765, + "loss": 0.136, + "rewards/chosen": 1.432281239827474, + "rewards/margins": 10.470899439793008, + "rewards/rejected": -9.038618199965534, + "step": 546 + }, + { + "epoch": 0.20192884500023073, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.139071275974894e-06, + "logits/chosen": 1994018028.3076923, + "logits/rejected": 2116788008.4210527, + "logps/chosen": -310.19933143028845, + "logps/rejected": -401.5547645970395, + "loss": 0.118, + "rewards/chosen": 1.5993235661433294, + "rewards/margins": 8.001874514436915, + "rewards/rejected": -6.402550948293586, + "step": 547 + }, + { + "epoch": 0.20229800193807393, + "grad_norm": 13.5, + "kl": 0.5224275588989258, + "learning_rate": 9.135766722191655e-06, + "logits/chosen": 1761064960.0, + "logits/rejected": 1998165138.2857144, + "logps/chosen": -268.2634548611111, + "logps/rejected": -537.3767787388393, + "loss": 0.1693, + "rewards/chosen": 1.641847398546007, + "rewards/margins": 8.541684347485738, + "rewards/rejected": -6.899836948939732, + "step": 548 + }, + { + "epoch": 0.20266715887591713, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.132456438356165e-06, + "logits/chosen": 1566293040.7619047, + "logits/rejected": 1479971840.0, + "logps/chosen": -240.08909970238096, + "logps/rejected": -422.41122159090907, + "loss": 0.1641, + "rewards/chosen": 1.7491017296200706, + "rewards/margins": 10.319969094676889, + "rewards/rejected": -8.570867365056818, + "step": 549 + }, + { + "epoch": 0.20303631581376033, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.129140429054785e-06, + "logits/chosen": 1665969545.8461537, + "logits/rejected": 2158504474.9473686, + "logps/chosen": -350.14832481971155, + "logps/rejected": -565.6294202302631, + "loss": 0.1375, + "rewards/chosen": 1.0363346980168269, + "rewards/margins": 9.660001561709262, + "rewards/rejected": -8.623666863692435, + "step": 550 + }, + { + "epoch": 0.20340547275160353, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.125818698881798e-06, + "logits/chosen": 2100261091.5555556, + "logits/rejected": 1602453650.2857144, + "logps/chosen": -374.42222764756946, + "logps/rejected": -444.42703683035717, + "loss": 0.2465, + "rewards/chosen": 0.46811405817667645, + "rewards/margins": 7.718553838275728, + "rewards/rejected": -7.250439780099051, + "step": 551 + }, + { + "epoch": 0.20377462968944673, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.122491252439425e-06, + "logits/chosen": 1712183175.5294118, + "logits/rejected": 1576757248.0, + "logps/chosen": -247.01490693933823, + "logps/rejected": -373.6669596354167, + "loss": 0.1459, + "rewards/chosen": 1.5472046347225414, + "rewards/margins": 6.945182336545457, + "rewards/rejected": -5.397977701822916, + "step": 552 + }, + { + "epoch": 0.20414378662728994, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.119158094337794e-06, + "logits/chosen": 1938288867.5555556, + "logits/rejected": 1805902848.0, + "logps/chosen": -292.389404296875, + "logps/rejected": -692.8347516741071, + "loss": 0.2384, + "rewards/chosen": 0.642407152387831, + "rewards/margins": 10.807107251787942, + "rewards/rejected": -10.164700099400111, + "step": 553 + }, + { + "epoch": 0.20451294356513314, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.11581922919496e-06, + "logits/chosen": 2167668931.047619, + "logits/rejected": 2191192436.3636365, + "logps/chosen": -275.5000465029762, + "logps/rejected": -599.0743519176136, + "loss": 0.2574, + "rewards/chosen": 0.6368027641659691, + "rewards/margins": 9.732010672102758, + "rewards/rejected": -9.09520790793679, + "step": 554 + }, + { + "epoch": 0.20488210050297634, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.112474661636871e-06, + "logits/chosen": 1644161536.0, + "logits/rejected": 1943716249.6, + "logps/chosen": -185.45072428385416, + "logps/rejected": -418.820947265625, + "loss": 0.1206, + "rewards/chosen": 1.3512927691141765, + "rewards/margins": 6.789508406321208, + "rewards/rejected": -5.438215637207032, + "step": 555 + }, + { + "epoch": 0.20525125744081954, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.109124396297391e-06, + "logits/chosen": 1545126456.8888888, + "logits/rejected": 1875554304.0, + "logps/chosen": -204.2105712890625, + "logps/rejected": -493.33328683035717, + "loss": 0.177, + "rewards/chosen": 1.603835317823622, + "rewards/margins": 9.379460274227082, + "rewards/rejected": -7.77562495640346, + "step": 556 + }, + { + "epoch": 0.20562041437866274, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.105768437818263e-06, + "logits/chosen": 1924836944.8421052, + "logits/rejected": 1989770791.3846154, + "logps/chosen": -386.01112767269734, + "logps/rejected": -473.62349759615387, + "loss": 0.1918, + "rewards/chosen": 1.2474373265316612, + "rewards/margins": 7.677802668891937, + "rewards/rejected": -6.430365342360276, + "step": 557 + }, + { + "epoch": 0.20598957131650594, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.102406790849129e-06, + "logits/chosen": 1754000653.4736843, + "logits/rejected": 2242638611.6923075, + "logps/chosen": -299.9478053042763, + "logps/rejected": -591.8624924879807, + "loss": 0.2149, + "rewards/chosen": 1.003880450600072, + "rewards/margins": 9.462475726478978, + "rewards/rejected": -8.458595275878906, + "step": 558 + }, + { + "epoch": 0.20635872825434912, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.099039460047506e-06, + "logits/chosen": 1713453056.0, + "logits/rejected": 1666205559.4666667, + "logps/chosen": -239.04262408088235, + "logps/rejected": -407.6943033854167, + "loss": 0.186, + "rewards/chosen": 1.0664082695456112, + "rewards/margins": 7.2653849134258195, + "rewards/rejected": -6.198976643880209, + "step": 559 + }, + { + "epoch": 0.20672788519219232, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.09566645007879e-06, + "logits/chosen": 2019744654.2222223, + "logits/rejected": 2054881572.5714285, + "logps/chosen": -373.71446397569446, + "logps/rejected": -401.4518345424107, + "loss": 0.1705, + "rewards/chosen": 1.4559410942925348, + "rewards/margins": 9.07830077882797, + "rewards/rejected": -7.622359684535435, + "step": 560 + }, + { + "epoch": 0.20709704213003552, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.09228776561624e-06, + "logits/chosen": 1899064481.6842105, + "logits/rejected": 1670445528.6153846, + "logps/chosen": -327.5763003700658, + "logps/rejected": -514.2982271634615, + "loss": 0.203, + "rewards/chosen": 1.0184479763633327, + "rewards/margins": 8.24529335083749, + "rewards/rejected": -7.226845374474158, + "step": 561 + }, + { + "epoch": 0.20746619906787872, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.088903411340985e-06, + "logits/chosen": 1573458602.6666667, + "logits/rejected": 1942682038.857143, + "logps/chosen": -269.1371799045139, + "logps/rejected": -461.8994838169643, + "loss": 0.1754, + "rewards/chosen": 1.3602530161539714, + "rewards/margins": 8.284960156395321, + "rewards/rejected": -6.924707140241351, + "step": 562 + }, + { + "epoch": 0.20783535600572192, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.085513391942003e-06, + "logits/chosen": 1709549112.8888888, + "logits/rejected": 1265875090.2857144, + "logps/chosen": -268.36634657118054, + "logps/rejected": -443.36704799107144, + "loss": 0.1792, + "rewards/chosen": 1.6182136535644531, + "rewards/margins": 9.469799041748047, + "rewards/rejected": -7.851585388183594, + "step": 563 + }, + { + "epoch": 0.20820451294356512, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.08211771211612e-06, + "logits/chosen": 1362971306.6666667, + "logits/rejected": 1379651945.4117646, + "logps/chosen": -299.9892252604167, + "logps/rejected": -463.3130744485294, + "loss": 0.1783, + "rewards/chosen": 1.0788370768229167, + "rewards/margins": 9.094118305281096, + "rewards/rejected": -8.01528122845818, + "step": 564 + }, + { + "epoch": 0.20857366988140832, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.078716376568011e-06, + "logits/chosen": 1803551597.7142856, + "logits/rejected": 2502470314.6666665, + "logps/chosen": -199.92628696986608, + "logps/rejected": -503.97846137152777, + "loss": 0.1049, + "rewards/chosen": 1.9145941053118025, + "rewards/margins": 8.923256011236282, + "rewards/rejected": -7.0086619059244795, + "step": 565 + }, + { + "epoch": 0.20894282681925153, + "grad_norm": 22.75, + "kl": 3.969576358795166, + "learning_rate": 9.075309390010182e-06, + "logits/chosen": 2006638376.4210527, + "logits/rejected": 1806391768.6153846, + "logps/chosen": -269.13204152960526, + "logps/rejected": -466.76998197115387, + "loss": 0.2186, + "rewards/chosen": 1.7497229325143915, + "rewards/margins": 6.704926649085905, + "rewards/rejected": -4.955203716571514, + "step": 566 + }, + { + "epoch": 0.20931198375709473, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.07189675716297e-06, + "logits/chosen": 1584099514.1818182, + "logits/rejected": 1690752731.4285715, + "logps/chosen": -184.32218794389203, + "logps/rejected": -573.49609375, + "loss": 0.0659, + "rewards/chosen": 3.0985742048783735, + "rewards/margins": 11.60654728546803, + "rewards/rejected": -8.507973080589657, + "step": 567 + }, + { + "epoch": 0.20968114069493793, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.068478482754532e-06, + "logits/chosen": 2399958481.4545455, + "logits/rejected": 2145268687.2380953, + "logps/chosen": -313.9454900568182, + "logps/rejected": -536.1988467261905, + "loss": 0.1317, + "rewards/chosen": 0.9548492431640625, + "rewards/margins": 7.847540719168527, + "rewards/rejected": -6.892691476004464, + "step": 568 + }, + { + "epoch": 0.21005029763278113, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.065054571520846e-06, + "logits/chosen": 1584595416.6153846, + "logits/rejected": 1301092783.1578948, + "logps/chosen": -318.8743239182692, + "logps/rejected": -373.0796155427632, + "loss": 0.1537, + "rewards/chosen": 0.938760023850661, + "rewards/margins": 7.361528064557898, + "rewards/rejected": -6.422768040707237, + "step": 569 + }, + { + "epoch": 0.21041945457062433, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.061625028205699e-06, + "logits/chosen": 1455776699.7333333, + "logits/rejected": 1304073276.235294, + "logps/chosen": -280.13551432291666, + "logps/rejected": -432.4692957261029, + "loss": 0.1366, + "rewards/chosen": 1.6975513458251954, + "rewards/margins": 9.96963440390194, + "rewards/rejected": -8.272083058076745, + "step": 570 + }, + { + "epoch": 0.21078861150846753, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.058189857560675e-06, + "logits/chosen": 1510337929.8461537, + "logits/rejected": 2164830423.5789475, + "logps/chosen": -318.91011868990387, + "logps/rejected": -455.4263466282895, + "loss": 0.1296, + "rewards/chosen": 1.1510832859919622, + "rewards/margins": 9.861998299355449, + "rewards/rejected": -8.710915013363486, + "step": 571 + }, + { + "epoch": 0.21115776844631073, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.054749064345165e-06, + "logits/chosen": 1941602304.0, + "logits/rejected": 2153375744.0, + "logps/chosen": -290.41527035361844, + "logps/rejected": -569.1043419471154, + "loss": 0.2105, + "rewards/chosen": 1.2930553837826377, + "rewards/margins": 9.11125630212699, + "rewards/rejected": -7.818200918344351, + "step": 572 + }, + { + "epoch": 0.21152692538415394, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.05130265332634e-06, + "logits/chosen": 1237256738.1333334, + "logits/rejected": 1572677391.0588236, + "logps/chosen": -253.20830078125, + "logps/rejected": -439.21989889705884, + "loss": 0.1441, + "rewards/chosen": 1.5978641510009766, + "rewards/margins": 9.366781459135169, + "rewards/rejected": -7.768917308134191, + "step": 573 + }, + { + "epoch": 0.21189608232199714, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.04785062927916e-06, + "logits/chosen": 1750054473.142857, + "logits/rejected": 1767066624.0, + "logps/chosen": -289.7642299107143, + "logps/rejected": -438.2781575520833, + "loss": 0.1207, + "rewards/chosen": 1.5603437423706055, + "rewards/margins": 8.245320108201769, + "rewards/rejected": -6.684976365831163, + "step": 574 + }, + { + "epoch": 0.21226523925984034, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.04439299698636e-06, + "logits/chosen": 2106680320.0, + "logits/rejected": 2149581255.111111, + "logps/chosen": -217.13985770089286, + "logps/rejected": -511.48442925347223, + "loss": 0.1674, + "rewards/chosen": 0.6998860495431083, + "rewards/margins": 9.178852656530955, + "rewards/rejected": -8.478966606987846, + "step": 575 + }, + { + "epoch": 0.21263439619768354, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.040929761238448e-06, + "logits/chosen": 1783858907.4285715, + "logits/rejected": 1612699875.5555556, + "logps/chosen": -364.8817661830357, + "logps/rejected": -416.44829644097223, + "loss": 0.131, + "rewards/chosen": 1.5877128328595842, + "rewards/margins": 9.273552167983283, + "rewards/rejected": -7.685839335123698, + "step": 576 + }, + { + "epoch": 0.21300355313552674, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.03746092683369e-06, + "logits/chosen": 2298371120.7619047, + "logits/rejected": 1714261457.4545455, + "logps/chosen": -313.3645833333333, + "logps/rejected": -654.8312322443181, + "loss": 0.2546, + "rewards/chosen": 0.5874936694190616, + "rewards/margins": 7.647399675278437, + "rewards/rejected": -7.059906005859375, + "step": 577 + }, + { + "epoch": 0.21337271007336994, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.033986498578113e-06, + "logits/chosen": 1510029870.5454545, + "logits/rejected": 1497066496.0, + "logps/chosen": -245.35182883522728, + "logps/rejected": -401.3447998046875, + "loss": 0.2181, + "rewards/chosen": 1.1192637356844815, + "rewards/margins": 7.7077639493075285, + "rewards/rejected": -6.588500213623047, + "step": 578 + }, + { + "epoch": 0.21374186701121314, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.030506481285495e-06, + "logits/chosen": 1725962103.4666667, + "logits/rejected": 1504510192.9411764, + "logps/chosen": -308.36188151041665, + "logps/rejected": -474.1260340073529, + "loss": 0.1923, + "rewards/chosen": 0.859922981262207, + "rewards/margins": 8.62186612521901, + "rewards/rejected": -7.761943143956802, + "step": 579 + }, + { + "epoch": 0.21411102394905634, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.027020879777354e-06, + "logits/chosen": 1494290304.0, + "logits/rejected": 1917537280.0, + "logps/chosen": -259.0396423339844, + "logps/rejected": -476.64996337890625, + "loss": 0.1942, + "rewards/chosen": 0.7355548143386841, + "rewards/margins": 7.97956907749176, + "rewards/rejected": -7.244014263153076, + "step": 580 + }, + { + "epoch": 0.21448018088689955, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.023529698882946e-06, + "logits/chosen": 1687045504.0, + "logits/rejected": 2050980352.0, + "logps/chosen": -293.108642578125, + "logps/rejected": -401.1996154785156, + "loss": 0.1879, + "rewards/chosen": 1.0889683961868286, + "rewards/margins": 8.064456582069397, + "rewards/rejected": -6.975488185882568, + "step": 581 + }, + { + "epoch": 0.21484933782474275, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.020032943439258e-06, + "logits/chosen": 1862757139.6923077, + "logits/rejected": 2083863713.6842105, + "logps/chosen": -312.3418156550481, + "logps/rejected": -479.86703330592104, + "loss": 0.1783, + "rewards/chosen": 0.5547563479496882, + "rewards/margins": 8.033224590394179, + "rewards/rejected": -7.47846824244449, + "step": 582 + }, + { + "epoch": 0.21521849476258595, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.016530618291001e-06, + "logits/chosen": 1595973290.6666667, + "logits/rejected": 1898930790.4, + "logps/chosen": -337.49843343098956, + "logps/rejected": -532.58046875, + "loss": 0.1019, + "rewards/chosen": 1.6104737917582195, + "rewards/margins": 10.200187842051188, + "rewards/rejected": -8.589714050292969, + "step": 583 + }, + { + "epoch": 0.21558765170042915, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.013022728290604e-06, + "logits/chosen": 1673675605.3333333, + "logits/rejected": 1419395276.8, + "logps/chosen": -352.8475748697917, + "logps/rejected": -495.93935546875, + "loss": 0.1727, + "rewards/chosen": 0.8103640874226888, + "rewards/margins": 8.080707867940268, + "rewards/rejected": -7.270343780517578, + "step": 584 + }, + { + "epoch": 0.21595680863827235, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.009509278298201e-06, + "logits/chosen": 1310180420.2666667, + "logits/rejected": 1336335781.6470587, + "logps/chosen": -274.98138020833335, + "logps/rejected": -423.38786764705884, + "loss": 0.186, + "rewards/chosen": 0.7402849833170573, + "rewards/margins": 7.555808467491, + "rewards/rejected": -6.815523484173943, + "step": 585 + }, + { + "epoch": 0.21632596557611555, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.005990273181631e-06, + "logits/chosen": 1902469266.2857144, + "logits/rejected": 1657869994.6666667, + "logps/chosen": -287.6361781529018, + "logps/rejected": -411.61829969618054, + "loss": 0.1774, + "rewards/chosen": 0.8822000367300851, + "rewards/margins": 8.568849783095102, + "rewards/rejected": -7.686649746365017, + "step": 586 + }, + { + "epoch": 0.21669512251395875, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.002465717816436e-06, + "logits/chosen": 2128489418.1052632, + "logits/rejected": 1495049294.7692308, + "logps/chosen": -300.53140419407896, + "logps/rejected": -517.9981971153846, + "loss": 0.2234, + "rewards/chosen": 0.8977372018914473, + "rewards/margins": 9.218170598450943, + "rewards/rejected": -8.320433396559496, + "step": 587 + }, + { + "epoch": 0.21706427945180196, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 8.998935617085837e-06, + "logits/chosen": 2116407432.5333333, + "logits/rejected": 1388392448.0, + "logps/chosen": -209.25735677083333, + "logps/rejected": -495.6539522058824, + "loss": 0.203, + "rewards/chosen": 0.5550373713175456, + "rewards/margins": 7.457597437091902, + "rewards/rejected": -6.902560065774357, + "step": 588 + }, + { + "epoch": 0.21743343638964516, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 8.995399975880749e-06, + "logits/chosen": 1931104496.9411764, + "logits/rejected": 1907063739.7333333, + "logps/chosen": -330.6831916360294, + "logps/rejected": -584.0557942708333, + "loss": 0.1705, + "rewards/chosen": 1.3296071220846737, + "rewards/margins": 7.730576563816445, + "rewards/rejected": -6.400969441731771, + "step": 589 + }, + { + "epoch": 0.21780259332748836, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.991858799099755e-06, + "logits/chosen": 1355156299.2941177, + "logits/rejected": 1596645102.9333334, + "logps/chosen": -261.05534811580884, + "logps/rejected": -470.28310546875, + "loss": 0.187, + "rewards/chosen": 0.9710822385900161, + "rewards/margins": 8.149877608056162, + "rewards/rejected": -7.178795369466146, + "step": 590 + }, + { + "epoch": 0.21817175026533156, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.98831209164911e-06, + "logits/chosen": 2427628150.1538463, + "logits/rejected": 1891787614.3157895, + "logps/chosen": -272.9636418269231, + "logps/rejected": -424.48057154605266, + "loss": 0.1475, + "rewards/chosen": 1.3483134049635668, + "rewards/margins": 7.458520008967473, + "rewards/rejected": -6.110206604003906, + "step": 591 + }, + { + "epoch": 0.21854090720317476, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.984759858442734e-06, + "logits/chosen": 1370383587.5555556, + "logits/rejected": 1658954605.7142856, + "logps/chosen": -243.58902994791666, + "logps/rejected": -517.3915318080357, + "loss": 0.1532, + "rewards/chosen": 1.6982640160454645, + "rewards/margins": 9.038833648439438, + "rewards/rejected": -7.340569632393973, + "step": 592 + }, + { + "epoch": 0.21891006414101796, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.9812021044022e-06, + "logits/chosen": 1804773649.0666666, + "logits/rejected": 2078883358.1176472, + "logps/chosen": -246.70431315104167, + "logps/rejected": -248.9306640625, + "loss": 0.2012, + "rewards/chosen": 0.893220329284668, + "rewards/margins": 5.268978332070744, + "rewards/rejected": -4.375758002786076, + "step": 593 + }, + { + "epoch": 0.21927922107886116, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 8.97763883445673e-06, + "logits/chosen": 1296615168.0, + "logits/rejected": 1788024064.0, + "logps/chosen": -262.5074157714844, + "logps/rejected": -474.1465148925781, + "loss": 0.1508, + "rewards/chosen": 1.1971220970153809, + "rewards/margins": 8.397694110870361, + "rewards/rejected": -7.2005720138549805, + "step": 594 + }, + { + "epoch": 0.21964837801670434, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.97407005354319e-06, + "logits/chosen": 1575275081.142857, + "logits/rejected": 1664285582.2222223, + "logps/chosen": -254.67567661830358, + "logps/rejected": -533.3963758680555, + "loss": 0.0958, + "rewards/chosen": 2.0708652223859514, + "rewards/margins": 10.941850465441506, + "rewards/rejected": -8.870985243055555, + "step": 595 + }, + { + "epoch": 0.22001753495454754, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.970495766606083e-06, + "logits/chosen": 1953710295.5789473, + "logits/rejected": 1281999635.6923077, + "logps/chosen": -267.0317896792763, + "logps/rejected": -533.9694636418269, + "loss": 0.1741, + "rewards/chosen": 1.8573640522203947, + "rewards/margins": 9.819478826484218, + "rewards/rejected": -7.9621147742638225, + "step": 596 + }, + { + "epoch": 0.22038669189239074, + "grad_norm": 14.3125, + "kl": 0.02653789520263672, + "learning_rate": 8.966915978597532e-06, + "logits/chosen": 1988963669.3333333, + "logits/rejected": 2139787023.0588236, + "logps/chosen": -337.67734375, + "logps/rejected": -615.0726102941177, + "loss": 0.1601, + "rewards/chosen": 1.0363204956054688, + "rewards/margins": 11.261036682128907, + "rewards/rejected": -10.224716186523438, + "step": 597 + }, + { + "epoch": 0.22075584883023394, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 8.963330694477295e-06, + "logits/chosen": 1762913621.3333333, + "logits/rejected": 1614848722.8235295, + "logps/chosen": -208.40558268229168, + "logps/rejected": -323.7610868566176, + "loss": 0.1395, + "rewards/chosen": 1.715588633219401, + "rewards/margins": 7.319312944599226, + "rewards/rejected": -5.603724311379826, + "step": 598 + }, + { + "epoch": 0.22112500576807714, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 8.959739919212734e-06, + "logits/chosen": 2276996534.857143, + "logits/rejected": 1976095175.1111112, + "logps/chosen": -346.4030064174107, + "logps/rejected": -468.3058810763889, + "loss": 0.1851, + "rewards/chosen": 0.5459445204053607, + "rewards/margins": 7.559672768153842, + "rewards/rejected": -7.013728247748481, + "step": 599 + }, + { + "epoch": 0.22149416270592034, + "grad_norm": 12.5625, + "kl": 0.043280601501464844, + "learning_rate": 8.956143657778822e-06, + "logits/chosen": 2262869742.9333334, + "logits/rejected": 2173326034.8235292, + "logps/chosen": -280.6311848958333, + "logps/rejected": -345.14973000919116, + "loss": 0.1689, + "rewards/chosen": 1.3801097869873047, + "rewards/margins": 7.53739065282485, + "rewards/rejected": -6.157280865837546, + "step": 600 + }, + { + "epoch": 0.22186331964376355, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.952541915158137e-06, + "logits/chosen": 1803325124.9230769, + "logits/rejected": 2023171125.8947368, + "logps/chosen": -359.3541917067308, + "logps/rejected": -354.11937191611844, + "loss": 0.1633, + "rewards/chosen": 0.8156102987436148, + "rewards/margins": 6.361511145526099, + "rewards/rejected": -5.545900846782484, + "step": 601 + }, + { + "epoch": 0.22223247658160675, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.948934696340842e-06, + "logits/chosen": 1748525738.6666667, + "logits/rejected": 1933833758.1176472, + "logps/chosen": -286.5561848958333, + "logps/rejected": -437.5272575827206, + "loss": 0.1664, + "rewards/chosen": 1.1367459615071616, + "rewards/margins": 7.939574133181105, + "rewards/rejected": -6.802828171673943, + "step": 602 + }, + { + "epoch": 0.22260163351944995, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 8.945322006324698e-06, + "logits/chosen": 2229880459.6363635, + "logits/rejected": 1928157593.6, + "logps/chosen": -254.3050204190341, + "logps/rejected": -391.869189453125, + "loss": 0.2266, + "rewards/chosen": 1.2546343369917436, + "rewards/margins": 6.251790480180221, + "rewards/rejected": -4.997156143188477, + "step": 603 + }, + { + "epoch": 0.22297079045729315, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.941703850115037e-06, + "logits/chosen": 1916993672.5333333, + "logits/rejected": 1948162529.8823528, + "logps/chosen": -296.65638020833336, + "logps/rejected": -609.7596507352941, + "loss": 0.1389, + "rewards/chosen": 1.3807899475097656, + "rewards/margins": 9.322025658102596, + "rewards/rejected": -7.941235710592831, + "step": 604 + }, + { + "epoch": 0.22333994739513635, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 8.938080232724773e-06, + "logits/chosen": 1908749952.0, + "logits/rejected": 1511897088.0, + "logps/chosen": -224.04574584960938, + "logps/rejected": -533.3795166015625, + "loss": 0.1328, + "rewards/chosen": 1.847679853439331, + "rewards/margins": 13.875939130783081, + "rewards/rejected": -12.02825927734375, + "step": 605 + }, + { + "epoch": 0.22370910433297955, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.934451159174377e-06, + "logits/chosen": 1820361216.0, + "logits/rejected": 1917167872.0, + "logps/chosen": -344.0973205566406, + "logps/rejected": -413.4035949707031, + "loss": 0.1617, + "rewards/chosen": 0.9643862843513489, + "rewards/margins": 7.9102190136909485, + "rewards/rejected": -6.9458327293396, + "step": 606 + }, + { + "epoch": 0.22407826127082275, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 8.930816634491887e-06, + "logits/chosen": 2328598528.0, + "logits/rejected": 1740394837.3333333, + "logps/chosen": -286.82571847098217, + "logps/rejected": -474.1115451388889, + "loss": 0.1548, + "rewards/chosen": 0.9134588922773089, + "rewards/margins": 7.879216474200052, + "rewards/rejected": -6.965757581922743, + "step": 607 + }, + { + "epoch": 0.22444741820866596, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.927176663712892e-06, + "logits/chosen": 1838034670.9333334, + "logits/rejected": 1583910189.1764705, + "logps/chosen": -304.90810546875, + "logps/rejected": -457.36804917279414, + "loss": 0.1687, + "rewards/chosen": 1.0260272343953452, + "rewards/margins": 8.023338994792864, + "rewards/rejected": -6.997311760397518, + "step": 608 + }, + { + "epoch": 0.22481657514650916, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.923531251880524e-06, + "logits/chosen": 1449696870.4, + "logits/rejected": 1364100276.7058823, + "logps/chosen": -293.06865234375, + "logps/rejected": -417.5852481617647, + "loss": 0.1646, + "rewards/chosen": 0.9469762166341146, + "rewards/margins": 8.179127472522213, + "rewards/rejected": -7.232151255888097, + "step": 609 + }, + { + "epoch": 0.22518573208435236, + "grad_norm": 12.0625, + "kl": 0.6632614135742188, + "learning_rate": 8.919880404045452e-06, + "logits/chosen": 2320912128.0, + "logits/rejected": 2041535488.0, + "logps/chosen": -278.5690002441406, + "logps/rejected": -456.20611572265625, + "loss": 0.152, + "rewards/chosen": 1.4269788265228271, + "rewards/margins": 7.8543479442596436, + "rewards/rejected": -6.427369117736816, + "step": 610 + }, + { + "epoch": 0.22555488902219556, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.916224125265883e-06, + "logits/chosen": 1921330744.8888888, + "logits/rejected": 2043888786.2857144, + "logps/chosen": -284.8400065104167, + "logps/rejected": -496.33394949776783, + "loss": 0.2348, + "rewards/chosen": 0.5135703086853027, + "rewards/margins": 9.602342128753662, + "rewards/rejected": -9.08877182006836, + "step": 611 + }, + { + "epoch": 0.22592404596003876, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 8.912562420607545e-06, + "logits/chosen": 1664036608.0, + "logits/rejected": 1616830464.0, + "logps/chosen": -275.5440979003906, + "logps/rejected": -489.3852844238281, + "loss": 0.1934, + "rewards/chosen": 1.3444292545318604, + "rewards/margins": 8.888211488723755, + "rewards/rejected": -7.5437822341918945, + "step": 612 + }, + { + "epoch": 0.22629320289788196, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 8.90889529514368e-06, + "logits/chosen": 1697856512.0, + "logits/rejected": 1439162823.1111112, + "logps/chosen": -252.26491001674108, + "logps/rejected": -410.28325737847223, + "loss": 0.1397, + "rewards/chosen": 1.3609981536865234, + "rewards/margins": 8.333114412095812, + "rewards/rejected": -6.972116258409288, + "step": 613 + }, + { + "epoch": 0.22666235983572516, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.905222753955045e-06, + "logits/chosen": 1569975637.3333333, + "logits/rejected": 1470964589.7142856, + "logps/chosen": -230.00230577256946, + "logps/rejected": -439.30161830357144, + "loss": 0.2246, + "rewards/chosen": 1.1193210813734267, + "rewards/margins": 7.620468215336875, + "rewards/rejected": -6.501147133963449, + "step": 614 + }, + { + "epoch": 0.22703151677356836, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.901544802129903e-06, + "logits/chosen": 1395512500.7058823, + "logits/rejected": 1444220245.3333333, + "logps/chosen": -262.44646139705884, + "logps/rejected": -499.1049479166667, + "loss": 0.1793, + "rewards/chosen": 1.0909907397101908, + "rewards/margins": 8.905394019332586, + "rewards/rejected": -7.814403279622396, + "step": 615 + }, + { + "epoch": 0.22740067371141157, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.897861444764004e-06, + "logits/chosen": 1501180177.0666666, + "logits/rejected": 1689819617.8823528, + "logps/chosen": -319.46611328125, + "logps/rejected": -375.2960994944853, + "loss": 0.1336, + "rewards/chosen": 1.53377685546875, + "rewards/margins": 7.811287105784697, + "rewards/rejected": -6.277510250315947, + "step": 616 + }, + { + "epoch": 0.22776983064925477, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.894172686960594e-06, + "logits/chosen": 1554227561.4117646, + "logits/rejected": 1765889501.8666666, + "logps/chosen": -296.76809512867646, + "logps/rejected": -412.38128255208335, + "loss": 0.1672, + "rewards/chosen": 1.4057897679946, + "rewards/margins": 7.17333381503236, + "rewards/rejected": -5.76754404703776, + "step": 617 + }, + { + "epoch": 0.22813898758709797, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 8.890478533830403e-06, + "logits/chosen": 1935020152.4705882, + "logits/rejected": 1634824192.0, + "logps/chosen": -277.04187729779414, + "logps/rejected": -516.9689778645833, + "loss": 0.1826, + "rewards/chosen": 0.9207604352165671, + "rewards/margins": 8.044114520503026, + "rewards/rejected": -7.123354085286459, + "step": 618 + }, + { + "epoch": 0.22850814452494117, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.886778990491632e-06, + "logits/chosen": 1908239246.2222223, + "logits/rejected": 1755994258.2857144, + "logps/chosen": -305.37470160590277, + "logps/rejected": -416.4071568080357, + "loss": 0.1867, + "rewards/chosen": 1.1695077684190538, + "rewards/margins": 7.7874075268942216, + "rewards/rejected": -6.6178997584751675, + "step": 619 + }, + { + "epoch": 0.22887730146278437, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 8.883074062069948e-06, + "logits/chosen": 2015700582.4, + "logits/rejected": 1793307927.2727273, + "logps/chosen": -388.994775390625, + "logps/rejected": -484.0183771306818, + "loss": 0.1075, + "rewards/chosen": 1.7233427047729493, + "rewards/margins": 9.696314534274014, + "rewards/rejected": -7.972971829501065, + "step": 620 + }, + { + "epoch": 0.22924645840062757, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.879363753698487e-06, + "logits/chosen": 1574212096.0, + "logits/rejected": 1616455552.0, + "logps/chosen": -246.962890625, + "logps/rejected": -468.5999450683594, + "loss": 0.1638, + "rewards/chosen": 1.1764740943908691, + "rewards/margins": 8.70206594467163, + "rewards/rejected": -7.525591850280762, + "step": 621 + }, + { + "epoch": 0.22961561533847077, + "grad_norm": 12.625, + "kl": 0.10536384582519531, + "learning_rate": 8.875648070517832e-06, + "logits/chosen": 2642551552.0, + "logits/rejected": 1582042368.0, + "logps/chosen": -285.5059509277344, + "logps/rejected": -482.621826171875, + "loss": 0.1332, + "rewards/chosen": 1.4250067472457886, + "rewards/margins": 8.993879675865173, + "rewards/rejected": -7.568872928619385, + "step": 622 + }, + { + "epoch": 0.22998477227631398, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.871927017676013e-06, + "logits/chosen": 2179116919.4666667, + "logits/rejected": 1518672112.9411764, + "logps/chosen": -312.80263671875, + "logps/rejected": -440.6478056066176, + "loss": 0.1738, + "rewards/chosen": 1.0501731236775715, + "rewards/margins": 8.328777967714796, + "rewards/rejected": -7.278604844037225, + "step": 623 + }, + { + "epoch": 0.23035392921415718, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.868200600328505e-06, + "logits/chosen": 2086838994.8235295, + "logits/rejected": 1449286314.6666667, + "logps/chosen": -246.6076229319853, + "logps/rejected": -404.45647786458335, + "loss": 0.1723, + "rewards/chosen": 1.2777229757869946, + "rewards/margins": 7.690595798866422, + "rewards/rejected": -6.412872823079427, + "step": 624 + }, + { + "epoch": 0.23072308615200038, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.864468823638211e-06, + "logits/chosen": 1383989032.4210527, + "logits/rejected": 1829815689.8461537, + "logps/chosen": -226.97327302631578, + "logps/rejected": -455.8234675480769, + "loss": 0.2404, + "rewards/chosen": 0.7630898827000668, + "rewards/margins": 8.1317017319714, + "rewards/rejected": -7.368611849271334, + "step": 625 + }, + { + "epoch": 0.23109224308984358, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.860731692775459e-06, + "logits/chosen": 1965130988.3076923, + "logits/rejected": 2316218799.1578946, + "logps/chosen": -301.9336688701923, + "logps/rejected": -551.4968647203947, + "loss": 0.1547, + "rewards/chosen": 1.0046693361722505, + "rewards/margins": 9.682487232965014, + "rewards/rejected": -8.677817896792764, + "step": 626 + }, + { + "epoch": 0.23146140002768678, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 8.856989212917994e-06, + "logits/chosen": 2187031893.3333335, + "logits/rejected": 2074544686.5454545, + "logps/chosen": -259.6909644717262, + "logps/rejected": -592.6592240767045, + "loss": 0.2488, + "rewards/chosen": 0.6828241348266602, + "rewards/margins": 8.849679686806418, + "rewards/rejected": -8.166855551979758, + "step": 627 + }, + { + "epoch": 0.23183055696552998, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.853241389250981e-06, + "logits/chosen": 1737659847.1111112, + "logits/rejected": 1836216905.142857, + "logps/chosen": -252.59597439236111, + "logps/rejected": -586.3655133928571, + "loss": 0.2167, + "rewards/chosen": 0.8531558248731825, + "rewards/margins": 10.330153200361464, + "rewards/rejected": -9.476997375488281, + "step": 628 + }, + { + "epoch": 0.23219971390337318, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 8.849488226966975e-06, + "logits/chosen": 1546647893.3333333, + "logits/rejected": 1753084928.0, + "logps/chosen": -225.14888509114584, + "logps/rejected": -555.550537109375, + "loss": 0.1556, + "rewards/chosen": 0.7913695971171061, + "rewards/margins": 9.401956144968668, + "rewards/rejected": -8.610586547851563, + "step": 629 + }, + { + "epoch": 0.23256887084121639, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.84572973126594e-06, + "logits/chosen": 2628641353.142857, + "logits/rejected": 1986980522.6666667, + "logps/chosen": -239.04148646763392, + "logps/rejected": -448.5937771267361, + "loss": 0.1919, + "rewards/chosen": 0.5066382203783307, + "rewards/margins": 7.032596811415657, + "rewards/rejected": -6.525958591037327, + "step": 630 + }, + { + "epoch": 0.2329380277790596, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.84196590735522e-06, + "logits/chosen": 1930185344.0, + "logits/rejected": 1598667520.0, + "logps/chosen": -209.1876678466797, + "logps/rejected": -569.3306274414062, + "loss": 0.1554, + "rewards/chosen": 1.5321828126907349, + "rewards/margins": 9.476721405982971, + "rewards/rejected": -7.944538593292236, + "step": 631 + }, + { + "epoch": 0.23330718471690276, + "grad_norm": 12.375, + "kl": 0.7686805725097656, + "learning_rate": 8.83819676044955e-06, + "logits/chosen": 1234525485.1764705, + "logits/rejected": 1252758323.2, + "logps/chosen": -250.5320255055147, + "logps/rejected": -484.73046875, + "loss": 0.1991, + "rewards/chosen": 1.261500639073989, + "rewards/margins": 9.81878620222503, + "rewards/rejected": -8.557285563151042, + "step": 632 + }, + { + "epoch": 0.23367634165474596, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.83442229577103e-06, + "logits/chosen": 1887933056.0, + "logits/rejected": 1975230720.0, + "logps/chosen": -312.4565124511719, + "logps/rejected": -383.0641174316406, + "loss": 0.1796, + "rewards/chosen": 1.2086622714996338, + "rewards/margins": 6.871243715286255, + "rewards/rejected": -5.662581443786621, + "step": 633 + }, + { + "epoch": 0.23404549859258916, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.830642518549135e-06, + "logits/chosen": 2043774429.8666666, + "logits/rejected": 1816917895.5294118, + "logps/chosen": -249.179052734375, + "logps/rejected": -465.77404067095586, + "loss": 0.1461, + "rewards/chosen": 1.4289903004964193, + "rewards/margins": 9.38156239378686, + "rewards/rejected": -7.952572093290441, + "step": 634 + }, + { + "epoch": 0.23441465553043236, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.8268574340207e-06, + "logits/chosen": 1576409770.6666667, + "logits/rejected": 1504268288.0, + "logps/chosen": -264.4895833333333, + "logps/rejected": -492.52613740808823, + "loss": 0.1721, + "rewards/chosen": 0.9085037867228191, + "rewards/margins": 8.447358049130907, + "rewards/rejected": -7.538854262408088, + "step": 635 + }, + { + "epoch": 0.23478381246827557, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 8.823067047429908e-06, + "logits/chosen": 2060052747.1304348, + "logits/rejected": 2217257187.5555553, + "logps/chosen": -236.2571968410326, + "logps/rejected": -437.77940538194446, + "loss": 0.2949, + "rewards/chosen": 0.625471654145614, + "rewards/margins": 7.827743857379121, + "rewards/rejected": -7.202272203233507, + "step": 636 + }, + { + "epoch": 0.23515296940611877, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.819271364028294e-06, + "logits/chosen": 1693727232.0, + "logits/rejected": 1572822912.0, + "logps/chosen": -218.5090789794922, + "logps/rejected": -409.12890625, + "loss": 0.1736, + "rewards/chosen": 1.1553616523742676, + "rewards/margins": 7.355856418609619, + "rewards/rejected": -6.200494766235352, + "step": 637 + }, + { + "epoch": 0.23552212634396197, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.815470389074727e-06, + "logits/chosen": 1554153231.0588236, + "logits/rejected": 1720617233.0666666, + "logps/chosen": -238.52988568474265, + "logps/rejected": -495.54127604166666, + "loss": 0.1665, + "rewards/chosen": 1.2009148317224838, + "rewards/margins": 8.520997432633942, + "rewards/rejected": -7.320082600911459, + "step": 638 + }, + { + "epoch": 0.23589128328180517, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.811664127835412e-06, + "logits/chosen": 2188843008.0, + "logits/rejected": 1738923008.0, + "logps/chosen": -357.3623352050781, + "logps/rejected": -441.7335205078125, + "loss": 0.1967, + "rewards/chosen": 0.7807996869087219, + "rewards/margins": 7.317703068256378, + "rewards/rejected": -6.536903381347656, + "step": 639 + }, + { + "epoch": 0.23626044021964837, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.807852585583876e-06, + "logits/chosen": 1713496436.3636363, + "logits/rejected": 1442547712.0, + "logps/chosen": -291.25277432528407, + "logps/rejected": -471.3265904017857, + "loss": 0.1525, + "rewards/chosen": 0.6411446658047762, + "rewards/margins": 6.969198458122484, + "rewards/rejected": -6.328053792317708, + "step": 640 + }, + { + "epoch": 0.23662959715749157, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.80403576760096e-06, + "logits/chosen": 1652766993.0666666, + "logits/rejected": 2220258484.7058825, + "logps/chosen": -329.8064453125, + "logps/rejected": -502.8772403492647, + "loss": 0.1927, + "rewards/chosen": 0.5765282313028971, + "rewards/margins": 8.306226861243154, + "rewards/rejected": -7.729698629940257, + "step": 641 + }, + { + "epoch": 0.23699875409533477, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.800213679174818e-06, + "logits/chosen": 1691347968.0, + "logits/rejected": 1919845469.090909, + "logps/chosen": -277.1225341796875, + "logps/rejected": -481.07040127840907, + "loss": 0.1058, + "rewards/chosen": 1.1288966178894042, + "rewards/margins": 7.3966876723549575, + "rewards/rejected": -6.267791054465554, + "step": 642 + }, + { + "epoch": 0.23736791103317798, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.796386325600906e-06, + "logits/chosen": 1725575168.0, + "logits/rejected": 1640679531.7894738, + "logps/chosen": -303.6401179387019, + "logps/rejected": -466.49624794407896, + "loss": 0.1611, + "rewards/chosen": 0.9578496492826022, + "rewards/margins": 7.988468417271911, + "rewards/rejected": -7.030618767989309, + "step": 643 + }, + { + "epoch": 0.23773706797102118, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.79255371218197e-06, + "logits/chosen": 1507335561.8461537, + "logits/rejected": 2210447791.1578946, + "logps/chosen": -298.8276554987981, + "logps/rejected": -463.1732627467105, + "loss": 0.1151, + "rewards/chosen": 1.9234768794133112, + "rewards/margins": 9.787994091327374, + "rewards/rejected": -7.8645172119140625, + "step": 644 + }, + { + "epoch": 0.23810622490886438, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.78871584422805e-06, + "logits/chosen": 2378346203.428571, + "logits/rejected": 2268684288.0, + "logps/chosen": -253.41966029575892, + "logps/rejected": -402.89344618055554, + "loss": 0.1315, + "rewards/chosen": 1.354564939226423, + "rewards/margins": 7.773544584001813, + "rewards/rejected": -6.418979644775391, + "step": 645 + }, + { + "epoch": 0.23847538184670758, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.784872727056464e-06, + "logits/chosen": 2145537408.0, + "logits/rejected": 1621345920.0, + "logps/chosen": -317.9798583984375, + "logps/rejected": -487.2658386230469, + "loss": 0.1732, + "rewards/chosen": 1.084149718284607, + "rewards/margins": 8.736176371574402, + "rewards/rejected": -7.652026653289795, + "step": 646 + }, + { + "epoch": 0.23884453878455078, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.781024365991802e-06, + "logits/chosen": 2100527344.9411764, + "logits/rejected": 2055482299.7333333, + "logps/chosen": -355.20519301470586, + "logps/rejected": -447.6387044270833, + "loss": 0.1578, + "rewards/chosen": 1.2088603973388672, + "rewards/margins": 9.425849533081054, + "rewards/rejected": -8.216989135742187, + "step": 647 + }, + { + "epoch": 0.23921369572239398, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.777170766365916e-06, + "logits/chosen": 1895837549.7142856, + "logits/rejected": 2008193479.1111112, + "logps/chosen": -380.7836216517857, + "logps/rejected": -497.91596137152777, + "loss": 0.1857, + "rewards/chosen": 0.7978768348693848, + "rewards/margins": 9.084834469689262, + "rewards/rejected": -8.286957634819878, + "step": 648 + }, + { + "epoch": 0.23958285266023718, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.773311933517923e-06, + "logits/chosen": 2225705398.857143, + "logits/rejected": 2162490936.888889, + "logps/chosen": -241.17766462053572, + "logps/rejected": -530.4954427083334, + "loss": 0.1787, + "rewards/chosen": 1.0899677276611328, + "rewards/margins": 7.266934076944987, + "rewards/rejected": -6.1769663492838545, + "step": 649 + }, + { + "epoch": 0.23995200959808038, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 8.769447872794185e-06, + "logits/chosen": 1980485416.4210527, + "logits/rejected": 1419477307.0769231, + "logps/chosen": -327.4742495888158, + "logps/rejected": -478.44591346153845, + "loss": 0.2228, + "rewards/chosen": 0.754159224660773, + "rewards/margins": 7.879342330129523, + "rewards/rejected": -7.12518310546875, + "step": 650 + }, + { + "epoch": 0.24032116653592359, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.765578589548309e-06, + "logits/chosen": 1383262208.0, + "logits/rejected": 1900146688.0, + "logps/chosen": -229.96515764508928, + "logps/rejected": -408.4047580295139, + "loss": 0.1647, + "rewards/chosen": 1.2261815752301897, + "rewards/margins": 7.80615500798301, + "rewards/rejected": -6.579973432752821, + "step": 651 + }, + { + "epoch": 0.2406903234737668, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 8.76170408914114e-06, + "logits/chosen": 1413862478.7692308, + "logits/rejected": 1256695592.4210527, + "logps/chosen": -186.7420372596154, + "logps/rejected": -343.00375205592104, + "loss": 0.1453, + "rewards/chosen": 1.4604656512920673, + "rewards/margins": 7.380501565662955, + "rewards/rejected": -5.920035914370888, + "step": 652 + }, + { + "epoch": 0.24105948041161, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.757824376940748e-06, + "logits/chosen": 2001367040.0, + "logits/rejected": 1986912460.8, + "logps/chosen": -318.39524332682294, + "logps/rejected": -563.444140625, + "loss": 0.1138, + "rewards/chosen": 1.3615036010742188, + "rewards/margins": 9.812646484375, + "rewards/rejected": -8.451142883300781, + "step": 653 + }, + { + "epoch": 0.2414286373494532, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.75393945832242e-06, + "logits/chosen": 1493627426.1333334, + "logits/rejected": 1642464436.7058823, + "logps/chosen": -250.84716796875, + "logps/rejected": -474.68528837316177, + "loss": 0.1903, + "rewards/chosen": 0.8015295664469401, + "rewards/margins": 7.510263966579063, + "rewards/rejected": -6.708734400132123, + "step": 654 + }, + { + "epoch": 0.2417977942872964, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.75004933866867e-06, + "logits/chosen": 1490150784.0, + "logits/rejected": 1442845312.0, + "logps/chosen": -263.4288635253906, + "logps/rejected": -453.8269958496094, + "loss": 0.2033, + "rewards/chosen": 0.9110702872276306, + "rewards/margins": 7.131710469722748, + "rewards/rejected": -6.220640182495117, + "step": 655 + }, + { + "epoch": 0.2421669512251396, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 8.746154023369204e-06, + "logits/chosen": 1828971724.8, + "logits/rejected": 1851741967.0588236, + "logps/chosen": -291.86510416666664, + "logps/rejected": -472.20582490808823, + "loss": 0.1427, + "rewards/chosen": 1.4730523427327473, + "rewards/margins": 7.773016349942076, + "rewards/rejected": -6.299964007209329, + "step": 656 + }, + { + "epoch": 0.2425361081629828, + "grad_norm": 14.4375, + "kl": 0.06703472137451172, + "learning_rate": 8.742253517820933e-06, + "logits/chosen": 2721785012.7058825, + "logits/rejected": 2704604091.733333, + "logps/chosen": -299.48977481617646, + "logps/rejected": -401.200390625, + "loss": 0.2385, + "rewards/chosen": 0.39396050397087545, + "rewards/margins": 7.173337741926605, + "rewards/rejected": -6.779377237955729, + "step": 657 + }, + { + "epoch": 0.242905265100826, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.738347827427957e-06, + "logits/chosen": 1750829056.0, + "logits/rejected": 2413437747.2, + "logps/chosen": -276.5531364889706, + "logps/rejected": -638.8397135416667, + "loss": 0.2042, + "rewards/chosen": 0.9142261392929975, + "rewards/margins": 8.338099023407581, + "rewards/rejected": -7.423872884114584, + "step": 658 + }, + { + "epoch": 0.2432744220386692, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.734436957601564e-06, + "logits/chosen": 1813715259.0769231, + "logits/rejected": 2006152677.0526316, + "logps/chosen": -329.1476862980769, + "logps/rejected": -546.96875, + "loss": 0.1591, + "rewards/chosen": 1.241298822256235, + "rewards/margins": 8.937934018339705, + "rewards/rejected": -7.69663519608347, + "step": 659 + }, + { + "epoch": 0.2436435789765124, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 8.730520913760209e-06, + "logits/chosen": 1921491968.0, + "logits/rejected": 2296602168.888889, + "logps/chosen": -229.15150669642858, + "logps/rejected": -682.74169921875, + "loss": 0.1629, + "rewards/chosen": 0.8116139684404645, + "rewards/margins": 11.271374846261645, + "rewards/rejected": -10.45976087782118, + "step": 660 + }, + { + "epoch": 0.2440127359143556, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.726599701329526e-06, + "logits/chosen": 1634050529.8823528, + "logits/rejected": 1924187613.8666666, + "logps/chosen": -319.8782169117647, + "logps/rejected": -478.51100260416666, + "loss": 0.2059, + "rewards/chosen": 1.0886991164263558, + "rewards/margins": 8.558125080781824, + "rewards/rejected": -7.469425964355469, + "step": 661 + }, + { + "epoch": 0.2443818928521988, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.722673325742302e-06, + "logits/chosen": 1870096699.0769231, + "logits/rejected": 1707136377.2631578, + "logps/chosen": -353.32895132211536, + "logps/rejected": -560.3124486019736, + "loss": 0.1161, + "rewards/chosen": 1.4291331951434796, + "rewards/margins": 10.80392920922654, + "rewards/rejected": -9.37479601408306, + "step": 662 + }, + { + "epoch": 0.244751049790042, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.718741792438481e-06, + "logits/chosen": 1769429922.909091, + "logits/rejected": 2356020760.3809524, + "logps/chosen": -244.13831676136363, + "logps/rejected": -530.2338169642857, + "loss": 0.1267, + "rewards/chosen": 0.8344071128151633, + "rewards/margins": 8.404776288317395, + "rewards/rejected": -7.570369175502232, + "step": 663 + }, + { + "epoch": 0.2451202067278852, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.714805106865151e-06, + "logits/chosen": 2116740581.0526316, + "logits/rejected": 2088702109.5384614, + "logps/chosen": -357.04273745888156, + "logps/rejected": -397.4817457932692, + "loss": 0.2385, + "rewards/chosen": 0.6662938469334653, + "rewards/margins": 6.473946247023609, + "rewards/rejected": -5.807652400090144, + "step": 664 + }, + { + "epoch": 0.2454893636657284, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 8.710863274476544e-06, + "logits/chosen": 1738030501.6470587, + "logits/rejected": 1138794769.0666666, + "logps/chosen": -251.41595818014707, + "logps/rejected": -379.57652994791664, + "loss": 0.1653, + "rewards/chosen": 1.2679084329044117, + "rewards/margins": 7.372393499636182, + "rewards/rejected": -6.104485066731771, + "step": 665 + }, + { + "epoch": 0.2458585206035716, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.706916300734017e-06, + "logits/chosen": 1498765767.1111112, + "logits/rejected": 1142872795.4285715, + "logps/chosen": -239.326416015625, + "logps/rejected": -508.318603515625, + "loss": 0.1551, + "rewards/chosen": 1.605823940700955, + "rewards/margins": 9.487242441328744, + "rewards/rejected": -7.88141850062779, + "step": 666 + }, + { + "epoch": 0.2462276775414148, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.70296419110605e-06, + "logits/chosen": 2508578093.1764708, + "logits/rejected": 1667816925.8666666, + "logps/chosen": -272.29733455882354, + "logps/rejected": -500.83701171875, + "loss": 0.2335, + "rewards/chosen": 0.5442578371833352, + "rewards/margins": 9.282291894800524, + "rewards/rejected": -8.738034057617188, + "step": 667 + }, + { + "epoch": 0.24659683447925798, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.69900695106824e-06, + "logits/chosen": 1619647488.0, + "logits/rejected": 1496164352.0, + "logps/chosen": -242.96864536830358, + "logps/rejected": -359.64385308159723, + "loss": 0.1656, + "rewards/chosen": 1.025043146950858, + "rewards/margins": 7.142156896137056, + "rewards/rejected": -6.117113749186198, + "step": 668 + }, + { + "epoch": 0.24696599141710118, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 8.695044586103297e-06, + "logits/chosen": 1399412872.5333333, + "logits/rejected": 1689190159.0588236, + "logps/chosen": -256.296240234375, + "logps/rejected": -439.26907169117646, + "loss": 0.2092, + "rewards/chosen": 0.8171581268310547, + "rewards/margins": 8.446997855691349, + "rewards/rejected": -7.629839728860294, + "step": 669 + }, + { + "epoch": 0.24733514835494438, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.691077101701024e-06, + "logits/chosen": 1973901994.6666667, + "logits/rejected": 2083195465.142857, + "logps/chosen": -278.1073947482639, + "logps/rejected": -780.9238978794643, + "loss": 0.1737, + "rewards/chosen": 1.1396573384602864, + "rewards/margins": 31.758148375011626, + "rewards/rejected": -30.61849103655134, + "step": 670 + }, + { + "epoch": 0.24770430529278759, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.68710450335832e-06, + "logits/chosen": 1684105938.8235295, + "logits/rejected": 1960626585.6, + "logps/chosen": -240.85482249540442, + "logps/rejected": -530.4730794270833, + "loss": 0.1265, + "rewards/chosen": 1.709817549761604, + "rewards/margins": 11.87165226655848, + "rewards/rejected": -10.161834716796875, + "step": 671 + }, + { + "epoch": 0.2480734622306308, + "grad_norm": 11.625, + "kl": 0.2384204864501953, + "learning_rate": 8.683126796579173e-06, + "logits/chosen": 1599741659.4285715, + "logits/rejected": 1817046357.3333333, + "logps/chosen": -263.88612583705356, + "logps/rejected": -440.82823350694446, + "loss": 0.1585, + "rewards/chosen": 1.2255584171840124, + "rewards/margins": 8.73992032853384, + "rewards/rejected": -7.514361911349827, + "step": 672 + }, + { + "epoch": 0.248442619168474, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 8.679143986874643e-06, + "logits/chosen": 2451321304.6153846, + "logits/rejected": 1563392538.9473684, + "logps/chosen": -283.40542367788464, + "logps/rejected": -559.964689555921, + "loss": 0.1184, + "rewards/chosen": 1.419213661780724, + "rewards/margins": 11.654226318544703, + "rewards/rejected": -10.23501265676398, + "step": 673 + }, + { + "epoch": 0.2488117761063172, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.67515607976286e-06, + "logits/chosen": 1489568914.2857144, + "logits/rejected": 1679206855.1111112, + "logps/chosen": -302.54150390625, + "logps/rejected": -484.31260850694446, + "loss": 0.1522, + "rewards/chosen": 1.1830558776855469, + "rewards/margins": 9.90100818210178, + "rewards/rejected": -8.717952304416233, + "step": 674 + }, + { + "epoch": 0.2491809330441604, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.671163080769025e-06, + "logits/chosen": 1524868962.4615386, + "logits/rejected": 1854182885.0526316, + "logps/chosen": -251.41280423677884, + "logps/rejected": -561.5582853618421, + "loss": 0.1313, + "rewards/chosen": 1.2134993626521184, + "rewards/margins": 8.91554821647613, + "rewards/rejected": -7.702048853824013, + "step": 675 + }, + { + "epoch": 0.2495500899820036, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.66716499542538e-06, + "logits/chosen": 1884363922.2857144, + "logits/rejected": 2004013738.6666667, + "logps/chosen": -271.65733119419644, + "logps/rejected": -483.8679470486111, + "loss": 0.1577, + "rewards/chosen": 0.7499054500034877, + "rewards/margins": 7.706628133380224, + "rewards/rejected": -6.956722683376736, + "step": 676 + }, + { + "epoch": 0.2499192469198468, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.663161829271226e-06, + "logits/chosen": 1535478390.1538463, + "logits/rejected": 1708894315.7894738, + "logps/chosen": -280.3178898737981, + "logps/rejected": -537.8037109375, + "loss": 0.1748, + "rewards/chosen": 0.32710827313936675, + "rewards/margins": 9.31843726548106, + "rewards/rejected": -8.991328992341694, + "step": 677 + }, + { + "epoch": 0.25028840385769, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.659153587852895e-06, + "logits/chosen": 2179699200.0, + "logits/rejected": 1627045376.0, + "logps/chosen": -295.35888671875, + "logps/rejected": -551.3049926757812, + "loss": 0.173, + "rewards/chosen": 0.8541174530982971, + "rewards/margins": 9.764796316623688, + "rewards/rejected": -8.91067886352539, + "step": 678 + }, + { + "epoch": 0.25028840385769, + "eval_kl": 0.0, + "eval_logits/chosen": 3454141983.84689, + "eval_logits/rejected": 3476122779.151515, + "eval_logps/chosen": -293.949461722488, + "eval_logps/rejected": -474.12939664502164, + "eval_loss": 0.1509229987859726, + "eval_rewards/chosen": 1.3065299440228768, + "eval_rewards/margins": 8.946426221248256, + "eval_rewards/rejected": -7.639896277225379, + "eval_runtime": 109.3996, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 0.503, + "step": 678 + }, + { + "epoch": 0.2506575607955332, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.65514027672376e-06, + "logits/chosen": 1989280836.2666667, + "logits/rejected": 2138981797.6470587, + "logps/chosen": -367.14921875, + "logps/rejected": -477.27159926470586, + "loss": 0.1594, + "rewards/chosen": 1.8254585266113281, + "rewards/margins": 9.436805500703699, + "rewards/rejected": -7.611346974092371, + "step": 679 + }, + { + "epoch": 0.2510267177333764, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.651121901444208e-06, + "logits/chosen": 1722287581.8666666, + "logits/rejected": 1801929908.7058823, + "logps/chosen": -292.0982421875, + "logps/rejected": -464.30261948529414, + "loss": 0.1577, + "rewards/chosen": 1.1125287373860677, + "rewards/margins": 8.651902696198109, + "rewards/rejected": -7.539373958812041, + "step": 680 + }, + { + "epoch": 0.2513958746712196, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.64709846758165e-06, + "logits/chosen": 1762754852.5714285, + "logits/rejected": 2200898218.6666665, + "logps/chosen": -310.33182198660717, + "logps/rejected": -442.51963975694446, + "loss": 0.1514, + "rewards/chosen": 1.3102513722011022, + "rewards/margins": 8.338853124588255, + "rewards/rejected": -7.028601752387153, + "step": 681 + }, + { + "epoch": 0.25176503160906283, + "grad_norm": 12.875, + "kl": 0.11469078063964844, + "learning_rate": 8.643069980710502e-06, + "logits/chosen": 2413391872.0, + "logits/rejected": 1295672661.3333333, + "logps/chosen": -351.44215303308823, + "logps/rejected": -364.004296875, + "loss": 0.1634, + "rewards/chosen": 1.5754811904009651, + "rewards/margins": 9.508459502575445, + "rewards/rejected": -7.9329783121744795, + "step": 682 + }, + { + "epoch": 0.252134188546906, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.639036446412177e-06, + "logits/chosen": 2214952448.0, + "logits/rejected": 1822320000.0, + "logps/chosen": -340.1465148925781, + "logps/rejected": -526.520751953125, + "loss": 0.1663, + "rewards/chosen": 1.3681427240371704, + "rewards/margins": 10.997568726539612, + "rewards/rejected": -9.629426002502441, + "step": 683 + }, + { + "epoch": 0.25250334548474923, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 8.634997870275092e-06, + "logits/chosen": 1473706276.5714285, + "logits/rejected": 1904057344.0, + "logps/chosen": -327.3775111607143, + "logps/rejected": -511.8525390625, + "loss": 0.1452, + "rewards/chosen": 1.4785426003592355, + "rewards/margins": 9.379565617394826, + "rewards/rejected": -7.90102301703559, + "step": 684 + }, + { + "epoch": 0.2528725024225924, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.63095425789464e-06, + "logits/chosen": 2165988171.2941175, + "logits/rejected": 1829965277.8666666, + "logps/chosen": -330.62795840992646, + "logps/rejected": -520.0130208333334, + "loss": 0.1412, + "rewards/chosen": 1.8803940941305721, + "rewards/margins": 8.31009074940401, + "rewards/rejected": -6.429696655273437, + "step": 685 + }, + { + "epoch": 0.2532416593604356, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.62690561487319e-06, + "logits/chosen": 2282603520.0, + "logits/rejected": 2241282304.0, + "logps/chosen": -277.59735107421875, + "logps/rejected": -464.32916259765625, + "loss": 0.1615, + "rewards/chosen": 1.0696661472320557, + "rewards/margins": 7.556753396987915, + "rewards/rejected": -6.487087249755859, + "step": 686 + }, + { + "epoch": 0.2536108162982788, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 8.622851946820094e-06, + "logits/chosen": 1511956359.5294118, + "logits/rejected": 1509084501.3333333, + "logps/chosen": -268.2816521139706, + "logps/rejected": -497.06731770833335, + "loss": 0.1894, + "rewards/chosen": 1.186398898853975, + "rewards/margins": 8.537666395598768, + "rewards/rejected": -7.351267496744792, + "step": 687 + }, + { + "epoch": 0.253979973236122, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 8.618793259351655e-06, + "logits/chosen": 1786399232.0, + "logits/rejected": 1905487462.4, + "logps/chosen": -283.63295491536456, + "logps/rejected": -435.180615234375, + "loss": 0.1252, + "rewards/chosen": 1.3028504848480225, + "rewards/margins": 7.723425436019897, + "rewards/rejected": -6.420574951171875, + "step": 688 + }, + { + "epoch": 0.2543491301739652, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.614729558091129e-06, + "logits/chosen": 2401594187.2941175, + "logits/rejected": 1965033062.4, + "logps/chosen": -238.60291245404412, + "logps/rejected": -444.8243815104167, + "loss": 0.185, + "rewards/chosen": 1.054581361658433, + "rewards/margins": 10.65605027441885, + "rewards/rejected": -9.601468912760417, + "step": 689 + }, + { + "epoch": 0.2547182871118084, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.610660848668723e-06, + "logits/chosen": 1956186989.7142856, + "logits/rejected": 2602589980.4444447, + "logps/chosen": -284.50992257254467, + "logps/rejected": -441.7181803385417, + "loss": 0.1837, + "rewards/chosen": 0.5967070715767997, + "rewards/margins": 8.010884860205271, + "rewards/rejected": -7.414177788628472, + "step": 690 + }, + { + "epoch": 0.2550874440496516, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.606587136721586e-06, + "logits/chosen": 1773974118.4, + "logits/rejected": 1766915072.0, + "logps/chosen": -307.51025390625, + "logps/rejected": -474.328369140625, + "loss": 0.1978, + "rewards/chosen": 0.9682549476623535, + "rewards/margins": 7.710261249542237, + "rewards/rejected": -6.742006301879883, + "step": 691 + }, + { + "epoch": 0.2554566009874948, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.602508427893794e-06, + "logits/chosen": 1690416128.0, + "logits/rejected": 1653553766.4, + "logps/chosen": -270.0097249348958, + "logps/rejected": -494.12421875, + "loss": 0.1362, + "rewards/chosen": 1.0904799302419026, + "rewards/margins": 8.234240611394247, + "rewards/rejected": -7.143760681152344, + "step": 692 + }, + { + "epoch": 0.255825757925338, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 8.598424727836343e-06, + "logits/chosen": 1953130349.7142856, + "logits/rejected": 1840313139.2, + "logps/chosen": -264.21786063058033, + "logps/rejected": -587.3396484375, + "loss": 0.0843, + "rewards/chosen": 1.0863005093165807, + "rewards/margins": 10.267689669472832, + "rewards/rejected": -9.18138916015625, + "step": 693 + }, + { + "epoch": 0.2561949148631812, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.59433604220715e-06, + "logits/chosen": 1866516366.2222223, + "logits/rejected": 1585663853.7142856, + "logps/chosen": -211.74864366319446, + "logps/rejected": -438.4081333705357, + "loss": 0.1756, + "rewards/chosen": 1.3603085411919489, + "rewards/margins": 8.8062559158083, + "rewards/rejected": -7.445947374616351, + "step": 694 + }, + { + "epoch": 0.2565640718010244, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 8.590242376671035e-06, + "logits/chosen": 2369217929.8461537, + "logits/rejected": 2309797564.631579, + "logps/chosen": -334.5200946514423, + "logps/rejected": -531.6450452302631, + "loss": 0.1132, + "rewards/chosen": 1.693501692551833, + "rewards/margins": 8.747140351577325, + "rewards/rejected": -7.053638659025493, + "step": 695 + }, + { + "epoch": 0.2569332287388676, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.586143736899721e-06, + "logits/chosen": 1206721280.0, + "logits/rejected": 1069724928.0, + "logps/chosen": -232.7579803466797, + "logps/rejected": -439.3637390136719, + "loss": 0.1567, + "rewards/chosen": 1.3320797681808472, + "rewards/margins": 10.024652361869812, + "rewards/rejected": -8.692572593688965, + "step": 696 + }, + { + "epoch": 0.2573023856767108, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.582040128571822e-06, + "logits/chosen": 1749154669.7142856, + "logits/rejected": 1900851996.4444444, + "logps/chosen": -282.76475306919644, + "logps/rejected": -548.2631293402778, + "loss": 0.096, + "rewards/chosen": 2.03106267111642, + "rewards/margins": 9.221928081815205, + "rewards/rejected": -7.190865410698785, + "step": 697 + }, + { + "epoch": 0.257671542614554, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.577931557372832e-06, + "logits/chosen": 1864244284.235294, + "logits/rejected": 1581174237.8666666, + "logps/chosen": -300.5215418198529, + "logps/rejected": -367.42760416666664, + "loss": 0.1952, + "rewards/chosen": 0.9721404804902918, + "rewards/margins": 6.344993060242896, + "rewards/rejected": -5.372852579752604, + "step": 698 + }, + { + "epoch": 0.2580406995523972, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.573818028995129e-06, + "logits/chosen": 1502296960.0, + "logits/rejected": 1491697152.0, + "logps/chosen": -232.6980438232422, + "logps/rejected": -424.576904296875, + "loss": 0.197, + "rewards/chosen": 0.7530190944671631, + "rewards/margins": 7.363127946853638, + "rewards/rejected": -6.610108852386475, + "step": 699 + }, + { + "epoch": 0.2584098564902404, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.56969954913795e-06, + "logits/chosen": 1301741056.0, + "logits/rejected": 2037355904.0, + "logps/chosen": -178.26678466796875, + "logps/rejected": -490.1294860839844, + "loss": 0.1841, + "rewards/chosen": 0.9701017141342163, + "rewards/margins": 7.9521883726119995, + "rewards/rejected": -6.982086658477783, + "step": 700 + }, + { + "epoch": 0.2587790134280836, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 8.565576123507398e-06, + "logits/chosen": 1816361756.4444444, + "logits/rejected": 2888043475.478261, + "logps/chosen": -233.67787000868054, + "logps/rejected": -474.1322180706522, + "loss": 0.1049, + "rewards/chosen": 0.8033172289530436, + "rewards/margins": 8.390401791835176, + "rewards/rejected": -7.587084562882133, + "step": 701 + }, + { + "epoch": 0.2591481703659268, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.561447757816428e-06, + "logits/chosen": 3558549367.4666667, + "logits/rejected": 2512792636.2352943, + "logps/chosen": -348.08619791666666, + "logps/rejected": -412.71607881433823, + "loss": 0.1281, + "rewards/chosen": 1.9977976481119792, + "rewards/margins": 9.586786905924479, + "rewards/rejected": -7.5889892578125, + "step": 702 + }, + { + "epoch": 0.25951732730377003, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.557314457784838e-06, + "logits/chosen": 2499396039.111111, + "logits/rejected": 1837727597.7142856, + "logps/chosen": -230.98814561631946, + "logps/rejected": -402.01307896205356, + "loss": 0.1969, + "rewards/chosen": 1.1478605270385742, + "rewards/margins": 8.448952402387347, + "rewards/rejected": -7.301091875348773, + "step": 703 + }, + { + "epoch": 0.2598864842416132, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.553176229139262e-06, + "logits/chosen": 1905652224.0, + "logits/rejected": 2688879104.0, + "logps/chosen": -339.8748779296875, + "logps/rejected": -469.4305419921875, + "loss": 0.1593, + "rewards/chosen": 1.2111269235610962, + "rewards/margins": 7.6917964220047, + "rewards/rejected": -6.4806694984436035, + "step": 704 + }, + { + "epoch": 0.26025564117945643, + "grad_norm": 12.0, + "kl": 1.0053119659423828, + "learning_rate": 8.54903307761316e-06, + "logits/chosen": 1752171466.1052632, + "logits/rejected": 2005397504.0, + "logps/chosen": -301.2323961759868, + "logps/rejected": -369.26509915865387, + "loss": 0.1785, + "rewards/chosen": 1.498931884765625, + "rewards/margins": 7.979768606332632, + "rewards/rejected": -6.480836721567007, + "step": 705 + }, + { + "epoch": 0.2606247981172996, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.544885008946822e-06, + "logits/chosen": 1824079499.6363637, + "logits/rejected": 2094868382.4761906, + "logps/chosen": -216.76420454545453, + "logps/rejected": -661.0634300595239, + "loss": 0.1338, + "rewards/chosen": 1.1554926091974431, + "rewards/margins": 9.873177400399081, + "rewards/rejected": -8.717684791201638, + "step": 706 + }, + { + "epoch": 0.26099395505514283, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 8.54073202888734e-06, + "logits/chosen": 1374426453.3333333, + "logits/rejected": 1765861286.9565217, + "logps/chosen": -200.953125, + "logps/rejected": -346.5176842730978, + "loss": 0.1295, + "rewards/chosen": 0.5691349771287706, + "rewards/margins": 7.099452200719124, + "rewards/rejected": -6.5303172235903535, + "step": 707 + }, + { + "epoch": 0.261363111992986, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.536574143188619e-06, + "logits/chosen": 1763637248.0, + "logits/rejected": 1876370298.4347825, + "logps/chosen": -297.84312608506946, + "logps/rejected": -432.2619735054348, + "loss": 0.1234, + "rewards/chosen": 0.6439764764573839, + "rewards/margins": 8.457567811588158, + "rewards/rejected": -7.813591335130774, + "step": 708 + }, + { + "epoch": 0.26173226893082924, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.532411357611352e-06, + "logits/chosen": 1690759040.0, + "logits/rejected": 1620271616.0, + "logps/chosen": -281.0442199707031, + "logps/rejected": -496.8271484375, + "loss": 0.1358, + "rewards/chosen": 2.1863605976104736, + "rewards/margins": 8.861633539199829, + "rewards/rejected": -6.6752729415893555, + "step": 709 + }, + { + "epoch": 0.2621014258686724, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.528243677923028e-06, + "logits/chosen": 1590008685.7142856, + "logits/rejected": 1620823040.0, + "logps/chosen": -274.89578683035717, + "logps/rejected": -507.4638671875, + "loss": 0.1437, + "rewards/chosen": 1.1951545987810408, + "rewards/margins": 8.421536536443801, + "rewards/rejected": -7.226381937662761, + "step": 710 + }, + { + "epoch": 0.26247058280651564, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 8.524071109897915e-06, + "logits/chosen": 1575872512.0, + "logits/rejected": 1668048603.4285715, + "logps/chosen": -339.5906032986111, + "logps/rejected": -570.0492117745536, + "loss": 0.1921, + "rewards/chosen": 1.1478169759114583, + "rewards/margins": 11.254784356980098, + "rewards/rejected": -10.106967381068639, + "step": 711 + }, + { + "epoch": 0.2628397397443588, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 8.519893659317052e-06, + "logits/chosen": 1795548774.4, + "logits/rejected": 1825740458.6666667, + "logps/chosen": -289.83134765625, + "logps/rejected": -453.2017415364583, + "loss": 0.2297, + "rewards/chosen": 0.9489949226379395, + "rewards/margins": 7.313629754384358, + "rewards/rejected": -6.364634831746419, + "step": 712 + }, + { + "epoch": 0.26320889668220204, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 8.515711331968242e-06, + "logits/chosen": 2133171792.8421052, + "logits/rejected": 1882783113.8461537, + "logps/chosen": -277.95962685032896, + "logps/rejected": -415.3552433894231, + "loss": 0.1652, + "rewards/chosen": 1.6074349252801192, + "rewards/margins": 7.973598912659927, + "rewards/rejected": -6.3661639873798075, + "step": 713 + }, + { + "epoch": 0.2635780536200452, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.51152413364605e-06, + "logits/chosen": 1332163925.3333333, + "logits/rejected": 1867391337.4117646, + "logps/chosen": -222.390087890625, + "logps/rejected": -393.5350988051471, + "loss": 0.1325, + "rewards/chosen": 1.3882767995198568, + "rewards/margins": 7.548514728920132, + "rewards/rejected": -6.160237929400275, + "step": 714 + }, + { + "epoch": 0.26394721055788845, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.507332070151784e-06, + "logits/chosen": 2360593976.888889, + "logits/rejected": 1631233755.4285715, + "logps/chosen": -300.18126085069446, + "logps/rejected": -477.26991489955356, + "loss": 0.1997, + "rewards/chosen": 0.9707688225640191, + "rewards/margins": 8.314943343874008, + "rewards/rejected": -7.344174521309989, + "step": 715 + }, + { + "epoch": 0.2643163674957316, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 8.503135147293496e-06, + "logits/chosen": 1733139328.0, + "logits/rejected": 1562306304.0, + "logps/chosen": -262.3669128417969, + "logps/rejected": -496.5869140625, + "loss": 0.166, + "rewards/chosen": 1.1070632934570312, + "rewards/margins": 8.657933235168457, + "rewards/rejected": -7.550869941711426, + "step": 716 + }, + { + "epoch": 0.26468552443357485, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.498933370885967e-06, + "logits/chosen": 1360120346.9473684, + "logits/rejected": 1728848817.2307692, + "logps/chosen": -301.32090357730266, + "logps/rejected": -439.55431189903845, + "loss": 0.1702, + "rewards/chosen": 1.5223317397268195, + "rewards/margins": 8.47398954847081, + "rewards/rejected": -6.95165780874399, + "step": 717 + }, + { + "epoch": 0.265054681371418, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 8.494726746750705e-06, + "logits/chosen": 1330706295.4666667, + "logits/rejected": 2060726031.0588236, + "logps/chosen": -191.76277669270834, + "logps/rejected": -470.759765625, + "loss": 0.1443, + "rewards/chosen": 1.42223269144694, + "rewards/margins": 8.829470024856867, + "rewards/rejected": -7.407237333409927, + "step": 718 + }, + { + "epoch": 0.26542383830926125, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.490515280715937e-06, + "logits/chosen": 2364996186.352941, + "logits/rejected": 1840331980.8, + "logps/chosen": -285.1833065257353, + "logps/rejected": -549.5393229166667, + "loss": 0.1742, + "rewards/chosen": 1.0194485608269186, + "rewards/margins": 10.152565219355564, + "rewards/rejected": -9.133116658528646, + "step": 719 + }, + { + "epoch": 0.2657929952471044, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.486298978616593e-06, + "logits/chosen": 1410933097.4117646, + "logits/rejected": 1456810257.0666666, + "logps/chosen": -291.14318129595586, + "logps/rejected": -452.30983072916666, + "loss": 0.1742, + "rewards/chosen": 1.2349590974695541, + "rewards/margins": 9.11880614336799, + "rewards/rejected": -7.883847045898437, + "step": 720 + }, + { + "epoch": 0.26616215218494765, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 8.48207784629431e-06, + "logits/chosen": 1883839744.0, + "logits/rejected": 1376344064.0, + "logps/chosen": -354.7296142578125, + "logps/rejected": -386.4733581542969, + "loss": 0.1805, + "rewards/chosen": 0.8549596667289734, + "rewards/margins": 7.068763911724091, + "rewards/rejected": -6.213804244995117, + "step": 721 + }, + { + "epoch": 0.2665313091227908, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.477851889597408e-06, + "logits/chosen": 1465790825.4117646, + "logits/rejected": 1714474188.8, + "logps/chosen": -280.22009995404414, + "logps/rejected": -452.34583333333336, + "loss": 0.163, + "rewards/chosen": 1.5502988029928768, + "rewards/margins": 9.011663040460325, + "rewards/rejected": -7.461364237467448, + "step": 722 + }, + { + "epoch": 0.266900466060634, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 8.473621114380899e-06, + "logits/chosen": 2304839439.0588236, + "logits/rejected": 1537664614.4, + "logps/chosen": -368.36865234375, + "logps/rejected": -573.8295572916667, + "loss": 0.1895, + "rewards/chosen": 0.8097377103917739, + "rewards/margins": 9.270643166934743, + "rewards/rejected": -8.460905456542969, + "step": 723 + }, + { + "epoch": 0.26726962299847723, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.469385526506466e-06, + "logits/chosen": 2105896320.0, + "logits/rejected": 1929952256.0, + "logps/chosen": -342.3580322265625, + "logps/rejected": -517.1311645507812, + "loss": 0.1656, + "rewards/chosen": 1.0522119998931885, + "rewards/margins": 8.553890466690063, + "rewards/rejected": -7.501678466796875, + "step": 724 + }, + { + "epoch": 0.2676387799363204, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 8.465145131842467e-06, + "logits/chosen": 1209644347.0769231, + "logits/rejected": 2355036160.0, + "logps/chosen": -254.0584998497596, + "logps/rejected": -422.6581774259868, + "loss": 0.2067, + "rewards/chosen": 1.1839763934795673, + "rewards/margins": 6.944912929766574, + "rewards/rejected": -5.760936536287007, + "step": 725 + }, + { + "epoch": 0.26800793687416363, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.46089993626391e-06, + "logits/chosen": 1980739447.4666667, + "logits/rejected": 2029912304.9411764, + "logps/chosen": -375.4416015625, + "logps/rejected": -397.6094324448529, + "loss": 0.152, + "rewards/chosen": 1.1780044555664062, + "rewards/margins": 8.307536764705882, + "rewards/rejected": -7.129532309139476, + "step": 726 + }, + { + "epoch": 0.2683770938120068, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 8.456649945652463e-06, + "logits/chosen": 1993883921.0666666, + "logits/rejected": 1754258371.764706, + "logps/chosen": -191.17587890625, + "logps/rejected": -515.4721966911765, + "loss": 0.1267, + "rewards/chosen": 1.3963610331217449, + "rewards/margins": 9.150813009224686, + "rewards/rejected": -7.754451976102941, + "step": 727 + }, + { + "epoch": 0.26874625074985004, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.452395165896433e-06, + "logits/chosen": 1892826190.7692308, + "logits/rejected": 1999675823.1578948, + "logps/chosen": -325.47569861778845, + "logps/rejected": -589.1712582236842, + "loss": 0.1215, + "rewards/chosen": 1.2532466008112981, + "rewards/margins": 10.918916092227828, + "rewards/rejected": -9.66566949141653, + "step": 728 + }, + { + "epoch": 0.2691154076876932, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 8.448135602890763e-06, + "logits/chosen": 1941016389.8181818, + "logits/rejected": 1635075072.0, + "logps/chosen": -312.67844460227275, + "logps/rejected": -424.200146484375, + "loss": 0.2712, + "rewards/chosen": 0.6924879334189675, + "rewards/margins": 8.99737379767678, + "rewards/rejected": -8.304885864257812, + "step": 729 + }, + { + "epoch": 0.26948456462553644, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 8.443871262537023e-06, + "logits/chosen": 1789425810.2857144, + "logits/rejected": 1505312540.4444444, + "logps/chosen": -260.25106375558033, + "logps/rejected": -479.05723741319446, + "loss": 0.1432, + "rewards/chosen": 1.5229415893554688, + "rewards/margins": 10.195179409450954, + "rewards/rejected": -8.672237820095486, + "step": 730 + }, + { + "epoch": 0.2698537215633796, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.4396021507434e-06, + "logits/chosen": 2084863219.8095238, + "logits/rejected": 1822718510.5454545, + "logps/chosen": -295.35228329613096, + "logps/rejected": -397.00692471590907, + "loss": 0.1975, + "rewards/chosen": 1.3762493133544922, + "rewards/margins": 7.344661539251154, + "rewards/rejected": -5.968412225896662, + "step": 731 + }, + { + "epoch": 0.27022287850122284, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.4353282734247e-06, + "logits/chosen": 1485335009.8823528, + "logits/rejected": 1535449088.0, + "logps/chosen": -279.3338407628676, + "logps/rejected": -454.4908203125, + "loss": 0.178, + "rewards/chosen": 1.2829552818747127, + "rewards/margins": 8.180408589980182, + "rewards/rejected": -6.897453308105469, + "step": 732 + }, + { + "epoch": 0.270592035439066, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.431049636502322e-06, + "logits/chosen": 1759702747.4285715, + "logits/rejected": 1572019768.8888888, + "logps/chosen": -347.2534877232143, + "logps/rejected": -400.6532389322917, + "loss": 0.1198, + "rewards/chosen": 1.6595938546316964, + "rewards/margins": 9.098846919952877, + "rewards/rejected": -7.43925306532118, + "step": 733 + }, + { + "epoch": 0.27096119237690924, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 8.426766245904263e-06, + "logits/chosen": 2045888188.631579, + "logits/rejected": 1950828701.5384614, + "logps/chosen": -306.00899465460526, + "logps/rejected": -311.11609825721155, + "loss": 0.2255, + "rewards/chosen": 1.1355122014095909, + "rewards/margins": 6.30951110747179, + "rewards/rejected": -5.1739989060622, + "step": 734 + }, + { + "epoch": 0.2713303493147524, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 8.422478107565108e-06, + "logits/chosen": 1954966674.2857144, + "logits/rejected": 1370471082.6666667, + "logps/chosen": -198.89540318080358, + "logps/rejected": -435.7649197048611, + "loss": 0.1129, + "rewards/chosen": 1.7736165182931083, + "rewards/margins": 8.770435484628829, + "rewards/rejected": -6.99681896633572, + "step": 735 + }, + { + "epoch": 0.27169950625259565, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 8.418185227426016e-06, + "logits/chosen": 1740405009.0666666, + "logits/rejected": 1649257773.1764705, + "logps/chosen": -404.66588541666664, + "logps/rejected": -535.7639016544117, + "loss": 0.1833, + "rewards/chosen": 0.9322424570719401, + "rewards/margins": 9.032965723673502, + "rewards/rejected": -8.100723266601562, + "step": 736 + }, + { + "epoch": 0.2720686631904388, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.41388761143472e-06, + "logits/chosen": 1791831848.4210527, + "logits/rejected": 1640457137.2307692, + "logps/chosen": -232.26755242598685, + "logps/rejected": -422.34175931490387, + "loss": 0.1456, + "rewards/chosen": 1.6087160612407483, + "rewards/margins": 8.83876519454153, + "rewards/rejected": -7.230049133300781, + "step": 737 + }, + { + "epoch": 0.27243782012828205, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.409585265545509e-06, + "logits/chosen": 1417081514.6666667, + "logits/rejected": 2093712091.4285715, + "logps/chosen": -224.60643174913196, + "logps/rejected": -489.6946498325893, + "loss": 0.1888, + "rewards/chosen": 1.2593495051066081, + "rewards/margins": 8.408879643394833, + "rewards/rejected": -7.149530138288226, + "step": 738 + }, + { + "epoch": 0.2728069770661252, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.405278195719233e-06, + "logits/chosen": 2898794268.4444447, + "logits/rejected": 1607668004.5714285, + "logps/chosen": -299.5699869791667, + "logps/rejected": -415.4795619419643, + "loss": 0.1778, + "rewards/chosen": 1.3134018580118816, + "rewards/margins": 7.878941081819081, + "rewards/rejected": -6.565539223807199, + "step": 739 + }, + { + "epoch": 0.27317613400396845, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.40096640792328e-06, + "logits/chosen": 1600694810.9473684, + "logits/rejected": 1610778624.0, + "logps/chosen": -286.3563168174342, + "logps/rejected": -435.7476337139423, + "loss": 0.1579, + "rewards/chosen": 2.0813379789653577, + "rewards/margins": 8.825593044883327, + "rewards/rejected": -6.744255065917969, + "step": 740 + }, + { + "epoch": 0.2735452909418116, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 8.396649908131578e-06, + "logits/chosen": 992298461.8666667, + "logits/rejected": 1261353923.764706, + "logps/chosen": -222.095947265625, + "logps/rejected": -434.11646943933823, + "loss": 0.122, + "rewards/chosen": 2.016513188680013, + "rewards/margins": 8.744632085164389, + "rewards/rejected": -6.728118896484375, + "step": 741 + }, + { + "epoch": 0.27391444787965485, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.39232870232458e-06, + "logits/chosen": 1883850368.0, + "logits/rejected": 2926250496.0, + "logps/chosen": -327.1271057128906, + "logps/rejected": -513.7808227539062, + "loss": 0.1535, + "rewards/chosen": 1.3816542625427246, + "rewards/margins": 8.368366241455078, + "rewards/rejected": -6.9867119789123535, + "step": 742 + }, + { + "epoch": 0.27428360481749803, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.388002796489267e-06, + "logits/chosen": 1672761636.5714285, + "logits/rejected": 1775308913.7777777, + "logps/chosen": -233.78553989955358, + "logps/rejected": -427.9992404513889, + "loss": 0.1439, + "rewards/chosen": 1.558112961905343, + "rewards/margins": 9.18136159200517, + "rewards/rejected": -7.623248630099827, + "step": 743 + }, + { + "epoch": 0.27465276175534126, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.383672196619123e-06, + "logits/chosen": 1935238290.2857144, + "logits/rejected": 2633674379.6363635, + "logps/chosen": -263.06849888392856, + "logps/rejected": -494.3399769176136, + "loss": 0.2092, + "rewards/chosen": 1.2428042093912761, + "rewards/margins": 8.891675891298236, + "rewards/rejected": -7.64887168190696, + "step": 744 + }, + { + "epoch": 0.27502191869318443, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 8.37933690871414e-06, + "logits/chosen": 1874977249.8823528, + "logits/rejected": 1927916339.2, + "logps/chosen": -281.28937844669116, + "logps/rejected": -384.8652018229167, + "loss": 0.1381, + "rewards/chosen": 1.6131117203656364, + "rewards/margins": 7.994609421374751, + "rewards/rejected": -6.381497701009114, + "step": 745 + }, + { + "epoch": 0.27539107563102766, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.374996938780804e-06, + "logits/chosen": 1971765850.3529413, + "logits/rejected": 1536362222.9333334, + "logps/chosen": -354.9510857077206, + "logps/rejected": -447.5161458333333, + "loss": 0.1122, + "rewards/chosen": 2.0395447226131664, + "rewards/margins": 9.545179284787645, + "rewards/rejected": -7.505634562174479, + "step": 746 + }, + { + "epoch": 0.27576023256887083, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.370652292832087e-06, + "logits/chosen": 1677098154.6666667, + "logits/rejected": 1403976601.6, + "logps/chosen": -316.8774820963542, + "logps/rejected": -517.842724609375, + "loss": 0.0975, + "rewards/chosen": 1.9518934885660808, + "rewards/margins": 9.733116022745769, + "rewards/rejected": -7.781222534179688, + "step": 747 + }, + { + "epoch": 0.27612938950671406, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.366302976887442e-06, + "logits/chosen": 1577245582.2222223, + "logits/rejected": 1311952749.7142856, + "logps/chosen": -297.1344943576389, + "logps/rejected": -412.52762276785717, + "loss": 0.1604, + "rewards/chosen": 1.7567385567559137, + "rewards/margins": 9.2358122174702, + "rewards/rejected": -7.479073660714286, + "step": 748 + }, + { + "epoch": 0.27649854644455724, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.361948996972792e-06, + "logits/chosen": 2142604065.3913043, + "logits/rejected": 1724719900.4444444, + "logps/chosen": -261.6469089673913, + "logps/rejected": -394.0995822482639, + "loss": 0.2383, + "rewards/chosen": 1.134678550388502, + "rewards/margins": 8.576778227580341, + "rewards/rejected": -7.44209967719184, + "step": 749 + }, + { + "epoch": 0.27686770338240047, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.357590359120518e-06, + "logits/chosen": 1716166294.5882354, + "logits/rejected": 1356393676.8, + "logps/chosen": -256.6574276194853, + "logps/rejected": -518.3037109375, + "loss": 0.1659, + "rewards/chosen": 1.1612701416015625, + "rewards/margins": 10.466366577148438, + "rewards/rejected": -9.305096435546876, + "step": 750 + }, + { + "epoch": 0.27723686032024364, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 8.353227069369461e-06, + "logits/chosen": 1748869688.8888888, + "logits/rejected": 1602397622.857143, + "logps/chosen": -335.7532009548611, + "logps/rejected": -393.21888950892856, + "loss": 0.2198, + "rewards/chosen": 0.8964270485772027, + "rewards/margins": 8.122755762130495, + "rewards/rejected": -7.2263287135532925, + "step": 751 + }, + { + "epoch": 0.27760601725808687, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 8.348859133764902e-06, + "logits/chosen": 1615163632.9411764, + "logits/rejected": 1529911432.5333333, + "logps/chosen": -386.5240693933824, + "logps/rejected": -366.5708333333333, + "loss": 0.2168, + "rewards/chosen": 0.690982369815602, + "rewards/margins": 7.027823698754404, + "rewards/rejected": -6.336841328938802, + "step": 752 + }, + { + "epoch": 0.27797517419593004, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 8.34448655835856e-06, + "logits/chosen": 1645199667.2, + "logits/rejected": 1599855616.0, + "logps/chosen": -290.9887939453125, + "logps/rejected": -415.5536702473958, + "loss": 0.2095, + "rewards/chosen": 1.0834282875061034, + "rewards/margins": 7.789382521311442, + "rewards/rejected": -6.705954233805339, + "step": 753 + }, + { + "epoch": 0.27834433113377327, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.34010934920858e-06, + "logits/chosen": 1542402404.173913, + "logits/rejected": 1927808341.3333333, + "logps/chosen": -273.32033372961956, + "logps/rejected": -443.34190538194446, + "loss": 0.2449, + "rewards/chosen": 1.099161148071289, + "rewards/margins": 6.864378823174371, + "rewards/rejected": -5.765217675103082, + "step": 754 + }, + { + "epoch": 0.27871348807161644, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.335727512379535e-06, + "logits/chosen": 1669322752.0, + "logits/rejected": 1378362647.2727273, + "logps/chosen": -251.3404296875, + "logps/rejected": -428.9881480823864, + "loss": 0.0852, + "rewards/chosen": 1.3134403228759766, + "rewards/margins": 9.980798721313477, + "rewards/rejected": -8.6673583984375, + "step": 755 + }, + { + "epoch": 0.2790826450094597, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.3313410539424e-06, + "logits/chosen": 2006537947.4285715, + "logits/rejected": 1937155413.3333333, + "logps/chosen": -245.164794921875, + "logps/rejected": -386.92778862847223, + "loss": 0.1364, + "rewards/chosen": 1.4845929827008928, + "rewards/margins": 7.347633906773158, + "rewards/rejected": -5.863040924072266, + "step": 756 + }, + { + "epoch": 0.27945180194730285, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 8.32694997997456e-06, + "logits/chosen": 1701811792.8421052, + "logits/rejected": 1718808891.0769231, + "logps/chosen": -274.68716591282896, + "logps/rejected": -411.28064903846155, + "loss": 0.2414, + "rewards/chosen": 0.7195495304308439, + "rewards/margins": 7.341983119485832, + "rewards/rejected": -6.622433589054988, + "step": 757 + }, + { + "epoch": 0.279820958885146, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.322554296559792e-06, + "logits/chosen": 2085876736.0, + "logits/rejected": 1581271040.0, + "logps/chosen": -309.3119201660156, + "logps/rejected": -376.4004211425781, + "loss": 0.1942, + "rewards/chosen": 1.4214270114898682, + "rewards/margins": 7.609297037124634, + "rewards/rejected": -6.187870025634766, + "step": 758 + }, + { + "epoch": 0.28019011582298925, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.318154009788257e-06, + "logits/chosen": 2207913691.428571, + "logits/rejected": 1648308968.7272727, + "logps/chosen": -308.4857933407738, + "logps/rejected": -463.4357244318182, + "loss": 0.2117, + "rewards/chosen": 1.3291015625, + "rewards/margins": 7.837362809614702, + "rewards/rejected": -6.508261247114702, + "step": 759 + }, + { + "epoch": 0.2805592727608324, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.3137491257565e-06, + "logits/chosen": 2180984832.0, + "logits/rejected": 2946864274.285714, + "logps/chosen": -265.23008897569446, + "logps/rejected": -516.3490164620536, + "loss": 0.205, + "rewards/chosen": 1.1903200149536133, + "rewards/margins": 8.279749597821917, + "rewards/rejected": -7.089429582868304, + "step": 760 + }, + { + "epoch": 0.28092842969867565, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.30933965056743e-06, + "logits/chosen": 2080200294.4, + "logits/rejected": 1794530605.1764705, + "logps/chosen": -441.62604166666665, + "logps/rejected": -421.8765510110294, + "loss": 0.1558, + "rewards/chosen": 1.3187789916992188, + "rewards/margins": 8.952957602108226, + "rewards/rejected": -7.634178610409007, + "step": 761 + }, + { + "epoch": 0.2812975866365188, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.304925590330318e-06, + "logits/chosen": 2067771776.0, + "logits/rejected": 1599947648.0, + "logps/chosen": -328.9259338378906, + "logps/rejected": -520.1430053710938, + "loss": 0.1249, + "rewards/chosen": 1.9356510639190674, + "rewards/margins": 9.888720273971558, + "rewards/rejected": -7.95306921005249, + "step": 762 + }, + { + "epoch": 0.28166674357436206, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.300506951160789e-06, + "logits/chosen": 1608517778.2857144, + "logits/rejected": 1204006684.4444444, + "logps/chosen": -339.0216587611607, + "logps/rejected": -451.3554958767361, + "loss": 0.1558, + "rewards/chosen": 1.1144777025495256, + "rewards/margins": 8.15309939308772, + "rewards/rejected": -7.038621690538195, + "step": 763 + }, + { + "epoch": 0.28203590051220523, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 8.296083739180812e-06, + "logits/chosen": 2149351680.0, + "logits/rejected": 2357044224.0, + "logps/chosen": -316.3919372558594, + "logps/rejected": -444.80926513671875, + "loss": 0.1695, + "rewards/chosen": 1.044415831565857, + "rewards/margins": 9.435024619102478, + "rewards/rejected": -8.390608787536621, + "step": 764 + }, + { + "epoch": 0.28240505745004846, + "grad_norm": 11.5625, + "kl": 0.22266674041748047, + "learning_rate": 8.29165596051869e-06, + "logits/chosen": 1829449728.0, + "logits/rejected": 2631174553.6, + "logps/chosen": -277.0448404947917, + "logps/rejected": -524.19052734375, + "loss": 0.1437, + "rewards/chosen": 0.9586912790934244, + "rewards/margins": 7.74412218729655, + "rewards/rejected": -6.785430908203125, + "step": 765 + }, + { + "epoch": 0.28277421438789163, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.287223621309055e-06, + "logits/chosen": 1858469614.9333334, + "logits/rejected": 1870799570.8235295, + "logps/chosen": -284.45530598958334, + "logps/rejected": -694.7135799632352, + "loss": 0.1766, + "rewards/chosen": 0.7417887369791667, + "rewards/margins": 10.97066608503753, + "rewards/rejected": -10.228877348058363, + "step": 766 + }, + { + "epoch": 0.28314337132573486, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.282786727692856e-06, + "logits/chosen": 1748838184.4210527, + "logits/rejected": 1371550956.3076923, + "logps/chosen": -235.44140625, + "logps/rejected": -445.85990084134613, + "loss": 0.1579, + "rewards/chosen": 1.8203420137104236, + "rewards/margins": 9.876913974159642, + "rewards/rejected": -8.056571960449219, + "step": 767 + }, + { + "epoch": 0.28351252826357803, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.278345285817353e-06, + "logits/chosen": 1767820107.2941177, + "logits/rejected": 1284047530.6666667, + "logps/chosen": -329.79181985294116, + "logps/rejected": -441.61217447916664, + "loss": 0.1603, + "rewards/chosen": 1.191459431367762, + "rewards/margins": 8.286346719779221, + "rewards/rejected": -7.094887288411458, + "step": 768 + }, + { + "epoch": 0.28388168520142126, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.273899301836111e-06, + "logits/chosen": 2094867865.6, + "logits/rejected": 2216908458.6666665, + "logps/chosen": -278.3982177734375, + "logps/rejected": -437.493896484375, + "loss": 0.1828, + "rewards/chosen": 1.2558897018432618, + "rewards/margins": 8.385862795511882, + "rewards/rejected": -7.12997309366862, + "step": 769 + }, + { + "epoch": 0.28425084213926444, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 8.269448781908977e-06, + "logits/chosen": 1579163216.8421052, + "logits/rejected": 1355386564.9230769, + "logps/chosen": -310.46767064144734, + "logps/rejected": -474.17886117788464, + "loss": 0.1883, + "rewards/chosen": 1.2862842961361534, + "rewards/margins": 8.333298972743727, + "rewards/rejected": -7.0470146766075725, + "step": 770 + }, + { + "epoch": 0.28461999907710767, + "grad_norm": 8.5625, + "kl": 0.4121088981628418, + "learning_rate": 8.264993732202094e-06, + "logits/chosen": 2757693755.076923, + "logits/rejected": 2028119309.4736843, + "logps/chosen": -253.33710186298077, + "logps/rejected": -377.8335731907895, + "loss": 0.083, + "rewards/chosen": 2.3260924999530497, + "rewards/margins": 8.270744856552557, + "rewards/rejected": -5.944652356599507, + "step": 771 + }, + { + "epoch": 0.28498915601495084, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.260534158887878e-06, + "logits/chosen": 1885598378.6666667, + "logits/rejected": 1636342723.764706, + "logps/chosen": -239.49612630208333, + "logps/rejected": -550.2908432904412, + "loss": 0.1687, + "rewards/chosen": 0.8601183573404948, + "rewards/margins": 9.799769846598307, + "rewards/rejected": -8.939651489257812, + "step": 772 + }, + { + "epoch": 0.28535831295279407, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.256070068145009e-06, + "logits/chosen": 1815894144.0, + "logits/rejected": 2124727552.0, + "logps/chosen": -331.2087707519531, + "logps/rejected": -452.85400390625, + "loss": 0.1587, + "rewards/chosen": 1.2800062894821167, + "rewards/margins": 8.432247996330261, + "rewards/rejected": -7.1522417068481445, + "step": 773 + }, + { + "epoch": 0.28572746989063724, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.251601466158428e-06, + "logits/chosen": 2008872576.0, + "logits/rejected": 1761766784.0, + "logps/chosen": -242.1033172607422, + "logps/rejected": -635.6817626953125, + "loss": 0.106, + "rewards/chosen": 2.5335941314697266, + "rewards/margins": 12.11904525756836, + "rewards/rejected": -9.585451126098633, + "step": 774 + }, + { + "epoch": 0.28609662682848047, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.247128359119326e-06, + "logits/chosen": 1794399872.0, + "logits/rejected": 1635362816.0, + "logps/chosen": -331.3878173828125, + "logps/rejected": -544.0890502929688, + "loss": 0.1684, + "rewards/chosen": 1.1966028213500977, + "rewards/margins": 9.600072860717773, + "rewards/rejected": -8.403470039367676, + "step": 775 + }, + { + "epoch": 0.28646578376632365, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 8.242650753225137e-06, + "logits/chosen": 1904461414.4, + "logits/rejected": 1611770709.3333333, + "logps/chosen": -273.80771484375, + "logps/rejected": -443.1193033854167, + "loss": 0.2077, + "rewards/chosen": 1.1953511238098145, + "rewards/margins": 8.147849241892498, + "rewards/rejected": -6.952498118082683, + "step": 776 + }, + { + "epoch": 0.2868349407041669, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.238168654679528e-06, + "logits/chosen": 1889235259.0769231, + "logits/rejected": 1862129125.0526316, + "logps/chosen": -281.66562124399036, + "logps/rejected": -474.93251439144734, + "loss": 0.1539, + "rewards/chosen": 0.9561989124004657, + "rewards/margins": 9.507205831859757, + "rewards/rejected": -8.551006919459292, + "step": 777 + }, + { + "epoch": 0.28720409764201005, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.233682069692388e-06, + "logits/chosen": 1260089990.7368422, + "logits/rejected": 2550590857.8461537, + "logps/chosen": -241.13666735197367, + "logps/rejected": -581.9029071514423, + "loss": 0.2017, + "rewards/chosen": 0.9727998030813116, + "rewards/margins": 8.633146787944593, + "rewards/rejected": -7.660346984863281, + "step": 778 + }, + { + "epoch": 0.2875732545798533, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.229191004479825e-06, + "logits/chosen": 1650074965.3333333, + "logits/rejected": 1767265718.857143, + "logps/chosen": -238.38064236111111, + "logps/rejected": -570.4975934709821, + "loss": 0.1762, + "rewards/chosen": 1.0989985995822482, + "rewards/margins": 9.829411763993521, + "rewards/rejected": -8.730413164411273, + "step": 779 + }, + { + "epoch": 0.28794241151769645, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.22469546526415e-06, + "logits/chosen": 1627609600.0, + "logits/rejected": 1682386560.0, + "logps/chosen": -306.8687744140625, + "logps/rejected": -530.6214599609375, + "loss": 0.1753, + "rewards/chosen": 0.9425001740455627, + "rewards/margins": 9.181773245334625, + "rewards/rejected": -8.239273071289062, + "step": 780 + }, + { + "epoch": 0.2883115684555397, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.220195458273879e-06, + "logits/chosen": 1872500248.3809524, + "logits/rejected": 2260174103.2727275, + "logps/chosen": -189.82042875744048, + "logps/rejected": -349.0494495738636, + "loss": 0.2311, + "rewards/chosen": 0.9272289276123047, + "rewards/margins": 6.977015581997958, + "rewards/rejected": -6.049786654385653, + "step": 781 + }, + { + "epoch": 0.28868072539338285, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.21569098974371e-06, + "logits/chosen": 2099915434.6666667, + "logits/rejected": 2713975868.2352943, + "logps/chosen": -264.8400390625, + "logps/rejected": -515.2446001838235, + "loss": 0.156, + "rewards/chosen": 0.9563973108927409, + "rewards/margins": 8.259683695026473, + "rewards/rejected": -7.303286384133732, + "step": 782 + }, + { + "epoch": 0.2890498823312261, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.211182065914531e-06, + "logits/chosen": 1855494912.0, + "logits/rejected": 1574805120.0, + "logps/chosen": -261.7393798828125, + "logps/rejected": -497.7912902832031, + "loss": 0.1779, + "rewards/chosen": 1.088707685470581, + "rewards/margins": 8.731104612350464, + "rewards/rejected": -7.642396926879883, + "step": 783 + }, + { + "epoch": 0.28941903926906926, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.206668693033399e-06, + "logits/chosen": 2046985344.0, + "logits/rejected": 1863873536.0, + "logps/chosen": -302.32244873046875, + "logps/rejected": -480.84954833984375, + "loss": 0.1591, + "rewards/chosen": 1.0393097400665283, + "rewards/margins": 9.838927507400513, + "rewards/rejected": -8.799617767333984, + "step": 784 + }, + { + "epoch": 0.2897881962069125, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.202150877353533e-06, + "logits/chosen": 1967445760.0, + "logits/rejected": 1730024448.0, + "logps/chosen": -298.3865661621094, + "logps/rejected": -527.1275634765625, + "loss": 0.1651, + "rewards/chosen": 1.0504655838012695, + "rewards/margins": 11.105935096740723, + "rewards/rejected": -10.055469512939453, + "step": 785 + }, + { + "epoch": 0.29015735314475566, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.197628625134306e-06, + "logits/chosen": 1962916864.0, + "logits/rejected": 1984057728.0, + "logps/chosen": -297.5548400878906, + "logps/rejected": -582.4058227539062, + "loss": 0.1215, + "rewards/chosen": 1.6408048868179321, + "rewards/margins": 9.789620995521545, + "rewards/rejected": -8.148816108703613, + "step": 786 + }, + { + "epoch": 0.2905265100825989, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 8.193101942641248e-06, + "logits/chosen": 1613820586.6666667, + "logits/rejected": 1536797696.0, + "logps/chosen": -280.4144287109375, + "logps/rejected": -612.351123046875, + "loss": 0.0981, + "rewards/chosen": 1.8199532826741536, + "rewards/margins": 18.205771764119465, + "rewards/rejected": -16.385818481445312, + "step": 787 + }, + { + "epoch": 0.29089566702044206, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.188570836146015e-06, + "logits/chosen": 1538041976.4705882, + "logits/rejected": 2477161949.866667, + "logps/chosen": -284.44226792279414, + "logps/rejected": -559.7286458333333, + "loss": 0.1468, + "rewards/chosen": 1.5848420087028952, + "rewards/margins": 8.970545540603936, + "rewards/rejected": -7.385703531901042, + "step": 788 + }, + { + "epoch": 0.2912648239582853, + "grad_norm": 12.8125, + "kl": 0.11930704116821289, + "learning_rate": 8.184035311926397e-06, + "logits/chosen": 2058395921.0666666, + "logits/rejected": 2079244528.9411764, + "logps/chosen": -343.15247395833336, + "logps/rejected": -634.4252642463235, + "loss": 0.162, + "rewards/chosen": 0.9836466471354167, + "rewards/margins": 9.543459305108764, + "rewards/rejected": -8.559812657973346, + "step": 789 + }, + { + "epoch": 0.29163398089612846, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 8.17949537626631e-06, + "logits/chosen": 1745475111.3846154, + "logits/rejected": 1933801579.7894738, + "logps/chosen": -283.62252103365387, + "logps/rejected": -501.8071546052632, + "loss": 0.1124, + "rewards/chosen": 1.8299647111159105, + "rewards/margins": 9.434399075836305, + "rewards/rejected": -7.604434364720395, + "step": 790 + }, + { + "epoch": 0.2920031378339717, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.174951035455772e-06, + "logits/chosen": 1403702980.9230769, + "logits/rejected": 1370635425.6842105, + "logps/chosen": -254.3580603966346, + "logps/rejected": -453.39766652960526, + "loss": 0.1313, + "rewards/chosen": 1.5636182931753306, + "rewards/margins": 8.556041902858718, + "rewards/rejected": -6.992423609683388, + "step": 791 + }, + { + "epoch": 0.29237229477181487, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 8.170402295790913e-06, + "logits/chosen": 1577094940.4444444, + "logits/rejected": 1511763968.0, + "logps/chosen": -272.1066623263889, + "logps/rejected": -474.45717075892856, + "loss": 0.1887, + "rewards/chosen": 1.3187084197998047, + "rewards/margins": 9.258815492902484, + "rewards/rejected": -7.940107073102679, + "step": 792 + }, + { + "epoch": 0.2927414517096581, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.165849163573954e-06, + "logits/chosen": 2107981824.0, + "logits/rejected": 1416503588.5714285, + "logps/chosen": -319.99354383680554, + "logps/rejected": -503.9273158482143, + "loss": 0.2151, + "rewards/chosen": 0.8559544351365831, + "rewards/margins": 8.393319939810132, + "rewards/rejected": -7.5373655046735495, + "step": 793 + }, + { + "epoch": 0.29311060864750127, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.161291645113198e-06, + "logits/chosen": 2130477283.5555556, + "logits/rejected": 2515419721.142857, + "logps/chosen": -324.08124457465277, + "logps/rejected": -680.0563616071429, + "loss": 0.1846, + "rewards/chosen": 1.0262871848212347, + "rewards/margins": 10.007030048067609, + "rewards/rejected": -8.980742863246373, + "step": 794 + }, + { + "epoch": 0.29347976558534444, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.156729746723034e-06, + "logits/chosen": 1590843776.0, + "logits/rejected": 1275528064.0, + "logps/chosen": -307.7215270996094, + "logps/rejected": -440.2825012207031, + "loss": 0.2022, + "rewards/chosen": 0.7474891543388367, + "rewards/margins": 6.7448737025260925, + "rewards/rejected": -5.997384548187256, + "step": 795 + }, + { + "epoch": 0.2938489225231877, + "grad_norm": 11.1875, + "kl": 0.2413802146911621, + "learning_rate": 8.15216347472391e-06, + "logits/chosen": 1804501307.0769231, + "logits/rejected": 2096511946.1052632, + "logps/chosen": -246.74275090144232, + "logps/rejected": -427.66737767269734, + "loss": 0.1506, + "rewards/chosen": 0.8671428240262545, + "rewards/margins": 8.057019982743359, + "rewards/rejected": -7.189877158717105, + "step": 796 + }, + { + "epoch": 0.29421807946103085, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 8.14759283544234e-06, + "logits/chosen": 1894426477.7142856, + "logits/rejected": 2007614350.2222223, + "logps/chosen": -335.78982979910717, + "logps/rejected": -490.38368055555554, + "loss": 0.0905, + "rewards/chosen": 2.523394993373326, + "rewards/margins": 10.30474841405475, + "rewards/rejected": -7.781353420681423, + "step": 797 + }, + { + "epoch": 0.2945872363988741, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.143017835210886e-06, + "logits/chosen": 1871879246.7692308, + "logits/rejected": 1869968869.0526316, + "logps/chosen": -294.62950721153845, + "logps/rejected": -540.0745785361842, + "loss": 0.1127, + "rewards/chosen": 1.684869619516226, + "rewards/margins": 9.496207542264992, + "rewards/rejected": -7.811337922748766, + "step": 798 + }, + { + "epoch": 0.29495639333671725, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.138438480368153e-06, + "logits/chosen": 1932779038.1176472, + "logits/rejected": 1321326182.4, + "logps/chosen": -311.02478745404414, + "logps/rejected": -550.4051432291667, + "loss": 0.1587, + "rewards/chosen": 1.3772319344913257, + "rewards/margins": 10.33872994067622, + "rewards/rejected": -8.961498006184895, + "step": 799 + }, + { + "epoch": 0.2953255502745605, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.133854777258779e-06, + "logits/chosen": 1410485589.3333333, + "logits/rejected": 2007218778.3529413, + "logps/chosen": -315.62060546875, + "logps/rejected": -498.88493795955884, + "loss": 0.14, + "rewards/chosen": 1.6581541697184246, + "rewards/margins": 10.358503865260703, + "rewards/rejected": -8.700349695542279, + "step": 800 + }, + { + "epoch": 0.29569470721240365, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.129266732233427e-06, + "logits/chosen": 1715517560.4705882, + "logits/rejected": 1721836202.6666667, + "logps/chosen": -329.07146139705884, + "logps/rejected": -525.2569661458333, + "loss": 0.1602, + "rewards/chosen": 1.6880555994370405, + "rewards/margins": 8.610727616852405, + "rewards/rejected": -6.922672017415365, + "step": 801 + }, + { + "epoch": 0.2960638641502469, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.124674351648773e-06, + "logits/chosen": 1967045339.4285715, + "logits/rejected": 2015654343.1111112, + "logps/chosen": -285.47964913504467, + "logps/rejected": -555.0106336805555, + "loss": 0.1641, + "rewards/chosen": 0.7254183632986886, + "rewards/margins": 9.31349718003046, + "rewards/rejected": -8.588078816731771, + "step": 802 + }, + { + "epoch": 0.29643302108809005, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.120077641867506e-06, + "logits/chosen": 2209987515.733333, + "logits/rejected": 2046963952.9411764, + "logps/chosen": -266.27252604166665, + "logps/rejected": -375.4507697610294, + "loss": 0.1433, + "rewards/chosen": 1.7118882497151693, + "rewards/margins": 7.565687508676566, + "rewards/rejected": -5.853799258961397, + "step": 803 + }, + { + "epoch": 0.2968021780259333, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.115476609258303e-06, + "logits/chosen": 1672854949.6470587, + "logits/rejected": 2294377676.8, + "logps/chosen": -279.0005744485294, + "logps/rejected": -457.2577799479167, + "loss": 0.1472, + "rewards/chosen": 1.9660868925206803, + "rewards/margins": 8.950609394148284, + "rewards/rejected": -6.9845225016276045, + "step": 804 + }, + { + "epoch": 0.29717133496377646, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.110871260195843e-06, + "logits/chosen": 1355694923.2941177, + "logits/rejected": 1886667161.6, + "logps/chosen": -262.04664522058823, + "logps/rejected": -330.3739908854167, + "loss": 0.1819, + "rewards/chosen": 1.5579574809354895, + "rewards/margins": 6.109295168109969, + "rewards/rejected": -4.551337687174479, + "step": 805 + }, + { + "epoch": 0.2975404919016197, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.106261601060773e-06, + "logits/chosen": 1960393581.7142856, + "logits/rejected": 1478986638.2222223, + "logps/chosen": -376.36094447544644, + "logps/rejected": -393.79554578993054, + "loss": 0.1636, + "rewards/chosen": 0.8501204081944057, + "rewards/margins": 7.2051483487326005, + "rewards/rejected": -6.355027940538195, + "step": 806 + }, + { + "epoch": 0.29790964883946286, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.10164763823972e-06, + "logits/chosen": 1776711559.5294118, + "logits/rejected": 2229499767.4666667, + "logps/chosen": -144.30712890625, + "logps/rejected": -537.4246419270834, + "loss": 0.1197, + "rewards/chosen": 1.7147829392377067, + "rewards/margins": 9.895915446561926, + "rewards/rejected": -8.181132507324218, + "step": 807 + }, + { + "epoch": 0.2982788057773061, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.097029378125269e-06, + "logits/chosen": 1782969344.0, + "logits/rejected": 1916704085.3333333, + "logps/chosen": -345.97011021205356, + "logps/rejected": -546.0418836805555, + "loss": 0.1453, + "rewards/chosen": 1.2972239085606165, + "rewards/margins": 7.867170197623117, + "rewards/rejected": -6.5699462890625, + "step": 808 + }, + { + "epoch": 0.29864796271514926, + "grad_norm": 14.5625, + "kl": 0.4725308418273926, + "learning_rate": 8.092406827115964e-06, + "logits/chosen": 1916424045.7142856, + "logits/rejected": 2491431822.2222223, + "logps/chosen": -312.70263671875, + "logps/rejected": -383.3122829861111, + "loss": 0.1935, + "rewards/chosen": 0.5758002826145717, + "rewards/margins": 6.934301716940744, + "rewards/rejected": -6.358501434326172, + "step": 809 + }, + { + "epoch": 0.2990171196529925, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 8.087779991616287e-06, + "logits/chosen": 2305710834.5263157, + "logits/rejected": 1833789597.5384614, + "logps/chosen": -352.52518503289474, + "logps/rejected": -574.2116887019231, + "loss": 0.2213, + "rewards/chosen": 0.8366149099249589, + "rewards/margins": 7.839409028952904, + "rewards/rejected": -7.002794119027945, + "step": 810 + }, + { + "epoch": 0.29938627659083566, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 8.083148878036662e-06, + "logits/chosen": 2242518812.4444447, + "logits/rejected": 1754433097.142857, + "logps/chosen": -306.30088975694446, + "logps/rejected": -525.7042759486607, + "loss": 0.2172, + "rewards/chosen": 1.0050924089219835, + "rewards/margins": 7.88835558815608, + "rewards/rejected": -6.883263179234096, + "step": 811 + }, + { + "epoch": 0.2997554335286789, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.078513492793438e-06, + "logits/chosen": 2229952785.0666666, + "logits/rejected": 1723739678.1176472, + "logps/chosen": -256.3374837239583, + "logps/rejected": -398.6739717371324, + "loss": 0.1617, + "rewards/chosen": 1.688958994547526, + "rewards/margins": 7.8996254266477095, + "rewards/rejected": -6.210666432100184, + "step": 812 + }, + { + "epoch": 0.30012459046652207, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.073873842308882e-06, + "logits/chosen": 1441944507.7333333, + "logits/rejected": 1432007378.8235295, + "logps/chosen": -244.43362630208333, + "logps/rejected": -509.1650965073529, + "loss": 0.1445, + "rewards/chosen": 1.368047332763672, + "rewards/margins": 11.12289513980641, + "rewards/rejected": -9.754847807042738, + "step": 813 + }, + { + "epoch": 0.3004937474043653, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.06922993301117e-06, + "logits/chosen": 1783088696.8888888, + "logits/rejected": 1600717385.142857, + "logps/chosen": -287.3686794704861, + "logps/rejected": -470.32212611607144, + "loss": 0.1665, + "rewards/chosen": 1.4139023886786566, + "rewards/margins": 7.958555599999806, + "rewards/rejected": -6.544653211321149, + "step": 814 + }, + { + "epoch": 0.30086290434220847, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 8.06458177133438e-06, + "logits/chosen": 1862189686.1538463, + "logits/rejected": 1446147772.631579, + "logps/chosen": -336.5436448317308, + "logps/rejected": -431.9814453125, + "loss": 0.1496, + "rewards/chosen": 0.794594251192533, + "rewards/margins": 7.943976189926085, + "rewards/rejected": -7.149381938733552, + "step": 815 + }, + { + "epoch": 0.3012320612800517, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.059929363718482e-06, + "logits/chosen": 1840019797.3333333, + "logits/rejected": 1429795693.7142856, + "logps/chosen": -299.02704535590277, + "logps/rejected": -445.1821986607143, + "loss": 0.1649, + "rewards/chosen": 1.6504359775119357, + "rewards/margins": 8.557581341455853, + "rewards/rejected": -6.9071453639439175, + "step": 816 + }, + { + "epoch": 0.3016012182178949, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.055272716609325e-06, + "logits/chosen": 1079479247.2380953, + "logits/rejected": 1137328128.0, + "logps/chosen": -266.5591750372024, + "logps/rejected": -350.0929509943182, + "loss": 0.1932, + "rewards/chosen": 1.4339239029657274, + "rewards/margins": 7.15955943153018, + "rewards/rejected": -5.725635528564453, + "step": 817 + }, + { + "epoch": 0.3019703751557381, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.050611836458638e-06, + "logits/chosen": 1612715918.2222223, + "logits/rejected": 1363407579.4285715, + "logps/chosen": -285.65643988715277, + "logps/rejected": -465.4964076450893, + "loss": 0.1736, + "rewards/chosen": 1.238120608859592, + "rewards/margins": 8.374668242439391, + "rewards/rejected": -7.1365476335797995, + "step": 818 + }, + { + "epoch": 0.3023395320935813, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 8.045946729724006e-06, + "logits/chosen": 1880778428.631579, + "logits/rejected": 2447008059.076923, + "logps/chosen": -343.3036852384868, + "logps/rejected": -408.2204026442308, + "loss": 0.2144, + "rewards/chosen": 1.2783741198087994, + "rewards/margins": 7.553862166308199, + "rewards/rejected": -6.275488046499399, + "step": 819 + }, + { + "epoch": 0.3027086890314245, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.041277402868881e-06, + "logits/chosen": 2307380565.3333335, + "logits/rejected": 2018702998.5882354, + "logps/chosen": -284.2231770833333, + "logps/rejected": -661.1852022058823, + "loss": 0.1459, + "rewards/chosen": 1.295455805460612, + "rewards/margins": 9.093158370373295, + "rewards/rejected": -7.797702564912684, + "step": 820 + }, + { + "epoch": 0.3030778459692677, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.036603862362553e-06, + "logits/chosen": 1566930944.0, + "logits/rejected": 1995055826.8235295, + "logps/chosen": -259.1030598958333, + "logps/rejected": -517.7414981617648, + "loss": 0.186, + "rewards/chosen": 0.8430644989013671, + "rewards/margins": 9.743824027566347, + "rewards/rejected": -8.900759528664981, + "step": 821 + }, + { + "epoch": 0.3034470029071109, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.031926114680153e-06, + "logits/chosen": 1599134866.2857144, + "logits/rejected": 1836644238.2222223, + "logps/chosen": -302.35414341517856, + "logps/rejected": -446.29462348090277, + "loss": 0.16, + "rewards/chosen": 1.1045301301138741, + "rewards/margins": 7.215257228366912, + "rewards/rejected": -6.110727098253038, + "step": 822 + }, + { + "epoch": 0.3038161598449541, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 8.027244166302641e-06, + "logits/chosen": 1522726754.4615386, + "logits/rejected": 1644102602.1052632, + "logps/chosen": -306.7819260817308, + "logps/rejected": -423.27073910361844, + "loss": 0.1398, + "rewards/chosen": 1.3881590916560247, + "rewards/margins": 8.87909617404706, + "rewards/rejected": -7.490937082391036, + "step": 823 + }, + { + "epoch": 0.3041853167827973, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.022558023716799e-06, + "logits/chosen": 1832794955.2941177, + "logits/rejected": 2229187379.2, + "logps/chosen": -296.2141544117647, + "logps/rejected": -545.9464192708333, + "loss": 0.1632, + "rewards/chosen": 1.3795812270220589, + "rewards/margins": 9.726521839815028, + "rewards/rejected": -8.346940612792968, + "step": 824 + }, + { + "epoch": 0.3045544737206405, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.017867693415214e-06, + "logits/chosen": 1346095206.4, + "logits/rejected": 1339008093.090909, + "logps/chosen": -194.44639892578124, + "logps/rejected": -450.3898259943182, + "loss": 0.1156, + "rewards/chosen": 1.4361581802368164, + "rewards/margins": 8.218214121731844, + "rewards/rejected": -6.782055941495028, + "step": 825 + }, + { + "epoch": 0.3049236306584837, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.013173181896283e-06, + "logits/chosen": 1727281971.2, + "logits/rejected": 2156776448.0, + "logps/chosen": -269.996630859375, + "logps/rejected": -474.599609375, + "loss": 0.1871, + "rewards/chosen": 1.3075427055358886, + "rewards/margins": 8.960203711191813, + "rewards/rejected": -7.652661005655925, + "step": 826 + }, + { + "epoch": 0.3052927875963269, + "grad_norm": 10.4375, + "kl": 0.501070499420166, + "learning_rate": 8.008474495664189e-06, + "logits/chosen": 1383627161.6, + "logits/rejected": 2093605345.8823528, + "logps/chosen": -239.06378580729168, + "logps/rejected": -470.8046300551471, + "loss": 0.1523, + "rewards/chosen": 1.244427490234375, + "rewards/margins": 8.324596719180837, + "rewards/rejected": -7.0801692289464615, + "step": 827 + }, + { + "epoch": 0.3056619445341701, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 8.003771641228905e-06, + "logits/chosen": 1466680506.1818182, + "logits/rejected": 1519476540.952381, + "logps/chosen": -263.35595703125, + "logps/rejected": -413.21805245535717, + "loss": 0.0956, + "rewards/chosen": 1.7336137945001775, + "rewards/margins": 8.510508500136337, + "rewards/rejected": -6.776894705636161, + "step": 828 + }, + { + "epoch": 0.3060311014720133, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 7.999064625106174e-06, + "logits/chosen": 1780798464.0, + "logits/rejected": 1442287820.8, + "logps/chosen": -311.51849365234375, + "logps/rejected": -468.090087890625, + "loss": 0.1556, + "rewards/chosen": 0.6209770838419596, + "rewards/margins": 7.846309725443523, + "rewards/rejected": -7.225332641601563, + "step": 829 + }, + { + "epoch": 0.30640025840985646, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 7.994353453817508e-06, + "logits/chosen": 1567426304.0, + "logits/rejected": 1793480192.0, + "logps/chosen": -272.9121398925781, + "logps/rejected": -472.41534423828125, + "loss": 0.1434, + "rewards/chosen": 1.765087604522705, + "rewards/margins": 8.643160343170166, + "rewards/rejected": -6.878072738647461, + "step": 830 + }, + { + "epoch": 0.3067694153476997, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.989638133890174e-06, + "logits/chosen": 1846025830.4, + "logits/rejected": 1813832797.090909, + "logps/chosen": -278.8658203125, + "logps/rejected": -466.7374378551136, + "loss": 0.1349, + "rewards/chosen": 0.49846348762512205, + "rewards/margins": 7.225576691194013, + "rewards/rejected": -6.727113203568892, + "step": 831 + }, + { + "epoch": 0.30713857228554287, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.984918671857189e-06, + "logits/chosen": 1748837841.4545455, + "logits/rejected": 1713967104.0, + "logps/chosen": -343.68430397727275, + "logps/rejected": -489.34351748511904, + "loss": 0.1482, + "rewards/chosen": 0.45304398103193805, + "rewards/margins": 9.26828620340917, + "rewards/rejected": -8.815242222377233, + "step": 832 + }, + { + "epoch": 0.3075077292233861, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 7.980195074257307e-06, + "logits/chosen": 1777100913.7777777, + "logits/rejected": 1371332900.5714285, + "logps/chosen": -311.48670789930554, + "logps/rejected": -501.6962890625, + "loss": 0.2414, + "rewards/chosen": 0.5426774024963379, + "rewards/margins": 7.749609470367432, + "rewards/rejected": -7.206932067871094, + "step": 833 + }, + { + "epoch": 0.30787688616122927, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 7.975467347635012e-06, + "logits/chosen": 2009840896.0, + "logits/rejected": 2519991040.0, + "logps/chosen": -385.7723083496094, + "logps/rejected": -499.6424560546875, + "loss": 0.2121, + "rewards/chosen": 0.44561702013015747, + "rewards/margins": 9.27849131822586, + "rewards/rejected": -8.832874298095703, + "step": 834 + }, + { + "epoch": 0.3082460430990725, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 7.97073549854051e-06, + "logits/chosen": 1729585044.2105262, + "logits/rejected": 2396147396.923077, + "logps/chosen": -279.35570004111844, + "logps/rejected": -504.0295973557692, + "loss": 0.1944, + "rewards/chosen": 1.1528539155658923, + "rewards/margins": 9.089363268029835, + "rewards/rejected": -7.9365093524639425, + "step": 835 + }, + { + "epoch": 0.30861520003691567, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 7.965999533529718e-06, + "logits/chosen": 1602250752.0, + "logits/rejected": 1625722880.0, + "logps/chosen": -312.43860677083336, + "logps/rejected": -445.0725528492647, + "loss": 0.1557, + "rewards/chosen": 0.9785322825113932, + "rewards/margins": 8.30315673678529, + "rewards/rejected": -7.324624454273897, + "step": 836 + }, + { + "epoch": 0.3089843569747589, + "grad_norm": 13.5, + "kl": 0.8632402420043945, + "learning_rate": 7.961259459164254e-06, + "logits/chosen": 2449136298.6666665, + "logits/rejected": 1711760530.2857144, + "logps/chosen": -302.5050455729167, + "logps/rejected": -364.52064732142856, + "loss": 0.1564, + "rewards/chosen": 1.5189595752292209, + "rewards/margins": 7.422852092319065, + "rewards/rejected": -5.903892517089844, + "step": 837 + }, + { + "epoch": 0.3093535139126021, + "grad_norm": 8.625, + "kl": 0.1173563003540039, + "learning_rate": 7.956515282011434e-06, + "logits/chosen": 1912410726.4, + "logits/rejected": 1991198161.4545455, + "logps/chosen": -261.2768310546875, + "logps/rejected": -506.15598366477275, + "loss": 0.1034, + "rewards/chosen": 1.3286043167114259, + "rewards/margins": 9.454141148653896, + "rewards/rejected": -8.12553683194247, + "step": 838 + }, + { + "epoch": 0.3097226708504453, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.951767008644251e-06, + "logits/chosen": 1904186853.0526316, + "logits/rejected": 2634485760.0, + "logps/chosen": -304.5488024259868, + "logps/rejected": -420.5614483173077, + "loss": 0.1538, + "rewards/chosen": 1.6742320813630756, + "rewards/margins": 8.151954249331826, + "rewards/rejected": -6.47772216796875, + "step": 839 + }, + { + "epoch": 0.3100918277882885, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.94701464564138e-06, + "logits/chosen": 1815226624.0, + "logits/rejected": 1394356096.0, + "logps/chosen": -254.62367248535156, + "logps/rejected": -513.35986328125, + "loss": 0.1686, + "rewards/chosen": 1.2715548276901245, + "rewards/margins": 8.247064471244812, + "rewards/rejected": -6.9755096435546875, + "step": 840 + }, + { + "epoch": 0.3104609847261317, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.942258199587158e-06, + "logits/chosen": 1590144682.6666667, + "logits/rejected": 2149934811.428571, + "logps/chosen": -234.60926649305554, + "logps/rejected": -454.6900111607143, + "loss": 0.1761, + "rewards/chosen": 1.2197287877400715, + "rewards/margins": 8.611499286833263, + "rewards/rejected": -7.391770499093192, + "step": 841 + }, + { + "epoch": 0.3108301416639749, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 7.937497677071583e-06, + "logits/chosen": 1759271680.0, + "logits/rejected": 1243496704.0, + "logps/chosen": -346.13360595703125, + "logps/rejected": -441.9634704589844, + "loss": 0.1619, + "rewards/chosen": 1.4510995149612427, + "rewards/margins": 9.325560688972473, + "rewards/rejected": -7.8744611740112305, + "step": 842 + }, + { + "epoch": 0.3111992986018181, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 7.932733084690296e-06, + "logits/chosen": 2695615283.2, + "logits/rejected": 2047527273.4117646, + "logps/chosen": -242.93645833333332, + "logps/rejected": -499.9755284926471, + "loss": 0.154, + "rewards/chosen": 1.401041030883789, + "rewards/margins": 9.020968829884248, + "rewards/rejected": -7.619927799000459, + "step": 843 + }, + { + "epoch": 0.3115684555396613, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.92796442904458e-06, + "logits/chosen": 1695778816.0, + "logits/rejected": 2190669824.0, + "logps/chosen": -221.9394287109375, + "logps/rejected": -522.3164469401041, + "loss": 0.2014, + "rewards/chosen": 1.383891487121582, + "rewards/margins": 9.715642992655436, + "rewards/rejected": -8.331751505533854, + "step": 844 + }, + { + "epoch": 0.3119376124775045, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.923191716741348e-06, + "logits/chosen": 1731405092.5714285, + "logits/rejected": 1420649472.0, + "logps/chosen": -256.72764369419644, + "logps/rejected": -493.5515407986111, + "loss": 0.125, + "rewards/chosen": 1.5650439943586076, + "rewards/margins": 9.874180279080829, + "rewards/rejected": -8.309136284722221, + "step": 845 + }, + { + "epoch": 0.3123067694153477, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.91841495439313e-06, + "logits/chosen": 2037930752.0, + "logits/rejected": 1181810432.0, + "logps/chosen": -313.9730224609375, + "logps/rejected": -426.9949951171875, + "loss": 0.117, + "rewards/chosen": 1.6640669107437134, + "rewards/margins": 9.747782349586487, + "rewards/rejected": -8.083715438842773, + "step": 846 + }, + { + "epoch": 0.3126759263531909, + "grad_norm": 12.0625, + "kl": 1.4579029083251953, + "learning_rate": 7.913634148618073e-06, + "logits/chosen": 2648354575.0588236, + "logits/rejected": 2590585651.2, + "logps/chosen": -299.03733915441177, + "logps/rejected": -597.1313151041667, + "loss": 0.1463, + "rewards/chosen": 1.845749350155101, + "rewards/margins": 10.176253703996247, + "rewards/rejected": -8.330504353841146, + "step": 847 + }, + { + "epoch": 0.3130450832910341, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.908849306039918e-06, + "logits/chosen": 1793597224.4210527, + "logits/rejected": 1987188578.4615386, + "logps/chosen": -211.62826377467104, + "logps/rejected": -452.23888221153845, + "loss": 0.1447, + "rewards/chosen": 1.88387037578382, + "rewards/margins": 9.43273019211495, + "rewards/rejected": -7.54885981633113, + "step": 848 + }, + { + "epoch": 0.3134142402288773, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.904060433288007e-06, + "logits/chosen": 1583498368.0, + "logits/rejected": 1967017984.0, + "logps/chosen": -345.40704345703125, + "logps/rejected": -496.48956298828125, + "loss": 0.153, + "rewards/chosen": 1.4391528367996216, + "rewards/margins": 8.345987915992737, + "rewards/rejected": -6.906835079193115, + "step": 849 + }, + { + "epoch": 0.3137833971667205, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 7.899267536997261e-06, + "logits/chosen": 1980651520.0, + "logits/rejected": 1958105088.0, + "logps/chosen": -283.6163330078125, + "logps/rejected": -461.9182400173611, + "loss": 0.1873, + "rewards/chosen": 0.6477373668125698, + "rewards/margins": 10.040383308652848, + "rewards/rejected": -9.392645941840279, + "step": 850 + }, + { + "epoch": 0.3141525541045637, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 7.894470623808176e-06, + "logits/chosen": 1569617646.9333334, + "logits/rejected": 1806889321.4117646, + "logps/chosen": -289.48268229166666, + "logps/rejected": -545.7815946691177, + "loss": 0.1845, + "rewards/chosen": 0.7846832911173502, + "rewards/margins": 8.969135123608158, + "rewards/rejected": -8.184451832490808, + "step": 851 + }, + { + "epoch": 0.3145217110424069, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 7.889669700366818e-06, + "logits/chosen": 2559108096.0, + "logits/rejected": 1608796672.0, + "logps/chosen": -310.5573425292969, + "logps/rejected": -481.6836853027344, + "loss": 0.1616, + "rewards/chosen": 1.2778233289718628, + "rewards/margins": 8.877211928367615, + "rewards/rejected": -7.599388599395752, + "step": 852 + }, + { + "epoch": 0.3148908679802501, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.884864773324802e-06, + "logits/chosen": 1995534882.1333334, + "logits/rejected": 1424070053.6470587, + "logps/chosen": -300.98522135416664, + "logps/rejected": -409.033203125, + "loss": 0.1691, + "rewards/chosen": 0.9390234629313151, + "rewards/margins": 7.236901503918218, + "rewards/rejected": -6.297878040986903, + "step": 853 + }, + { + "epoch": 0.3152600249180933, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.880055849339294e-06, + "logits/chosen": 1757543966.1176472, + "logits/rejected": 1802464324.2666667, + "logps/chosen": -269.2121151194853, + "logps/rejected": -487.03040364583336, + "loss": 0.2151, + "rewards/chosen": 0.9384696062873391, + "rewards/margins": 8.705585898605047, + "rewards/rejected": -7.767116292317708, + "step": 854 + }, + { + "epoch": 0.3156291818559365, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.875242935073e-06, + "logits/chosen": 1987477744.9411764, + "logits/rejected": 2268258577.0666666, + "logps/chosen": -234.45444623161765, + "logps/rejected": -569.7572916666667, + "loss": 0.1736, + "rewards/chosen": 1.1210244122673483, + "rewards/margins": 9.779857216629328, + "rewards/rejected": -8.65883280436198, + "step": 855 + }, + { + "epoch": 0.3159983387937797, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.870426037194146e-06, + "logits/chosen": 2192152439.4666667, + "logits/rejected": 1715654535.5294118, + "logps/chosen": -282.333203125, + "logps/rejected": -556.7888901654412, + "loss": 0.1325, + "rewards/chosen": 1.3768541971842447, + "rewards/margins": 8.7076467925427, + "rewards/rejected": -7.330792595358456, + "step": 856 + }, + { + "epoch": 0.3163674957316229, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.865605162376485e-06, + "logits/chosen": 2648439125.3333335, + "logits/rejected": 2109074432.0, + "logps/chosen": -255.0826416015625, + "logps/rejected": -610.97919921875, + "loss": 0.1604, + "rewards/chosen": 0.5252934296925863, + "rewards/margins": 9.420483096440634, + "rewards/rejected": -8.895189666748047, + "step": 857 + }, + { + "epoch": 0.3167366526694661, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 7.860780317299282e-06, + "logits/chosen": 1570806000.9411764, + "logits/rejected": 2079949482.6666667, + "logps/chosen": -317.6905158547794, + "logps/rejected": -551.1836588541667, + "loss": 0.2373, + "rewards/chosen": 0.40599890316233916, + "rewards/margins": 8.36161922380036, + "rewards/rejected": -7.955620320638021, + "step": 858 + }, + { + "epoch": 0.31710580960730933, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.855951508647295e-06, + "logits/chosen": 1765538884.2666667, + "logits/rejected": 1681512086.5882354, + "logps/chosen": -293.0960286458333, + "logps/rejected": -579.9281939338235, + "loss": 0.1577, + "rewards/chosen": 1.0512412389119465, + "rewards/margins": 10.966892448126101, + "rewards/rejected": -9.915651209214154, + "step": 859 + }, + { + "epoch": 0.3174749665451525, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.851118743110774e-06, + "logits/chosen": 1880508825.6, + "logits/rejected": 1363168798.1176472, + "logps/chosen": -259.2115234375, + "logps/rejected": -414.41096047794116, + "loss": 0.1456, + "rewards/chosen": 1.181839116414388, + "rewards/margins": 7.924150033090628, + "rewards/rejected": -6.7423109166762405, + "step": 860 + }, + { + "epoch": 0.31784412348299573, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 7.846282027385462e-06, + "logits/chosen": 1934284361.142857, + "logits/rejected": 1841744099.5555556, + "logps/chosen": -325.43289620535717, + "logps/rejected": -557.28515625, + "loss": 0.1751, + "rewards/chosen": 0.7499315398080009, + "rewards/margins": 9.445112644679963, + "rewards/rejected": -8.695181104871962, + "step": 861 + }, + { + "epoch": 0.3182132804208389, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.841441368172559e-06, + "logits/chosen": 1720026794.6666667, + "logits/rejected": 2204854476.8, + "logps/chosen": -285.39902750651044, + "logps/rejected": -625.31611328125, + "loss": 0.1504, + "rewards/chosen": 0.6516085465749105, + "rewards/margins": 9.230434687932332, + "rewards/rejected": -8.578826141357421, + "step": 862 + }, + { + "epoch": 0.31858243735868214, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 7.836596772178741e-06, + "logits/chosen": 1264582440.4210527, + "logits/rejected": 1422821218.4615386, + "logps/chosen": -264.1763466282895, + "logps/rejected": -508.7086838942308, + "loss": 0.1399, + "rewards/chosen": 1.973711515727796, + "rewards/margins": 9.603615455781883, + "rewards/rejected": -7.629903940054087, + "step": 863 + }, + { + "epoch": 0.3189515942965253, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.831748246116136e-06, + "logits/chosen": 1554434867.2, + "logits/rejected": 1591946541.1764705, + "logps/chosen": -308.05983072916666, + "logps/rejected": -436.9375, + "loss": 0.1631, + "rewards/chosen": 1.367718505859375, + "rewards/margins": 8.272011341768152, + "rewards/rejected": -6.904292835908778, + "step": 864 + }, + { + "epoch": 0.31932075123436854, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.826895796702311e-06, + "logits/chosen": 1430278451.2, + "logits/rejected": 1330676821.3333333, + "logps/chosen": -269.13447265625, + "logps/rejected": -535.0544026692709, + "loss": 0.1515, + "rewards/chosen": 1.753652000427246, + "rewards/margins": 12.64719778696696, + "rewards/rejected": -10.893545786539713, + "step": 865 + }, + { + "epoch": 0.3196899081722117, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.822039430660276e-06, + "logits/chosen": 1734446464.0, + "logits/rejected": 1881682688.0, + "logps/chosen": -254.1585693359375, + "logps/rejected": -429.14788818359375, + "loss": 0.175, + "rewards/chosen": 1.269014835357666, + "rewards/margins": 8.75364065170288, + "rewards/rejected": -7.484625816345215, + "step": 866 + }, + { + "epoch": 0.3200590651100549, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 7.817179154718463e-06, + "logits/chosen": 1960170496.0, + "logits/rejected": 1939534233.6, + "logps/chosen": -273.4120279947917, + "logps/rejected": -593.1083984375, + "loss": 0.136, + "rewards/chosen": 1.4268786112467449, + "rewards/margins": 9.494739786783853, + "rewards/rejected": -8.067861175537109, + "step": 867 + }, + { + "epoch": 0.3204282220478981, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.812314975610722e-06, + "logits/chosen": 1603350698.6666667, + "logits/rejected": 2280404992.0, + "logps/chosen": -323.5094807942708, + "logps/rejected": -501.25146484375, + "loss": 0.1175, + "rewards/chosen": 1.5535982449849446, + "rewards/margins": 10.187495644887289, + "rewards/rejected": -8.633897399902343, + "step": 868 + }, + { + "epoch": 0.3207973789857413, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.807446900076314e-06, + "logits/chosen": 1788359065.6, + "logits/rejected": 1520271872.0, + "logps/chosen": -290.7920166015625, + "logps/rejected": -464.9145914713542, + "loss": 0.2118, + "rewards/chosen": 1.0742314338684082, + "rewards/margins": 8.150565433502198, + "rewards/rejected": -7.076333999633789, + "step": 869 + }, + { + "epoch": 0.3211665359235845, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 7.802574934859894e-06, + "logits/chosen": 2170383291.733333, + "logits/rejected": 2172929927.529412, + "logps/chosen": -351.991015625, + "logps/rejected": -413.9040958180147, + "loss": 0.1894, + "rewards/chosen": 0.7881627400716146, + "rewards/margins": 6.570350706811045, + "rewards/rejected": -5.78218796673943, + "step": 870 + }, + { + "epoch": 0.3215356928614277, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.797699086711507e-06, + "logits/chosen": 1279808365.7142856, + "logits/rejected": 1880706730.6666667, + "logps/chosen": -274.23985072544644, + "logps/rejected": -457.2399088541667, + "loss": 0.1414, + "rewards/chosen": 1.2010801860264368, + "rewards/margins": 9.530923495216975, + "rewards/rejected": -8.329843309190538, + "step": 871 + }, + { + "epoch": 0.3219048497992709, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.792819362386581e-06, + "logits/chosen": 2444352275.6923075, + "logits/rejected": 2515995378.5263157, + "logps/chosen": -285.77723106971155, + "logps/rejected": -499.4429481907895, + "loss": 0.1614, + "rewards/chosen": 0.6998635071974534, + "rewards/margins": 8.135114040451976, + "rewards/rejected": -7.435250533254523, + "step": 872 + }, + { + "epoch": 0.3222740067371141, + "grad_norm": 13.75, + "kl": 0.2538723945617676, + "learning_rate": 7.78793576864591e-06, + "logits/chosen": 2022321212.235294, + "logits/rejected": 2103823701.3333333, + "logps/chosen": -361.94921875, + "logps/rejected": -485.2930013020833, + "loss": 0.1823, + "rewards/chosen": 1.061662898344152, + "rewards/margins": 8.366700333239987, + "rewards/rejected": -7.305037434895834, + "step": 873 + }, + { + "epoch": 0.3226431636749573, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.783048312255653e-06, + "logits/chosen": 1785292344.8888888, + "logits/rejected": 1668517595.4285715, + "logps/chosen": -360.0162760416667, + "logps/rejected": -456.99107142857144, + "loss": 0.1947, + "rewards/chosen": 1.303312725490994, + "rewards/margins": 8.372472641960023, + "rewards/rejected": -7.069159916469029, + "step": 874 + }, + { + "epoch": 0.3230123206128005, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.778156999987317e-06, + "logits/chosen": 2182021734.4, + "logits/rejected": 1793286485.3333333, + "logps/chosen": -290.0043212890625, + "logps/rejected": -436.28271484375, + "loss": 0.1927, + "rewards/chosen": 1.5033204078674316, + "rewards/margins": 8.524445565541585, + "rewards/rejected": -7.021125157674153, + "step": 875 + }, + { + "epoch": 0.3233814775506437, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.773261838617753e-06, + "logits/chosen": 1628498602.6666667, + "logits/rejected": 2125614381.1764705, + "logps/chosen": -260.090625, + "logps/rejected": -474.8009823069853, + "loss": 0.1342, + "rewards/chosen": 1.50275510152181, + "rewards/margins": 9.636059473075118, + "rewards/rejected": -8.133304371553308, + "step": 876 + }, + { + "epoch": 0.3237506344884869, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 7.768362834929146e-06, + "logits/chosen": 1198572885.3333333, + "logits/rejected": 1701367326.1176472, + "logps/chosen": -266.14222005208336, + "logps/rejected": -409.30612362132354, + "loss": 0.1193, + "rewards/chosen": 1.7067952473958334, + "rewards/margins": 8.17200341318168, + "rewards/rejected": -6.465208165785846, + "step": 877 + }, + { + "epoch": 0.32411979142633013, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 7.763459995709004e-06, + "logits/chosen": 2087007573.3333333, + "logits/rejected": 1960215347.2, + "logps/chosen": -333.076171875, + "logps/rejected": -416.950634765625, + "loss": 0.1421, + "rewards/chosen": 1.0731836954752605, + "rewards/margins": 8.53884531656901, + "rewards/rejected": -7.46566162109375, + "step": 878 + }, + { + "epoch": 0.3244889483641733, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.758553327750146e-06, + "logits/chosen": 2049599146.6666667, + "logits/rejected": 1973374566.4, + "logps/chosen": -306.4269612630208, + "logps/rejected": -515.5177734375, + "loss": 0.118, + "rewards/chosen": 1.1513862609863281, + "rewards/margins": 8.792795562744141, + "rewards/rejected": -7.641409301757813, + "step": 879 + }, + { + "epoch": 0.32485810530201653, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.753642837850698e-06, + "logits/chosen": 2004762168.8888888, + "logits/rejected": 1707413796.5714285, + "logps/chosen": -308.2451443142361, + "logps/rejected": -402.8251953125, + "loss": 0.1414, + "rewards/chosen": 1.7647307713826497, + "rewards/margins": 8.60278270358131, + "rewards/rejected": -6.838051932198661, + "step": 880 + }, + { + "epoch": 0.3252272622398597, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.748728532814087e-06, + "logits/chosen": 1987456236.3076923, + "logits/rejected": 1560770128.8421052, + "logps/chosen": -292.4107196514423, + "logps/rejected": -505.78818873355266, + "loss": 0.1189, + "rewards/chosen": 1.4480360471285307, + "rewards/margins": 9.257310820977215, + "rewards/rejected": -7.809274773848684, + "step": 881 + }, + { + "epoch": 0.32559641917770293, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 7.743810419449014e-06, + "logits/chosen": 1880472234.6666667, + "logits/rejected": 1317931287.2727273, + "logps/chosen": -306.9309895833333, + "logps/rejected": -313.5305841619318, + "loss": 0.2248, + "rewards/chosen": 1.0016085306803386, + "rewards/margins": 6.9947531729033505, + "rewards/rejected": -5.993144642223012, + "step": 882 + }, + { + "epoch": 0.3259655761155461, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.738888504569463e-06, + "logits/chosen": 1755457280.0, + "logits/rejected": 1859082624.0, + "logps/chosen": -282.9569091796875, + "logps/rejected": -484.5386962890625, + "loss": 0.1622, + "rewards/chosen": 1.0370687246322632, + "rewards/margins": 8.921853423118591, + "rewards/rejected": -7.884784698486328, + "step": 883 + }, + { + "epoch": 0.32633473305338934, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 7.733962794994689e-06, + "logits/chosen": 1581197507.047619, + "logits/rejected": 1897428061.090909, + "logps/chosen": -310.4190383184524, + "logps/rejected": -431.15163352272725, + "loss": 0.2255, + "rewards/chosen": 1.6414188203357516, + "rewards/margins": 8.294922238304501, + "rewards/rejected": -6.65350341796875, + "step": 884 + }, + { + "epoch": 0.3267038899912325, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 7.729033297549195e-06, + "logits/chosen": 1954659942.4, + "logits/rejected": 1719486277.8181818, + "logps/chosen": -291.13603515625, + "logps/rejected": -496.1344549005682, + "loss": 0.0977, + "rewards/chosen": 1.359393310546875, + "rewards/margins": 9.567505160245029, + "rewards/rejected": -8.208111849698154, + "step": 885 + }, + { + "epoch": 0.32707304692907574, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.724100019062739e-06, + "logits/chosen": 1463232619.7894738, + "logits/rejected": 2030309376.0, + "logps/chosen": -318.97977487664474, + "logps/rejected": -437.8596379206731, + "loss": 0.1802, + "rewards/chosen": 1.2761028691342002, + "rewards/margins": 8.760137480762806, + "rewards/rejected": -7.484034611628606, + "step": 886 + }, + { + "epoch": 0.3274422038669189, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 7.719162966370318e-06, + "logits/chosen": 1773986048.0, + "logits/rejected": 2529993472.0, + "logps/chosen": -309.1400146484375, + "logps/rejected": -520.2398071289062, + "loss": 0.1693, + "rewards/chosen": 0.8205697536468506, + "rewards/margins": 9.172093152999878, + "rewards/rejected": -8.351523399353027, + "step": 887 + }, + { + "epoch": 0.32781136080476214, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 7.714222146312151e-06, + "logits/chosen": 1725058141.090909, + "logits/rejected": 1580771523.047619, + "logps/chosen": -266.48970170454544, + "logps/rejected": -410.1572730654762, + "loss": 0.0769, + "rewards/chosen": 1.9124239141290837, + "rewards/margins": 8.947156551080349, + "rewards/rejected": -7.034732636951265, + "step": 888 + }, + { + "epoch": 0.3281805177426053, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.709277565733686e-06, + "logits/chosen": 2168832945.230769, + "logits/rejected": 1874427041.6842105, + "logps/chosen": -333.0720402644231, + "logps/rejected": -440.02451685855266, + "loss": 0.1308, + "rewards/chosen": 1.128370578472431, + "rewards/margins": 8.275280593377857, + "rewards/rejected": -7.146910014905427, + "step": 889 + }, + { + "epoch": 0.32854967468044854, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.704329231485576e-06, + "logits/chosen": 1901655040.0, + "logits/rejected": 2078494976.0, + "logps/chosen": -250.25067138671875, + "logps/rejected": -620.6270141601562, + "loss": 0.1702, + "rewards/chosen": 0.9859291911125183, + "rewards/margins": 10.087418258190155, + "rewards/rejected": -9.101489067077637, + "step": 890 + }, + { + "epoch": 0.3289188316182917, + "grad_norm": 12.625, + "kl": 0.6705756187438965, + "learning_rate": 7.699377150423673e-06, + "logits/chosen": 1901219960.4705882, + "logits/rejected": 1919831381.3333333, + "logps/chosen": -271.7267635569853, + "logps/rejected": -413.4860026041667, + "loss": 0.176, + "rewards/chosen": 1.210100061753217, + "rewards/margins": 7.839382695216759, + "rewards/rejected": -6.629282633463542, + "step": 891 + }, + { + "epoch": 0.32928798855613495, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 7.69442132940902e-06, + "logits/chosen": 1952762013.5384614, + "logits/rejected": 1612072421.0526316, + "logps/chosen": -256.74569936899036, + "logps/rejected": -579.6266961348684, + "loss": 0.1164, + "rewards/chosen": 1.5467969454251802, + "rewards/margins": 10.161844199485625, + "rewards/rejected": -8.615047254060444, + "step": 892 + }, + { + "epoch": 0.3296571454939781, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.689461775307852e-06, + "logits/chosen": 1823993514.6666667, + "logits/rejected": 1844002816.0, + "logps/chosen": -370.154541015625, + "logps/rejected": -476.57841796875, + "loss": 0.0902, + "rewards/chosen": 2.6579599380493164, + "rewards/margins": 9.21244831085205, + "rewards/rejected": -6.5544883728027346, + "step": 893 + }, + { + "epoch": 0.33002630243182135, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 7.684498494991562e-06, + "logits/chosen": 1833489261.7142856, + "logits/rejected": 2251695217.7777777, + "logps/chosen": -282.82137625558033, + "logps/rejected": -568.4927300347222, + "loss": 0.1838, + "rewards/chosen": 0.5352809088570731, + "rewards/margins": 8.542147363935198, + "rewards/rejected": -8.006866455078125, + "step": 894 + }, + { + "epoch": 0.3303954593696645, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.679531495336712e-06, + "logits/chosen": 1223900452.5714285, + "logits/rejected": 1317417756.4444444, + "logps/chosen": -280.4718540736607, + "logps/rejected": -508.79839409722223, + "loss": 0.1245, + "rewards/chosen": 1.3638677597045898, + "rewards/margins": 10.956251250372993, + "rewards/rejected": -9.592383490668404, + "step": 895 + }, + { + "epoch": 0.33076461630750775, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 7.674560783225018e-06, + "logits/chosen": 1593461930.6666667, + "logits/rejected": 1526610432.0, + "logps/chosen": -330.1383870442708, + "logps/rejected": -491.8966796875, + "loss": 0.1423, + "rewards/chosen": 0.7845905621846517, + "rewards/margins": 7.885630448659261, + "rewards/rejected": -7.101039886474609, + "step": 896 + }, + { + "epoch": 0.3311337732453509, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.669586365543342e-06, + "logits/chosen": 1434181511.5294118, + "logits/rejected": 1812982715.7333333, + "logps/chosen": -265.90593405330884, + "logps/rejected": -376.2314778645833, + "loss": 0.1248, + "rewards/chosen": 1.6857174144071692, + "rewards/margins": 8.931303046731388, + "rewards/rejected": -7.245585632324219, + "step": 897 + }, + { + "epoch": 0.33150293018319416, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.66460824918367e-06, + "logits/chosen": 1352985031.1111112, + "logits/rejected": 1459286747.4285715, + "logps/chosen": -284.2196994357639, + "logps/rejected": -536.5718819754464, + "loss": 0.1559, + "rewards/chosen": 1.8418197631835938, + "rewards/margins": 10.688632420131139, + "rewards/rejected": -8.846812656947545, + "step": 898 + }, + { + "epoch": 0.33187208712103733, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 7.659626441043125e-06, + "logits/chosen": 1665285558.857143, + "logits/rejected": 1617629184.0, + "logps/chosen": -248.09040178571428, + "logps/rejected": -539.4910333806819, + "loss": 0.2342, + "rewards/chosen": 0.9018145061674572, + "rewards/margins": 9.706793310322286, + "rewards/rejected": -8.80497880415483, + "step": 899 + }, + { + "epoch": 0.33224124405888056, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.654640948023934e-06, + "logits/chosen": 1655850325.3333333, + "logits/rejected": 1784091238.4, + "logps/chosen": -344.3448486328125, + "logps/rejected": -526.92978515625, + "loss": 0.0933, + "rewards/chosen": 1.5509324073791504, + "rewards/margins": 10.57110071182251, + "rewards/rejected": -9.02016830444336, + "step": 900 + }, + { + "epoch": 0.33261040099672373, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 7.649651777033438e-06, + "logits/chosen": 2249472819.2, + "logits/rejected": 2201947306.6666665, + "logps/chosen": -343.315673828125, + "logps/rejected": -493.31298828125, + "loss": 0.246, + "rewards/chosen": 1.0787864685058595, + "rewards/margins": 6.9987528483072925, + "rewards/rejected": -5.919966379801433, + "step": 901 + }, + { + "epoch": 0.3329795579345669, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.644658934984066e-06, + "logits/chosen": 2456276992.0, + "logits/rejected": 1881527523.5555556, + "logps/chosen": -244.56180245535714, + "logps/rejected": -487.82329644097223, + "loss": 0.1489, + "rewards/chosen": 0.9881204196384975, + "rewards/margins": 9.73927584905473, + "rewards/rejected": -8.751155429416233, + "step": 902 + }, + { + "epoch": 0.33334871487241013, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 7.639662428793342e-06, + "logits/chosen": 1487158091.2941177, + "logits/rejected": 1611816140.8, + "logps/chosen": -278.10572725183823, + "logps/rejected": -420.52623697916664, + "loss": 0.2118, + "rewards/chosen": 1.27820385203642, + "rewards/margins": 8.86293285594267, + "rewards/rejected": -7.58472900390625, + "step": 903 + }, + { + "epoch": 0.3337178718102533, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 7.634662265383858e-06, + "logits/chosen": 1662423235.047619, + "logits/rejected": 1726442402.909091, + "logps/chosen": -291.7034505208333, + "logps/rejected": -438.85129616477275, + "loss": 0.2083, + "rewards/chosen": 1.4665750776018416, + "rewards/margins": 8.466131185556387, + "rewards/rejected": -6.999556107954546, + "step": 904 + }, + { + "epoch": 0.33408702874809654, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.62965845168328e-06, + "logits/chosen": 1726058868.3636363, + "logits/rejected": 2354392268.8, + "logps/chosen": -215.2125799005682, + "logps/rejected": -506.12451171875, + "loss": 0.2494, + "rewards/chosen": 0.8305882540616122, + "rewards/margins": 7.739098080721768, + "rewards/rejected": -6.908509826660156, + "step": 905 + }, + { + "epoch": 0.3344561856859397, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 7.624650994624325e-06, + "logits/chosen": 1767791138.1333334, + "logits/rejected": 2375940216.470588, + "logps/chosen": -293.40514322916664, + "logps/rejected": -447.08375459558823, + "loss": 0.1658, + "rewards/chosen": 0.9302012125651041, + "rewards/margins": 8.82211243872549, + "rewards/rejected": -7.891911226160386, + "step": 906 + }, + { + "epoch": 0.33482534262378294, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 7.619639901144764e-06, + "logits/chosen": 1255263865.9047618, + "logits/rejected": 2391726266.181818, + "logps/chosen": -249.60291108630952, + "logps/rejected": -515.1480823863636, + "loss": 0.16, + "rewards/chosen": 1.883026849655878, + "rewards/margins": 8.997358181775907, + "rewards/rejected": -7.114331332120028, + "step": 907 + }, + { + "epoch": 0.3351944995616261, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.614625178187402e-06, + "logits/chosen": 2859116905.4117646, + "logits/rejected": 2164651076.266667, + "logps/chosen": -262.6431525735294, + "logps/rejected": -525.9163736979167, + "loss": 0.1595, + "rewards/chosen": 1.163559408748851, + "rewards/margins": 8.732710206274891, + "rewards/rejected": -7.569150797526041, + "step": 908 + }, + { + "epoch": 0.33556365649946934, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.609606832700074e-06, + "logits/chosen": 1687007744.0, + "logits/rejected": 1782651904.0, + "logps/chosen": -276.0148010253906, + "logps/rejected": -471.23260498046875, + "loss": 0.2142, + "rewards/chosen": 0.4478931128978729, + "rewards/margins": 8.311402767896652, + "rewards/rejected": -7.863509654998779, + "step": 909 + }, + { + "epoch": 0.3359328134373125, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.604584871635634e-06, + "logits/chosen": 2069287040.0, + "logits/rejected": 1681955328.0, + "logps/chosen": -333.0334777832031, + "logps/rejected": -435.6813659667969, + "loss": 0.1716, + "rewards/chosen": 0.8769656419754028, + "rewards/margins": 8.55918037891388, + "rewards/rejected": -7.682214736938477, + "step": 910 + }, + { + "epoch": 0.33630197037515575, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.5995593019519444e-06, + "logits/chosen": 1722787367.3846154, + "logits/rejected": 1900985290.1052632, + "logps/chosen": -298.0274000901442, + "logps/rejected": -489.00956003289474, + "loss": 0.1875, + "rewards/chosen": 0.6020212173461914, + "rewards/margins": 9.469590187072754, + "rewards/rejected": -8.867568969726562, + "step": 911 + }, + { + "epoch": 0.3366711273129989, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.59453013061187e-06, + "logits/chosen": 1690889637.6470587, + "logits/rejected": 1660935918.9333334, + "logps/chosen": -298.0385167738971, + "logps/rejected": -337.87568359375, + "loss": 0.1988, + "rewards/chosen": 1.154226639691521, + "rewards/margins": 8.371024531944125, + "rewards/rejected": -7.216797892252604, + "step": 912 + }, + { + "epoch": 0.33704028425084215, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.589497364583263e-06, + "logits/chosen": 1773031664.9411764, + "logits/rejected": 1574099763.2, + "logps/chosen": -264.55506089154414, + "logps/rejected": -505.86266276041664, + "loss": 0.1831, + "rewards/chosen": 0.8982828925637638, + "rewards/margins": 9.883475763657513, + "rewards/rejected": -8.98519287109375, + "step": 913 + }, + { + "epoch": 0.3374094411886853, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 7.5844610108389546e-06, + "logits/chosen": 1447904460.8, + "logits/rejected": 1668153514.6666667, + "logps/chosen": -292.718017578125, + "logps/rejected": -363.1314697265625, + "loss": 0.2237, + "rewards/chosen": 0.7272062301635742, + "rewards/margins": 7.903041521708171, + "rewards/rejected": -7.175835291544597, + "step": 914 + }, + { + "epoch": 0.33777859812652855, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 7.579421076356753e-06, + "logits/chosen": 1018841838.9333333, + "logits/rejected": 1410410134.5882354, + "logps/chosen": -218.51357421875, + "logps/rejected": -461.0711454503676, + "loss": 0.1083, + "rewards/chosen": 1.9160456339518228, + "rewards/margins": 9.014958041321997, + "rewards/rejected": -7.098912407370174, + "step": 915 + }, + { + "epoch": 0.3381477550643717, + "grad_norm": 17.125, + "kl": 0.13433599472045898, + "learning_rate": 7.574377568119421e-06, + "logits/chosen": 1930412590.5454545, + "logits/rejected": 1891858432.0, + "logps/chosen": -358.59028764204544, + "logps/rejected": -470.4443359375, + "loss": 0.2489, + "rewards/chosen": 0.7783184918490323, + "rewards/margins": 9.669926157864658, + "rewards/rejected": -8.891607666015625, + "step": 916 + }, + { + "epoch": 0.33851691200221495, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.569330493114675e-06, + "logits/chosen": 1960144151.2727273, + "logits/rejected": 2162470521.904762, + "logps/chosen": -375.31125710227275, + "logps/rejected": -513.3297061011905, + "loss": 0.1091, + "rewards/chosen": 1.2385460246693005, + "rewards/margins": 8.746607024948318, + "rewards/rejected": -7.508061000279018, + "step": 917 + }, + { + "epoch": 0.3388860689400581, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 7.564279858335174e-06, + "logits/chosen": 1869662549.3333333, + "logits/rejected": 2316551606.857143, + "logps/chosen": -338.86610243055554, + "logps/rejected": -413.498046875, + "loss": 0.1876, + "rewards/chosen": 1.2881849077012804, + "rewards/margins": 7.598282738337441, + "rewards/rejected": -6.310097830636161, + "step": 918 + }, + { + "epoch": 0.33925522587790136, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.5592256707785085e-06, + "logits/chosen": 1960458870.1538463, + "logits/rejected": 1602352397.4736843, + "logps/chosen": -339.49954927884613, + "logps/rejected": -491.3453947368421, + "loss": 0.1199, + "rewards/chosen": 1.098565174983098, + "rewards/margins": 8.301733542067801, + "rewards/rejected": -7.203168367084704, + "step": 919 + }, + { + "epoch": 0.33962438281574453, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 7.55416793744719e-06, + "logits/chosen": 2056471250.8235295, + "logits/rejected": 2146412407.4666667, + "logps/chosen": -270.07080078125, + "logps/rejected": -555.6276041666666, + "loss": 0.1706, + "rewards/chosen": 1.395520154167624, + "rewards/margins": 10.307916394402, + "rewards/rejected": -8.912396240234376, + "step": 920 + }, + { + "epoch": 0.33999353975358776, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 7.549106665348644e-06, + "logits/chosen": 1905739093.3333333, + "logits/rejected": 1687318287.0588236, + "logps/chosen": -300.5034505208333, + "logps/rejected": -439.69192325367646, + "loss": 0.1365, + "rewards/chosen": 1.4077505747477213, + "rewards/margins": 8.567379050161325, + "rewards/rejected": -7.159628475413603, + "step": 921 + }, + { + "epoch": 0.34036269669143093, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.544041861495202e-06, + "logits/chosen": 1344253470.1176472, + "logits/rejected": 1642606182.4, + "logps/chosen": -225.7610294117647, + "logps/rejected": -457.58203125, + "loss": 0.1943, + "rewards/chosen": 0.7626733218922335, + "rewards/margins": 8.16777280919692, + "rewards/rejected": -7.405099487304687, + "step": 922 + }, + { + "epoch": 0.34073185362927416, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 7.53897353290408e-06, + "logits/chosen": 1800802889.142857, + "logits/rejected": 2673995403.6363635, + "logps/chosen": -249.97484188988096, + "logps/rejected": -537.8727361505681, + "loss": 0.1836, + "rewards/chosen": 1.3472404479980469, + "rewards/margins": 10.790766282515092, + "rewards/rejected": -9.443525834517045, + "step": 923 + }, + { + "epoch": 0.34110101056711734, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 7.5339016865973865e-06, + "logits/chosen": 1795347251.2, + "logits/rejected": 1636105557.3333333, + "logps/chosen": -298.932861328125, + "logps/rejected": -551.721435546875, + "loss": 0.2117, + "rewards/chosen": 1.0166474342346192, + "rewards/margins": 9.860231494903564, + "rewards/rejected": -8.843584060668945, + "step": 924 + }, + { + "epoch": 0.34147016750496056, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 7.528826329602099e-06, + "logits/chosen": 2177455360.0, + "logits/rejected": 2134380928.0, + "logps/chosen": -285.8106994628906, + "logps/rejected": -420.74798583984375, + "loss": 0.1701, + "rewards/chosen": 1.2023110389709473, + "rewards/margins": 8.19617748260498, + "rewards/rejected": -6.993866443634033, + "step": 925 + }, + { + "epoch": 0.34183932444280374, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.523747468950061e-06, + "logits/chosen": 1323178257.0666666, + "logits/rejected": 1444000707.764706, + "logps/chosen": -245.53020833333332, + "logps/rejected": -403.16946231617646, + "loss": 0.1866, + "rewards/chosen": 0.753152338663737, + "rewards/margins": 8.40325573939903, + "rewards/rejected": -7.650103400735294, + "step": 926 + }, + { + "epoch": 0.34220848138064697, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 7.518665111677968e-06, + "logits/chosen": 2290167076.571429, + "logits/rejected": 2018201827.5555556, + "logps/chosen": -291.4937220982143, + "logps/rejected": -431.3548177083333, + "loss": 0.1155, + "rewards/chosen": 1.6446316582815987, + "rewards/margins": 7.973628104679168, + "rewards/rejected": -6.32899644639757, + "step": 927 + }, + { + "epoch": 0.34257763831849014, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.513579264827362e-06, + "logits/chosen": 1385023715.5555556, + "logits/rejected": 1243799552.0, + "logps/chosen": -271.28968641493054, + "logps/rejected": -429.49428013392856, + "loss": 0.166, + "rewards/chosen": 1.675567838880751, + "rewards/margins": 8.763795247153631, + "rewards/rejected": -7.088227408272879, + "step": 928 + }, + { + "epoch": 0.34294679525633337, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.508489935444618e-06, + "logits/chosen": 2002825671.1111112, + "logits/rejected": 1914287981.7142856, + "logps/chosen": -285.6776529947917, + "logps/rejected": -538.4484165736607, + "loss": 0.193, + "rewards/chosen": 1.1649994320339627, + "rewards/margins": 10.725618241325257, + "rewards/rejected": -9.560618809291295, + "step": 929 + }, + { + "epoch": 0.34331595219417654, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 7.5033971305809405e-06, + "logits/chosen": 2330751939.7647057, + "logits/rejected": 2037251003.7333333, + "logps/chosen": -295.12293198529414, + "logps/rejected": -350.74462890625, + "loss": 0.1766, + "rewards/chosen": 1.469827315386604, + "rewards/margins": 7.661250350054573, + "rewards/rejected": -6.191423034667968, + "step": 930 + }, + { + "epoch": 0.3436851091320198, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.498300857292342e-06, + "logits/chosen": 1719453696.0, + "logits/rejected": 2068799715.5555556, + "logps/chosen": -321.3524693080357, + "logps/rejected": -447.4122721354167, + "loss": 0.1707, + "rewards/chosen": 0.7054557800292969, + "rewards/margins": 7.708563910590278, + "rewards/rejected": -7.003108130560981, + "step": 931 + }, + { + "epoch": 0.34405426606986295, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 7.493201122639648e-06, + "logits/chosen": 2340573184.0, + "logits/rejected": 1927329189.6470587, + "logps/chosen": -314.91253255208335, + "logps/rejected": -533.3768382352941, + "loss": 0.1049, + "rewards/chosen": 2.145867919921875, + "rewards/margins": 10.510021613625918, + "rewards/rejected": -8.364153693704043, + "step": 932 + }, + { + "epoch": 0.3444234230077062, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.488097933688474e-06, + "logits/chosen": 1447946496.0, + "logits/rejected": 1541616640.0, + "logps/chosen": -259.95098876953125, + "logps/rejected": -495.1393127441406, + "loss": 0.1307, + "rewards/chosen": 1.893912672996521, + "rewards/margins": 8.915956854820251, + "rewards/rejected": -7.0220441818237305, + "step": 933 + }, + { + "epoch": 0.34479257994554935, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 7.482991297509225e-06, + "logits/chosen": 2224604306.285714, + "logits/rejected": 1424257024.0, + "logps/chosen": -221.37976655505952, + "logps/rejected": -401.4173029119318, + "loss": 0.2513, + "rewards/chosen": 1.1251788366408575, + "rewards/margins": 8.36860923436813, + "rewards/rejected": -7.2434303977272725, + "step": 934 + }, + { + "epoch": 0.3451617368833926, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.477881221177077e-06, + "logits/chosen": 1761141191.1111112, + "logits/rejected": 2132146322.2857144, + "logps/chosen": -263.58873155381946, + "logps/rejected": -593.7700892857143, + "loss": 0.1433, + "rewards/chosen": 1.5794934166802301, + "rewards/margins": 11.216820338415722, + "rewards/rejected": -9.637326921735491, + "step": 935 + }, + { + "epoch": 0.34553089382123575, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.472767711771979e-06, + "logits/chosen": 1821833362.2857144, + "logits/rejected": 1630131086.2222223, + "logps/chosen": -297.4893275669643, + "logps/rejected": -479.9275716145833, + "loss": 0.146, + "rewards/chosen": 1.212001119341169, + "rewards/margins": 7.0146333452255005, + "rewards/rejected": -5.802632225884332, + "step": 936 + }, + { + "epoch": 0.345900050759079, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.467650776378633e-06, + "logits/chosen": 3512381440.0, + "logits/rejected": 1458853730.4615386, + "logps/chosen": -338.2206217447917, + "logps/rejected": -422.85171274038464, + "loss": 0.0864, + "rewards/chosen": 0.9173627694447836, + "rewards/margins": 7.412650738006983, + "rewards/rejected": -6.4952879685622, + "step": 937 + }, + { + "epoch": 0.34626920769692215, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.462530422086487e-06, + "logits/chosen": 1397631385.6, + "logits/rejected": 1547091245.1764705, + "logps/chosen": -329.06634114583335, + "logps/rejected": -407.38536879595586, + "loss": 0.1634, + "rewards/chosen": 1.2566670735677083, + "rewards/margins": 8.51182409548292, + "rewards/rejected": -7.2551570219152115, + "step": 938 + }, + { + "epoch": 0.34663836463476533, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 7.4574066559897276e-06, + "logits/chosen": 2377773251.047619, + "logits/rejected": 1754632378.1818182, + "logps/chosen": -432.77901785714283, + "logps/rejected": -614.4805131392045, + "loss": 0.2243, + "rewards/chosen": 0.8133530389694941, + "rewards/margins": 11.139354986545843, + "rewards/rejected": -10.32600194757635, + "step": 939 + }, + { + "epoch": 0.34700752157260856, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.452279485187268e-06, + "logits/chosen": 1447949653.3333333, + "logits/rejected": 2150024285.090909, + "logps/chosen": -262.5908900669643, + "logps/rejected": -357.42502663352275, + "loss": 0.1509, + "rewards/chosen": 2.028566451299758, + "rewards/margins": 8.210210544206364, + "rewards/rejected": -6.181644092906605, + "step": 940 + }, + { + "epoch": 0.34737667851045173, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.4471489167827374e-06, + "logits/chosen": 1611676525.7142856, + "logits/rejected": 1414036707.5555556, + "logps/chosen": -265.53526088169644, + "logps/rejected": -587.0643446180555, + "loss": 0.1662, + "rewards/chosen": 0.8271936689104352, + "rewards/margins": 9.588599613734655, + "rewards/rejected": -8.761405944824219, + "step": 941 + }, + { + "epoch": 0.34774583544829496, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 7.442014957884473e-06, + "logits/chosen": 1233277064.5333333, + "logits/rejected": 1708645918.1176472, + "logps/chosen": -283.3255208333333, + "logps/rejected": -518.5861098345588, + "loss": 0.0978, + "rewards/chosen": 1.9671727498372396, + "rewards/margins": 8.511437150543811, + "rewards/rejected": -6.544264400706572, + "step": 942 + }, + { + "epoch": 0.34811499238613813, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.43687761560551e-06, + "logits/chosen": 1316014518.857143, + "logits/rejected": 1534406997.3333333, + "logps/chosen": -277.56649344308033, + "logps/rejected": -457.04893663194446, + "loss": 0.1335, + "rewards/chosen": 1.7897484643118722, + "rewards/margins": 9.203452382768903, + "rewards/rejected": -7.413703918457031, + "step": 943 + }, + { + "epoch": 0.34848414932398136, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.43173689706357e-06, + "logits/chosen": 1710671744.0, + "logits/rejected": 2123786240.0, + "logps/chosen": -256.97344970703125, + "logps/rejected": -378.7826843261719, + "loss": 0.1972, + "rewards/chosen": 0.7358039021492004, + "rewards/margins": 8.032732784748077, + "rewards/rejected": -7.296928882598877, + "step": 944 + }, + { + "epoch": 0.34885330626182454, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 7.4265928093810545e-06, + "logits/chosen": 1805941760.0, + "logits/rejected": 1353394029.7142856, + "logps/chosen": -315.53607855902777, + "logps/rejected": -467.92599051339283, + "loss": 0.1995, + "rewards/chosen": 1.014037874009874, + "rewards/margins": 8.545091144622319, + "rewards/rejected": -7.5310532706124445, + "step": 945 + }, + { + "epoch": 0.34922246319966777, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 7.421445359685031e-06, + "logits/chosen": 2025756672.0, + "logits/rejected": 2363623594.6666665, + "logps/chosen": -260.0708740234375, + "logps/rejected": -463.7125651041667, + "loss": 0.1523, + "rewards/chosen": 1.8779289245605468, + "rewards/margins": 9.56402130126953, + "rewards/rejected": -7.686092376708984, + "step": 946 + }, + { + "epoch": 0.34959162013751094, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.416294555107226e-06, + "logits/chosen": 1616024462.2222223, + "logits/rejected": 2003059273.142857, + "logps/chosen": -229.84727647569446, + "logps/rejected": -626.2214704241071, + "loss": 0.1279, + "rewards/chosen": 1.7303761376274958, + "rewards/margins": 11.969911878071134, + "rewards/rejected": -10.239535740443639, + "step": 947 + }, + { + "epoch": 0.34996077707535417, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 7.411140402784014e-06, + "logits/chosen": 1490518256.9411764, + "logits/rejected": 1578971409.0666666, + "logps/chosen": -258.0514705882353, + "logps/rejected": -527.38232421875, + "loss": 0.1893, + "rewards/chosen": 1.2784929836497587, + "rewards/margins": 9.739565778246114, + "rewards/rejected": -8.461072794596355, + "step": 948 + }, + { + "epoch": 0.35032993401319734, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 7.4059829098564075e-06, + "logits/chosen": 1558872519.1111112, + "logits/rejected": 1927073206.857143, + "logps/chosen": -316.08827039930554, + "logps/rejected": -447.53780691964283, + "loss": 0.2301, + "rewards/chosen": 0.4418847295973036, + "rewards/margins": 8.732850218576099, + "rewards/rejected": -8.290965488978795, + "step": 949 + }, + { + "epoch": 0.35069909095104057, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 7.400822083470046e-06, + "logits/chosen": 1879575250.8235295, + "logits/rejected": 1586992469.3333333, + "logps/chosen": -302.0782111672794, + "logps/rejected": -451.1384765625, + "loss": 0.2205, + "rewards/chosen": 0.42042970657348633, + "rewards/margins": 6.872364012400309, + "rewards/rejected": -6.451934305826823, + "step": 950 + }, + { + "epoch": 0.35106824788888374, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.395657930775191e-06, + "logits/chosen": 2129165789.8666666, + "logits/rejected": 1816884645.6470587, + "logps/chosen": -250.94674479166667, + "logps/rejected": -426.09329044117646, + "loss": 0.169, + "rewards/chosen": 1.108864720662435, + "rewards/margins": 7.327608886419558, + "rewards/rejected": -6.218744165757123, + "step": 951 + }, + { + "epoch": 0.351437404826727, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.390490458926708e-06, + "logits/chosen": 1667485952.0, + "logits/rejected": 1894405760.0, + "logps/chosen": -303.7418518066406, + "logps/rejected": -434.12652587890625, + "loss": 0.1836, + "rewards/chosen": 0.9207848906517029, + "rewards/margins": 8.116925656795502, + "rewards/rejected": -7.196140766143799, + "step": 952 + }, + { + "epoch": 0.35180656176457015, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 7.385319675084066e-06, + "logits/chosen": 1373067400.5333333, + "logits/rejected": 2307482563.7647057, + "logps/chosen": -220.604052734375, + "logps/rejected": -401.45447495404414, + "loss": 0.1471, + "rewards/chosen": 1.1077101389567057, + "rewards/margins": 8.315524972653856, + "rewards/rejected": -7.20781483369715, + "step": 953 + }, + { + "epoch": 0.3521757187024134, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.38014558641132e-06, + "logits/chosen": 1684706450.2857144, + "logits/rejected": 2076216661.3333333, + "logps/chosen": -301.26346261160717, + "logps/rejected": -408.6073404947917, + "loss": 0.1348, + "rewards/chosen": 1.1618796757289342, + "rewards/margins": 7.90938104901995, + "rewards/rejected": -6.747501373291016, + "step": 954 + }, + { + "epoch": 0.35254487564025655, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 7.3749682000771016e-06, + "logits/chosen": 1645678045.8666666, + "logits/rejected": 1413347809.8823528, + "logps/chosen": -287.57607421875, + "logps/rejected": -477.34791475183823, + "loss": 0.1031, + "rewards/chosen": 2.4843419392903647, + "rewards/margins": 10.2081210566502, + "rewards/rejected": -7.723779117359834, + "step": 955 + }, + { + "epoch": 0.3529140325780998, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.369787523254617e-06, + "logits/chosen": 1606024669.8666666, + "logits/rejected": 1487674428.235294, + "logps/chosen": -303.75478515625, + "logps/rejected": -648.9426700367648, + "loss": 0.1396, + "rewards/chosen": 1.4781453450520834, + "rewards/margins": 9.526215018478094, + "rewards/rejected": -8.048069673426012, + "step": 956 + }, + { + "epoch": 0.35328318951594295, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.364603563121627e-06, + "logits/chosen": 1678673578.6666667, + "logits/rejected": 1662014902.857143, + "logps/chosen": -294.0096842447917, + "logps/rejected": -526.1534598214286, + "loss": 0.175, + "rewards/chosen": 1.0924949645996094, + "rewards/margins": 8.929591587611608, + "rewards/rejected": -7.837096623011997, + "step": 957 + }, + { + "epoch": 0.3536523464537862, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 7.359416326860443e-06, + "logits/chosen": 1438470609.4545455, + "logits/rejected": 1270316339.2, + "logps/chosen": -299.47713955965907, + "logps/rejected": -499.35068359375, + "loss": 0.2181, + "rewards/chosen": 1.450064485723322, + "rewards/margins": 10.619850557500666, + "rewards/rejected": -9.169786071777343, + "step": 958 + }, + { + "epoch": 0.35402150339162936, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 7.3542258216579136e-06, + "logits/chosen": 1652226048.0, + "logits/rejected": 1550886195.2, + "logps/chosen": -234.97845458984375, + "logps/rejected": -515.172998046875, + "loss": 0.1079, + "rewards/chosen": 1.8943514823913574, + "rewards/margins": 9.456859111785889, + "rewards/rejected": -7.562507629394531, + "step": 959 + }, + { + "epoch": 0.3543906603294726, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 7.349032054705417e-06, + "logits/chosen": 1516410042.1818182, + "logits/rejected": 1574237427.8095238, + "logps/chosen": -317.68148526278407, + "logps/rejected": -417.3616536458333, + "loss": 0.066, + "rewards/chosen": 2.162708802656694, + "rewards/margins": 9.034151324978122, + "rewards/rejected": -6.871442522321429, + "step": 960 + }, + { + "epoch": 0.35475981726731576, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.343835033198854e-06, + "logits/chosen": 1409585561.6, + "logits/rejected": 1751366826.6666667, + "logps/chosen": -245.317529296875, + "logps/rejected": -461.6015625, + "loss": 0.2072, + "rewards/chosen": 0.9638049125671386, + "rewards/margins": 9.06177905400594, + "rewards/rejected": -8.097974141438803, + "step": 961 + }, + { + "epoch": 0.355128974205159, + "grad_norm": 13.4375, + "kl": 0.24583673477172852, + "learning_rate": 7.33863476433863e-06, + "logits/chosen": 2062272102.4, + "logits/rejected": 2364149037.1764708, + "logps/chosen": -293.49261067708335, + "logps/rejected": -568.8755744485294, + "loss": 0.1889, + "rewards/chosen": 0.7752403895060221, + "rewards/margins": 8.824402412713743, + "rewards/rejected": -8.049162023207721, + "step": 962 + }, + { + "epoch": 0.35549813114300216, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 7.333431255329653e-06, + "logits/chosen": 1834292932.9230769, + "logits/rejected": 1764339173.0526316, + "logps/chosen": -216.812255859375, + "logps/rejected": -476.7093441611842, + "loss": 0.1674, + "rewards/chosen": 0.5153369169968826, + "rewards/margins": 7.828008779147376, + "rewards/rejected": -7.312671862150493, + "step": 963 + }, + { + "epoch": 0.3558672880808454, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 7.3282245133813155e-06, + "logits/chosen": 2321588224.0, + "logits/rejected": 2192022483.478261, + "logps/chosen": -255.67643229166666, + "logps/rejected": -489.8873131793478, + "loss": 0.0913, + "rewards/chosen": 2.4182039896647134, + "rewards/margins": 10.7249136109283, + "rewards/rejected": -8.306709621263588, + "step": 964 + }, + { + "epoch": 0.35623644501868856, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.323014545707497e-06, + "logits/chosen": 1908292539.7333333, + "logits/rejected": 1798498304.0, + "logps/chosen": -264.74534505208334, + "logps/rejected": -429.3397863051471, + "loss": 0.1298, + "rewards/chosen": 1.4637151082356772, + "rewards/margins": 9.023291703766468, + "rewards/rejected": -7.559576595530791, + "step": 965 + }, + { + "epoch": 0.3566056019565318, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.317801359526538e-06, + "logits/chosen": 2345181184.0, + "logits/rejected": 2348881237.3333335, + "logps/chosen": -266.5017520680147, + "logps/rejected": -469.2376953125, + "loss": 0.2089, + "rewards/chosen": 0.7775572608498966, + "rewards/margins": 8.016799814560834, + "rewards/rejected": -7.2392425537109375, + "step": 966 + }, + { + "epoch": 0.35697475889437497, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.312584962061243e-06, + "logits/chosen": 1393080758.857143, + "logits/rejected": 1729947648.0, + "logps/chosen": -320.68233816964283, + "logps/rejected": -473.22422960069446, + "loss": 0.1185, + "rewards/chosen": 1.7503790174211775, + "rewards/margins": 8.686952469840882, + "rewards/rejected": -6.9365734524197045, + "step": 967 + }, + { + "epoch": 0.3573439158322182, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.307365360538865e-06, + "logits/chosen": 1636368930.1333334, + "logits/rejected": 1213020400.9411764, + "logps/chosen": -260.43097330729165, + "logps/rejected": -562.6962316176471, + "loss": 0.1051, + "rewards/chosen": 2.345916239420573, + "rewards/margins": 10.971485302495022, + "rewards/rejected": -8.62556906307445, + "step": 968 + }, + { + "epoch": 0.35771307277006137, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.302142562191092e-06, + "logits/chosen": 1700714064.8421052, + "logits/rejected": 1755746461.5384614, + "logps/chosen": -236.75647615131578, + "logps/rejected": -352.7238581730769, + "loss": 0.1956, + "rewards/chosen": 1.1716317628559314, + "rewards/margins": 8.01102361794908, + "rewards/rejected": -6.839391855093149, + "step": 969 + }, + { + "epoch": 0.3580822297079046, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.2969165742540495e-06, + "logits/chosen": 2663314578.285714, + "logits/rejected": 1753791829.3333333, + "logps/chosen": -252.30189732142858, + "logps/rejected": -431.94151475694446, + "loss": 0.1416, + "rewards/chosen": 1.6858813422066825, + "rewards/margins": 7.935684249514625, + "rewards/rejected": -6.249802907307942, + "step": 970 + }, + { + "epoch": 0.35845138664574777, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 7.2916874039682765e-06, + "logits/chosen": 2230933504.0, + "logits/rejected": 1695539785.142857, + "logps/chosen": -326.95871803977275, + "logps/rejected": -461.6517857142857, + "loss": 0.1146, + "rewards/chosen": 1.3776614449240945, + "rewards/margins": 8.49801187391405, + "rewards/rejected": -7.120350428989956, + "step": 971 + }, + { + "epoch": 0.358820543583591, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 7.286455058578719e-06, + "logits/chosen": 1716515726.2222223, + "logits/rejected": 1816069997.7142856, + "logps/chosen": -266.0740559895833, + "logps/rejected": -526.49658203125, + "loss": 0.1466, + "rewards/chosen": 1.6248890558878581, + "rewards/margins": 9.270361991155715, + "rewards/rejected": -7.645472935267857, + "step": 972 + }, + { + "epoch": 0.3591897005214342, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 7.281219545334727e-06, + "logits/chosen": 2064604364.8, + "logits/rejected": 2393996800.0, + "logps/chosen": -286.6337158203125, + "logps/rejected": -608.9473470052084, + "loss": 0.2436, + "rewards/chosen": 0.623737096786499, + "rewards/margins": 9.939618825912476, + "rewards/rejected": -9.315881729125977, + "step": 973 + }, + { + "epoch": 0.35955885745927735, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 7.275980871490038e-06, + "logits/chosen": 1813209088.0, + "logits/rejected": 2076497289.8461537, + "logps/chosen": -221.1667351973684, + "logps/rejected": -374.6608698918269, + "loss": 0.2224, + "rewards/chosen": 0.785782864219264, + "rewards/margins": 7.862442433592762, + "rewards/rejected": -7.076659569373498, + "step": 974 + }, + { + "epoch": 0.3599280143971206, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.270739044302767e-06, + "logits/chosen": 1592355717.12, + "logits/rejected": 1840764489.142857, + "logps/chosen": -254.803359375, + "logps/rejected": -565.531982421875, + "loss": 0.232, + "rewards/chosen": 1.3555564880371094, + "rewards/margins": 10.002648489815849, + "rewards/rejected": -8.64709200177874, + "step": 975 + }, + { + "epoch": 0.36029717133496375, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 7.265494071035401e-06, + "logits/chosen": 1381356416.0, + "logits/rejected": 1673970176.0, + "logps/chosen": -305.58660888671875, + "logps/rejected": -544.9133911132812, + "loss": 0.1208, + "rewards/chosen": 1.9210807085037231, + "rewards/margins": 9.764545559883118, + "rewards/rejected": -7.8434648513793945, + "step": 976 + }, + { + "epoch": 0.360666328272807, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 7.260245958954783e-06, + "logits/chosen": 2319021149.090909, + "logits/rejected": 2420463001.6, + "logps/chosen": -257.863037109375, + "logps/rejected": -594.554443359375, + "loss": 0.1359, + "rewards/chosen": 2.1475452076305044, + "rewards/margins": 10.656732524525035, + "rewards/rejected": -8.509187316894531, + "step": 977 + }, + { + "epoch": 0.36103548521065015, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.254994715332102e-06, + "logits/chosen": 1574477653.3333333, + "logits/rejected": 1777807360.0, + "logps/chosen": -296.8848876953125, + "logps/rejected": -472.112646484375, + "loss": 0.1124, + "rewards/chosen": 1.5844793319702148, + "rewards/margins": 8.958944511413574, + "rewards/rejected": -7.3744651794433596, + "step": 978 + }, + { + "epoch": 0.3614046421484934, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.249740347442895e-06, + "logits/chosen": 1846079360.0, + "logits/rejected": 1587787008.0, + "logps/chosen": -286.3741455078125, + "logps/rejected": -449.79815673828125, + "loss": 0.1571, + "rewards/chosen": 1.2616990804672241, + "rewards/margins": 8.543264746665955, + "rewards/rejected": -7.2815656661987305, + "step": 979 + }, + { + "epoch": 0.36177379908633656, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.244482862567018e-06, + "logits/chosen": 2190936221.5384617, + "logits/rejected": 1932879117.4736843, + "logps/chosen": -280.38521634615387, + "logps/rejected": -513.8957134046053, + "loss": 0.1384, + "rewards/chosen": 0.987820551945613, + "rewards/margins": 9.474336600979331, + "rewards/rejected": -8.486516049033717, + "step": 980 + }, + { + "epoch": 0.3621429560241798, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.2392222679886506e-06, + "logits/chosen": 2757159273.4117646, + "logits/rejected": 2392661469.866667, + "logps/chosen": -283.2100183823529, + "logps/rejected": -430.2301432291667, + "loss": 0.1369, + "rewards/chosen": 1.8558273315429688, + "rewards/margins": 7.846414693196615, + "rewards/rejected": -5.990587361653646, + "step": 981 + }, + { + "epoch": 0.36251211296202296, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.23395857099628e-06, + "logits/chosen": 1959237159.3846154, + "logits/rejected": 1727010923.7894738, + "logps/chosen": -279.84420072115387, + "logps/rejected": -449.20579769736844, + "loss": 0.1268, + "rewards/chosen": 1.8084634634164662, + "rewards/margins": 8.901284283471977, + "rewards/rejected": -7.09282082005551, + "step": 982 + }, + { + "epoch": 0.3628812698998662, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 7.2286917788826926e-06, + "logits/chosen": 1503039744.0, + "logits/rejected": 2163049813.3333335, + "logps/chosen": -307.04766845703125, + "logps/rejected": -438.4186604817708, + "loss": 0.0863, + "rewards/chosen": 1.2653594017028809, + "rewards/margins": 8.401666800181072, + "rewards/rejected": -7.13630739847819, + "step": 983 + }, + { + "epoch": 0.36325042683770936, + "grad_norm": 17.625, + "kl": 0.23502063751220703, + "learning_rate": 7.22342189894496e-06, + "logits/chosen": 1752616550.4, + "logits/rejected": 1625619626.6666667, + "logps/chosen": -362.8557861328125, + "logps/rejected": -364.2164713541667, + "loss": 0.2617, + "rewards/chosen": 0.5863108158111572, + "rewards/margins": 6.7914690494537355, + "rewards/rejected": -6.205158233642578, + "step": 984 + }, + { + "epoch": 0.3636195837755526, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.218148938484435e-06, + "logits/chosen": 1652110267.7333333, + "logits/rejected": 1324196562.8235295, + "logps/chosen": -288.89147135416664, + "logps/rejected": -462.12971047794116, + "loss": 0.1594, + "rewards/chosen": 1.224811808268229, + "rewards/margins": 9.152072263231464, + "rewards/rejected": -7.9272604549632355, + "step": 985 + }, + { + "epoch": 0.36398874071339576, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.212872904806736e-06, + "logits/chosen": 1901161728.0, + "logits/rejected": 1616806400.0, + "logps/chosen": -228.57269287109375, + "logps/rejected": -400.9443054199219, + "loss": 0.1549, + "rewards/chosen": 1.6516929864883423, + "rewards/margins": 8.632474541664124, + "rewards/rejected": -6.980781555175781, + "step": 986 + }, + { + "epoch": 0.364357897651239, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.207593805221742e-06, + "logits/chosen": 1408388681.142857, + "logits/rejected": 1415150478.2222223, + "logps/chosen": -236.67501395089286, + "logps/rejected": -489.34950086805554, + "loss": 0.1353, + "rewards/chosen": 1.3888410840715681, + "rewards/margins": 9.922443480718703, + "rewards/rejected": -8.533602396647135, + "step": 987 + }, + { + "epoch": 0.36472705458908217, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.202311647043579e-06, + "logits/chosen": 1497909394.2857144, + "logits/rejected": 1511184611.5555556, + "logps/chosen": -319.49204799107144, + "logps/rejected": -533.3566623263889, + "loss": 0.1143, + "rewards/chosen": 1.4950159617832728, + "rewards/margins": 14.413478927006796, + "rewards/rejected": -12.918462965223524, + "step": 988 + }, + { + "epoch": 0.3650962115269254, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.197026437590608e-06, + "logits/chosen": 2296422400.0, + "logits/rejected": 1475995648.0, + "logps/chosen": -328.4907740542763, + "logps/rejected": -608.2450796274038, + "loss": 0.176, + "rewards/chosen": 1.2631664276123047, + "rewards/margins": 9.48023326580341, + "rewards/rejected": -8.217066838191105, + "step": 989 + }, + { + "epoch": 0.36546536846476857, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.191738184185422e-06, + "logits/chosen": 1630315520.0, + "logits/rejected": 1560410496.0, + "logps/chosen": -331.495361328125, + "logps/rejected": -477.4812316894531, + "loss": 0.1674, + "rewards/chosen": 1.070287823677063, + "rewards/margins": 9.238373875617981, + "rewards/rejected": -8.168086051940918, + "step": 990 + }, + { + "epoch": 0.3658345254026118, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 7.186446894154826e-06, + "logits/chosen": 1444898508.8, + "logits/rejected": 1346974720.0, + "logps/chosen": -242.1988037109375, + "logps/rejected": -366.4676106770833, + "loss": 0.2152, + "rewards/chosen": 1.2893174171447754, + "rewards/margins": 7.068068790435791, + "rewards/rejected": -5.778751373291016, + "step": 991 + }, + { + "epoch": 0.366203682340455, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.181152574829837e-06, + "logits/chosen": 1762307510.857143, + "logits/rejected": 1726475377.7777777, + "logps/chosen": -281.13462611607144, + "logps/rejected": -441.1068522135417, + "loss": 0.1504, + "rewards/chosen": 1.120798110961914, + "rewards/margins": 7.686441421508789, + "rewards/rejected": -6.565643310546875, + "step": 992 + }, + { + "epoch": 0.3665728392782982, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 7.175855233545669e-06, + "logits/chosen": 1077204480.0, + "logits/rejected": 1931332096.0, + "logps/chosen": -262.81243896484375, + "logps/rejected": -427.8982747395833, + "loss": 0.089, + "rewards/chosen": 1.3157358169555664, + "rewards/margins": 8.350486437479656, + "rewards/rejected": -7.034750620524089, + "step": 993 + }, + { + "epoch": 0.3669419962161414, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 7.1705548776417165e-06, + "logits/chosen": 1722438314.6666667, + "logits/rejected": 1947669913.6, + "logps/chosen": -222.4847208658854, + "logps/rejected": -423.852978515625, + "loss": 0.1279, + "rewards/chosen": 1.4284906387329102, + "rewards/margins": 7.967957496643066, + "rewards/rejected": -6.539466857910156, + "step": 994 + }, + { + "epoch": 0.3673111531539846, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.1652515144615575e-06, + "logits/chosen": 2405808583.111111, + "logits/rejected": 2353978953.142857, + "logps/chosen": -337.2381998697917, + "logps/rejected": -482.495849609375, + "loss": 0.1692, + "rewards/chosen": 1.3000715043809679, + "rewards/margins": 9.607193326193189, + "rewards/rejected": -8.30712182181222, + "step": 995 + }, + { + "epoch": 0.3676803100918278, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.1599451513529364e-06, + "logits/chosen": 1827178359.4666667, + "logits/rejected": 2424532269.1764708, + "logps/chosen": -272.7055338541667, + "logps/rejected": -425.91061580882354, + "loss": 0.1273, + "rewards/chosen": 1.6635528564453126, + "rewards/margins": 8.568769746668199, + "rewards/rejected": -6.905216890222886, + "step": 996 + }, + { + "epoch": 0.368049467029671, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 7.154635795667748e-06, + "logits/chosen": 2276263799.4666667, + "logits/rejected": 1883278878.1176472, + "logps/chosen": -183.10221354166666, + "logps/rejected": -445.306640625, + "loss": 0.1339, + "rewards/chosen": 1.7934338887532553, + "rewards/margins": 9.271901942234415, + "rewards/rejected": -7.4784680534811585, + "step": 997 + }, + { + "epoch": 0.3684186239675142, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.149323454762039e-06, + "logits/chosen": 2160585458.5263157, + "logits/rejected": 1944447763.6923077, + "logps/chosen": -271.9268092105263, + "logps/rejected": -583.3894981971154, + "loss": 0.1788, + "rewards/chosen": 1.4040230198910362, + "rewards/margins": 11.444025696047888, + "rewards/rejected": -10.04000267615685, + "step": 998 + }, + { + "epoch": 0.3687877809053574, + "grad_norm": 11.5625, + "kl": 0.23833942413330078, + "learning_rate": 7.144008135995992e-06, + "logits/chosen": 1906898156.3076923, + "logits/rejected": 2135348924.631579, + "logps/chosen": -269.36632361778845, + "logps/rejected": -509.2290296052632, + "loss": 0.1521, + "rewards/chosen": 1.0306680385883038, + "rewards/margins": 10.820670961851052, + "rewards/rejected": -9.790002923262747, + "step": 999 + }, + { + "epoch": 0.3691569378432006, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 7.1386898467339114e-06, + "logits/chosen": 2097580032.0, + "logits/rejected": 1917192704.0, + "logps/chosen": -361.8123291015625, + "logps/rejected": -551.8881022135416, + "loss": 0.2031, + "rewards/chosen": 1.446066188812256, + "rewards/margins": 8.684330272674561, + "rewards/rejected": -7.238264083862305, + "step": 1000 + }, + { + "epoch": 0.3695260947810438, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.13336859434422e-06, + "logits/chosen": 1305741635.368421, + "logits/rejected": 1609012145.2307692, + "logps/chosen": -295.3207750822368, + "logps/rejected": -361.2751277043269, + "loss": 0.1953, + "rewards/chosen": 1.1566213306627775, + "rewards/margins": 6.857979832390542, + "rewards/rejected": -5.701358501727764, + "step": 1001 + }, + { + "epoch": 0.369895251718887, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.128044386199445e-06, + "logits/chosen": 1775492534.857143, + "logits/rejected": 1439385372.4444444, + "logps/chosen": -294.73228236607144, + "logps/rejected": -417.39453125, + "loss": 0.1552, + "rewards/chosen": 1.1806485312325614, + "rewards/margins": 8.779596692039853, + "rewards/rejected": -7.598948160807292, + "step": 1002 + }, + { + "epoch": 0.3702644086567302, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 7.1227172296762086e-06, + "logits/chosen": 1571582554.3529413, + "logits/rejected": 1307812113.0666666, + "logps/chosen": -340.0009765625, + "logps/rejected": -421.11712239583335, + "loss": 0.2024, + "rewards/chosen": 0.9526125963996438, + "rewards/margins": 7.747683498906154, + "rewards/rejected": -6.79507090250651, + "step": 1003 + }, + { + "epoch": 0.3706335655945734, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.11738713215522e-06, + "logits/chosen": 1451334528.0, + "logits/rejected": 2082155264.0, + "logps/chosen": -281.0663146972656, + "logps/rejected": -549.1507568359375, + "loss": 0.1469, + "rewards/chosen": 1.365386724472046, + "rewards/margins": 10.51999831199646, + "rewards/rejected": -9.154611587524414, + "step": 1004 + }, + { + "epoch": 0.3710027225324166, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.112054101021262e-06, + "logits/chosen": 2051193651.2, + "logits/rejected": 2593418782.117647, + "logps/chosen": -269.0847493489583, + "logps/rejected": -389.28435202205884, + "loss": 0.1824, + "rewards/chosen": 0.7525379180908203, + "rewards/margins": 7.855279832727769, + "rewards/rejected": -7.102741914636948, + "step": 1005 + }, + { + "epoch": 0.3713718794702598, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 7.106718143663178e-06, + "logits/chosen": 1515413383.5294118, + "logits/rejected": 1759716966.4, + "logps/chosen": -249.42681525735293, + "logps/rejected": -430.48896484375, + "loss": 0.1241, + "rewards/chosen": 2.167745029225069, + "rewards/margins": 9.018424553964653, + "rewards/rejected": -6.850679524739584, + "step": 1006 + }, + { + "epoch": 0.371741036408103, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.101379267473873e-06, + "logits/chosen": 1397140736.0, + "logits/rejected": 1566905216.0, + "logps/chosen": -212.9573974609375, + "logps/rejected": -484.15118408203125, + "loss": 0.1198, + "rewards/chosen": 1.9922702312469482, + "rewards/margins": 9.459661722183228, + "rewards/rejected": -7.467391490936279, + "step": 1007 + }, + { + "epoch": 0.3721101933459462, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 7.096037479850292e-06, + "logits/chosen": 1703364096.0, + "logits/rejected": 1684157952.0, + "logps/chosen": -287.6410827636719, + "logps/rejected": -498.546875, + "loss": 0.176, + "rewards/chosen": 1.269957423210144, + "rewards/margins": 11.07624614238739, + "rewards/rejected": -9.806288719177246, + "step": 1008 + }, + { + "epoch": 0.3724793502837894, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 7.090692788193409e-06, + "logits/chosen": 1348441460.3636363, + "logits/rejected": 1387642489.9047618, + "logps/chosen": -226.32741477272728, + "logps/rejected": -432.0068359375, + "loss": 0.0945, + "rewards/chosen": 1.7215891751376065, + "rewards/margins": 10.125743634773023, + "rewards/rejected": -8.404154459635416, + "step": 1009 + }, + { + "epoch": 0.3728485072216326, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 7.085345199908234e-06, + "logits/chosen": 1900947132.631579, + "logits/rejected": 2362239448.6153846, + "logps/chosen": -320.6041837993421, + "logps/rejected": -469.6843449519231, + "loss": 0.1808, + "rewards/chosen": 1.0752721083791632, + "rewards/margins": 9.314012295804043, + "rewards/rejected": -8.23874018742488, + "step": 1010 + }, + { + "epoch": 0.37321766415947577, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.0799947224037765e-06, + "logits/chosen": 2052017298.2857144, + "logits/rejected": 1794142435.5555556, + "logps/chosen": -302.06689453125, + "logps/rejected": -408.7010091145833, + "loss": 0.1213, + "rewards/chosen": 1.681612423488072, + "rewards/margins": 9.354882754976787, + "rewards/rejected": -7.673270331488715, + "step": 1011 + }, + { + "epoch": 0.373586821097319, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.074641363093058e-06, + "logits/chosen": 1759558957.1764705, + "logits/rejected": 1738430464.0, + "logps/chosen": -284.1734834558824, + "logps/rejected": -574.5643880208333, + "loss": 0.1831, + "rewards/chosen": 0.7744261797736672, + "rewards/margins": 8.885008658614813, + "rewards/rejected": -8.110582478841145, + "step": 1012 + }, + { + "epoch": 0.3739559780351622, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.0692851293930885e-06, + "logits/chosen": 1928637644.8, + "logits/rejected": 1967263930.1818182, + "logps/chosen": -287.18701171875, + "logps/rejected": -535.2310901988636, + "loss": 0.105, + "rewards/chosen": 1.008524513244629, + "rewards/margins": 10.19213976426558, + "rewards/rejected": -9.183615251020951, + "step": 1013 + }, + { + "epoch": 0.3743251349730054, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.063926028724861e-06, + "logits/chosen": 2877248365.714286, + "logits/rejected": 2187818325.3333335, + "logps/chosen": -200.97377232142858, + "logps/rejected": -422.8129069010417, + "loss": 0.151, + "rewards/chosen": 0.9688583782741002, + "rewards/margins": 7.786101318541027, + "rewards/rejected": -6.817242940266927, + "step": 1014 + }, + { + "epoch": 0.3746942919108486, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.058564068513344e-06, + "logits/chosen": 2151062323.2, + "logits/rejected": 1565646607.0588236, + "logps/chosen": -362.2584635416667, + "logps/rejected": -562.8606962316177, + "loss": 0.1693, + "rewards/chosen": 0.9718924204508463, + "rewards/margins": 10.957364258111692, + "rewards/rejected": -9.985471837660846, + "step": 1015 + }, + { + "epoch": 0.3750634488486918, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.053199256187464e-06, + "logits/chosen": 1247994441.142857, + "logits/rejected": 1363786296.8888888, + "logps/chosen": -288.19775390625, + "logps/rejected": -493.1687825520833, + "loss": 0.126, + "rewards/chosen": 1.473339217049735, + "rewards/margins": 9.890036401294527, + "rewards/rejected": -8.416697184244791, + "step": 1016 + }, + { + "epoch": 0.375432605786535, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.047831599180099e-06, + "logits/chosen": 1764913906.5263157, + "logits/rejected": 1760204642.4615386, + "logps/chosen": -271.7682462993421, + "logps/rejected": -491.1436298076923, + "loss": 0.1656, + "rewards/chosen": 1.477714036640368, + "rewards/margins": 8.784272498930031, + "rewards/rejected": -7.306558462289663, + "step": 1017 + }, + { + "epoch": 0.375432605786535, + "eval_kl": 0.0, + "eval_logits/chosen": 3611505208.3444977, + "eval_logits/rejected": 3641732818.5627704, + "eval_logps/chosen": -292.8368906997608, + "eval_logps/rejected": -479.27627840909093, + "eval_loss": 0.1404857188463211, + "eval_rewards/chosen": 1.4177916312331789, + "eval_rewards/margins": 9.572374922291079, + "eval_rewards/rejected": -8.1545832910579, + "eval_runtime": 109.497, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 0.502, + "step": 1017 + }, + { + "epoch": 0.3758017627243782, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.042461104928072e-06, + "logits/chosen": 2308291840.0, + "logits/rejected": 2235788800.0, + "logps/chosen": -277.43316650390625, + "logps/rejected": -451.8218994140625, + "loss": 0.1494, + "rewards/chosen": 1.4298688173294067, + "rewards/margins": 9.353927254676819, + "rewards/rejected": -7.924058437347412, + "step": 1018 + }, + { + "epoch": 0.3761709196622214, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 7.037087780872134e-06, + "logits/chosen": 1762543802.1818182, + "logits/rejected": 1822557135.2380953, + "logps/chosen": -230.17649147727272, + "logps/rejected": -531.8423549107143, + "loss": 0.0798, + "rewards/chosen": 1.6482153805819424, + "rewards/margins": 9.225164727215127, + "rewards/rejected": -7.576949346633184, + "step": 1019 + }, + { + "epoch": 0.3765400766000646, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.031711634456954e-06, + "logits/chosen": 1740394222.9333334, + "logits/rejected": 1744674695.5294118, + "logps/chosen": -280.4731119791667, + "logps/rejected": -521.5599724264706, + "loss": 0.1718, + "rewards/chosen": 0.8565059026082357, + "rewards/margins": 10.729071676964853, + "rewards/rejected": -9.872565774356618, + "step": 1020 + }, + { + "epoch": 0.3769092335379078, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.02633267313112e-06, + "logits/chosen": 1977146669.1764705, + "logits/rejected": 1873325397.3333333, + "logps/chosen": -350.4914981617647, + "logps/rejected": -631.047265625, + "loss": 0.1623, + "rewards/chosen": 1.2001571655273438, + "rewards/margins": 20.739378356933592, + "rewards/rejected": -19.53922119140625, + "step": 1021 + }, + { + "epoch": 0.377278390475751, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 7.02095090434711e-06, + "logits/chosen": 1923323611.4285715, + "logits/rejected": 1513963292.4444444, + "logps/chosen": -205.41287667410714, + "logps/rejected": -464.9054361979167, + "loss": 0.1326, + "rewards/chosen": 1.5992021560668945, + "rewards/margins": 8.810722033182781, + "rewards/rejected": -7.211519877115886, + "step": 1022 + }, + { + "epoch": 0.3776475474135942, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.015566335561297e-06, + "logits/chosen": 1366490538.6666667, + "logits/rejected": 1401238937.6, + "logps/chosen": -274.3295491536458, + "logps/rejected": -413.497119140625, + "loss": 0.1514, + "rewards/chosen": 0.5374922752380371, + "rewards/margins": 7.9245329856872555, + "rewards/rejected": -7.387040710449218, + "step": 1023 + }, + { + "epoch": 0.3780167043514374, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.010178974233936e-06, + "logits/chosen": 1271549520.8421052, + "logits/rejected": 1499073142.1538463, + "logps/chosen": -288.74043996710526, + "logps/rejected": -354.60006009615387, + "loss": 0.2372, + "rewards/chosen": 1.1480709879021895, + "rewards/margins": 6.171163991395279, + "rewards/rejected": -5.023093003493089, + "step": 1024 + }, + { + "epoch": 0.3783858612892806, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.004788827829143e-06, + "logits/chosen": 1838691766.857143, + "logits/rejected": 2093924807.1111112, + "logps/chosen": -237.09024483816964, + "logps/rejected": -469.8660481770833, + "loss": 0.1701, + "rewards/chosen": 0.9613872255597796, + "rewards/margins": 8.38368014683799, + "rewards/rejected": -7.422292921278212, + "step": 1025 + }, + { + "epoch": 0.3787550182271238, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 6.9993959038149e-06, + "logits/chosen": 1867516635.4285715, + "logits/rejected": 1856136988.4444444, + "logps/chosen": -284.51790945870533, + "logps/rejected": -479.26719835069446, + "loss": 0.1954, + "rewards/chosen": 0.5043069635118756, + "rewards/margins": 8.398740726803977, + "rewards/rejected": -7.894433763292101, + "step": 1026 + }, + { + "epoch": 0.379124175164967, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 6.994000209663037e-06, + "logits/chosen": 2062204723.2, + "logits/rejected": 2233994240.0, + "logps/chosen": -256.8276611328125, + "logps/rejected": -490.1243489583333, + "loss": 0.2158, + "rewards/chosen": 0.913912582397461, + "rewards/margins": 8.102145640055339, + "rewards/rejected": -7.188233057657878, + "step": 1027 + }, + { + "epoch": 0.3794933321028102, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.988601752849213e-06, + "logits/chosen": 1167296585.142857, + "logits/rejected": 1973685589.3333333, + "logps/chosen": -240.83367047991072, + "logps/rejected": -483.53038194444446, + "loss": 0.1569, + "rewards/chosen": 1.602776391165597, + "rewards/margins": 8.96336785573808, + "rewards/rejected": -7.360591464572483, + "step": 1028 + }, + { + "epoch": 0.3798624890406534, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.983200540852928e-06, + "logits/chosen": 1465306663.3846154, + "logits/rejected": 2209368602.9473686, + "logps/chosen": -253.55885667067307, + "logps/rejected": -618.7204975328947, + "loss": 0.1161, + "rewards/chosen": 1.612320533165565, + "rewards/margins": 12.05262413488226, + "rewards/rejected": -10.440303601716694, + "step": 1029 + }, + { + "epoch": 0.3802316459784966, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.97779658115749e-06, + "logits/chosen": 1983917624.8888888, + "logits/rejected": 2281856146.285714, + "logps/chosen": -245.47466362847223, + "logps/rejected": -456.01175362723217, + "loss": 0.1951, + "rewards/chosen": 1.3230375713772244, + "rewards/margins": 7.345101780361599, + "rewards/rejected": -6.022064208984375, + "step": 1030 + }, + { + "epoch": 0.3806008029163398, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 6.972389881250015e-06, + "logits/chosen": 1558298781.5384614, + "logits/rejected": 2149740328.4210525, + "logps/chosen": -351.7357647235577, + "logps/rejected": -441.39828330592104, + "loss": 0.1155, + "rewards/chosen": 1.8236981905423677, + "rewards/margins": 8.840064446453141, + "rewards/rejected": -7.016366255910773, + "step": 1031 + }, + { + "epoch": 0.380969959854183, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 6.9669804486214196e-06, + "logits/chosen": 2142004077.7142856, + "logits/rejected": 3051310193.7777777, + "logps/chosen": -287.0013427734375, + "logps/rejected": -575.4581705729166, + "loss": 0.1923, + "rewards/chosen": 0.3978395462036133, + "rewards/margins": 9.325613763597277, + "rewards/rejected": -8.927774217393663, + "step": 1032 + }, + { + "epoch": 0.3813391167920262, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.9615682907664025e-06, + "logits/chosen": 1474775771.4285715, + "logits/rejected": 1737034865.7777777, + "logps/chosen": -281.27083914620533, + "logps/rejected": -519.3575303819445, + "loss": 0.1237, + "rewards/chosen": 1.2917416436331612, + "rewards/margins": 9.480021537296356, + "rewards/rejected": -8.188279893663195, + "step": 1033 + }, + { + "epoch": 0.38170827372986943, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 6.95615341518344e-06, + "logits/chosen": 1503608217.6, + "logits/rejected": 1627995477.3333333, + "logps/chosen": -283.79169921875, + "logps/rejected": -479.4219563802083, + "loss": 0.2233, + "rewards/chosen": 0.972289752960205, + "rewards/margins": 8.359754276275634, + "rewards/rejected": -7.38746452331543, + "step": 1034 + }, + { + "epoch": 0.3820774306677126, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.950735829374773e-06, + "logits/chosen": 1944830313.4117646, + "logits/rejected": 1608161416.5333333, + "logps/chosen": -276.91906020220586, + "logps/rejected": -425.09485677083336, + "loss": 0.1686, + "rewards/chosen": 1.1956363004796646, + "rewards/margins": 8.467504179711437, + "rewards/rejected": -7.271867879231771, + "step": 1035 + }, + { + "epoch": 0.38244658760555583, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.9453155408464005e-06, + "logits/chosen": 1364251884.3076923, + "logits/rejected": 2067474539.7894738, + "logps/chosen": -307.3048565204327, + "logps/rejected": -421.2173622532895, + "loss": 0.1127, + "rewards/chosen": 1.5592793684739332, + "rewards/margins": 8.88919366226505, + "rewards/rejected": -7.329914293791118, + "step": 1036 + }, + { + "epoch": 0.382815744543399, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 6.939892557108059e-06, + "logits/chosen": 1341420836.5714285, + "logits/rejected": 1569915107.5555556, + "logps/chosen": -217.47148786272322, + "logps/rejected": -457.4826388888889, + "loss": 0.1035, + "rewards/chosen": 2.421404702322824, + "rewards/margins": 9.182301173134455, + "rewards/rejected": -6.760896470811632, + "step": 1037 + }, + { + "epoch": 0.38318490148124223, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 6.9344668856732255e-06, + "logits/chosen": 1595065344.0, + "logits/rejected": 1447868586.6666667, + "logps/chosen": -261.9731201171875, + "logps/rejected": -594.255859375, + "loss": 0.179, + "rewards/chosen": 1.436758041381836, + "rewards/margins": 9.232110977172852, + "rewards/rejected": -7.795352935791016, + "step": 1038 + }, + { + "epoch": 0.3835540584190854, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.9290385340591e-06, + "logits/chosen": 1886173952.0, + "logits/rejected": 1903924992.0, + "logps/chosen": -272.93597412109375, + "logps/rejected": -523.7559814453125, + "loss": 0.1671, + "rewards/chosen": 1.059628963470459, + "rewards/margins": 10.914914608001709, + "rewards/rejected": -9.85528564453125, + "step": 1039 + }, + { + "epoch": 0.38392321535692864, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.923607509786593e-06, + "logits/chosen": 1697210368.0, + "logits/rejected": 1431115414.5882354, + "logps/chosen": -282.9158203125, + "logps/rejected": -464.20978860294116, + "loss": 0.1563, + "rewards/chosen": 1.1723981221516928, + "rewards/margins": 7.068711060168696, + "rewards/rejected": -5.896312938017004, + "step": 1040 + }, + { + "epoch": 0.3842923722947718, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 6.918173820380321e-06, + "logits/chosen": 1955246694.4, + "logits/rejected": 1858172245.3333333, + "logps/chosen": -320.8313232421875, + "logps/rejected": -463.9962158203125, + "loss": 0.2125, + "rewards/chosen": 1.063975715637207, + "rewards/margins": 9.645189984639487, + "rewards/rejected": -8.58121426900228, + "step": 1041 + }, + { + "epoch": 0.38466152923261504, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.91273747336859e-06, + "logits/chosen": 1671425675.6363637, + "logits/rejected": 1863190235.4285715, + "logps/chosen": -403.3275035511364, + "logps/rejected": -477.48902529761904, + "loss": 0.1058, + "rewards/chosen": 1.5428071455522017, + "rewards/margins": 8.234510578634419, + "rewards/rejected": -6.691703433082218, + "step": 1042 + }, + { + "epoch": 0.3850306861704582, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.907298476283392e-06, + "logits/chosen": 1792460559.0588236, + "logits/rejected": 1976245452.8, + "logps/chosen": -289.27001953125, + "logps/rejected": -495.2573567708333, + "loss": 0.1556, + "rewards/chosen": 1.134869519401999, + "rewards/margins": 9.29150048050226, + "rewards/rejected": -8.15663096110026, + "step": 1043 + }, + { + "epoch": 0.38539984310830144, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.901856836660386e-06, + "logits/chosen": 1665567597.7142856, + "logits/rejected": 1531277994.6666667, + "logps/chosen": -308.95382254464283, + "logps/rejected": -477.3097330729167, + "loss": 0.1233, + "rewards/chosen": 1.54719420841762, + "rewards/margins": 9.833128656659808, + "rewards/rejected": -8.285934448242188, + "step": 1044 + }, + { + "epoch": 0.3857690000461446, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.896412562038897e-06, + "logits/chosen": 2135352115.2, + "logits/rejected": 1951150592.0, + "logps/chosen": -316.8733154296875, + "logps/rejected": -463.3391927083333, + "loss": 0.1678, + "rewards/chosen": 1.4995895385742188, + "rewards/margins": 7.670197423299154, + "rewards/rejected": -6.170607884724935, + "step": 1045 + }, + { + "epoch": 0.3861381569839878, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.890965659961897e-06, + "logits/chosen": 1855345371.4285715, + "logits/rejected": 1641295872.0, + "logps/chosen": -273.76109095982144, + "logps/rejected": -410.3990071614583, + "loss": 0.1632, + "rewards/chosen": 0.7023242541721889, + "rewards/margins": 7.484789023323665, + "rewards/rejected": -6.782464769151476, + "step": 1046 + }, + { + "epoch": 0.386507313921831, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.885516137975998e-06, + "logits/chosen": 2374526005.894737, + "logits/rejected": 2854545880.6153846, + "logps/chosen": -295.58277652138156, + "logps/rejected": -551.6556865985577, + "loss": 0.2131, + "rewards/chosen": 1.1426232990465666, + "rewards/margins": 10.180677545215438, + "rewards/rejected": -9.03805424616887, + "step": 1047 + }, + { + "epoch": 0.3868764708596742, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.880064003631446e-06, + "logits/chosen": 1734248075.6363637, + "logits/rejected": 1319384576.0, + "logps/chosen": -261.5970348011364, + "logps/rejected": -555.1615234375, + "loss": 0.1929, + "rewards/chosen": 1.5112571716308594, + "rewards/margins": 9.255259704589843, + "rewards/rejected": -7.7440025329589846, + "step": 1048 + }, + { + "epoch": 0.3872456277975174, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 6.874609264482103e-06, + "logits/chosen": 2079796428.8, + "logits/rejected": 2374291456.0, + "logps/chosen": -295.87216796875, + "logps/rejected": -528.7972005208334, + "loss": 0.1862, + "rewards/chosen": 1.1997077941894532, + "rewards/margins": 9.661069361368815, + "rewards/rejected": -8.461361567179361, + "step": 1049 + }, + { + "epoch": 0.3876147847353606, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 6.8691519280854406e-06, + "logits/chosen": 1482780535.4666667, + "logits/rejected": 1877646637.1764705, + "logps/chosen": -258.8557454427083, + "logps/rejected": -474.5862821691176, + "loss": 0.1284, + "rewards/chosen": 1.8471700032552083, + "rewards/margins": 9.155250339882047, + "rewards/rejected": -7.308080336626838, + "step": 1050 + }, + { + "epoch": 0.3879839416732038, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 6.863692002002529e-06, + "logits/chosen": 1874244494.2222223, + "logits/rejected": 1743058505.142857, + "logps/chosen": -264.25013563368054, + "logps/rejected": -465.16074916294644, + "loss": 0.1696, + "rewards/chosen": 1.35261779361301, + "rewards/margins": 8.799388597881983, + "rewards/rejected": -7.446770804268973, + "step": 1051 + }, + { + "epoch": 0.388353098611047, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.858229493798026e-06, + "logits/chosen": 2008383078.4, + "logits/rejected": 1645460901.6470587, + "logps/chosen": -297.67291666666665, + "logps/rejected": -458.19264131433823, + "loss": 0.1667, + "rewards/chosen": 1.3095297495524088, + "rewards/margins": 8.976888147989909, + "rewards/rejected": -7.6673583984375, + "step": 1052 + }, + { + "epoch": 0.3887222555488902, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 6.85276441104017e-06, + "logits/chosen": 1269208632.8888888, + "logits/rejected": 1441612214.857143, + "logps/chosen": -200.60045030381946, + "logps/rejected": -424.145751953125, + "loss": 0.1084, + "rewards/chosen": 2.139698028564453, + "rewards/margins": 9.366568429129465, + "rewards/rejected": -7.226870400565011, + "step": 1053 + }, + { + "epoch": 0.3890914124867334, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 6.84729676130076e-06, + "logits/chosen": 1720410298.1818182, + "logits/rejected": 1806025094.0952382, + "logps/chosen": -294.91455078125, + "logps/rejected": -408.66866629464283, + "loss": 0.1352, + "rewards/chosen": 0.9500854665582831, + "rewards/margins": 8.02888330649504, + "rewards/rejected": -7.078797839936756, + "step": 1054 + }, + { + "epoch": 0.38946056942457663, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 6.841826552155158e-06, + "logits/chosen": 1740043264.0, + "logits/rejected": 1787326634.6666667, + "logps/chosen": -295.114697265625, + "logps/rejected": -522.4771321614584, + "loss": 0.1849, + "rewards/chosen": 1.6446212768554687, + "rewards/margins": 8.710415903727213, + "rewards/rejected": -7.065794626871745, + "step": 1055 + }, + { + "epoch": 0.3898297263624198, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 6.836353791182266e-06, + "logits/chosen": 1634993421.4736843, + "logits/rejected": 1509501085.5384614, + "logps/chosen": -320.8282534950658, + "logps/rejected": -481.29627403846155, + "loss": 0.1987, + "rewards/chosen": 1.2330954702276933, + "rewards/margins": 8.729897580166094, + "rewards/rejected": -7.496802109938401, + "step": 1056 + }, + { + "epoch": 0.39019888330026303, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.830878485964528e-06, + "logits/chosen": 2385185370.352941, + "logits/rejected": 2357308620.8, + "logps/chosen": -290.01447610294116, + "logps/rejected": -517.9828450520833, + "loss": 0.1509, + "rewards/chosen": 1.3951729045194738, + "rewards/margins": 9.56887082118614, + "rewards/rejected": -8.173697916666667, + "step": 1057 + }, + { + "epoch": 0.3905680402381062, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 6.8254006440879094e-06, + "logits/chosen": 1822228187.4285715, + "logits/rejected": 2376029980.4444447, + "logps/chosen": -262.7651890345982, + "logps/rejected": -467.51605902777777, + "loss": 0.1095, + "rewards/chosen": 1.7289127622331892, + "rewards/margins": 9.106157499646384, + "rewards/rejected": -7.377244737413195, + "step": 1058 + }, + { + "epoch": 0.39093719717594944, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 6.81992027314189e-06, + "logits/chosen": 1610216960.0, + "logits/rejected": 1451137433.6, + "logps/chosen": -352.8995768229167, + "logps/rejected": -535.891162109375, + "loss": 0.1011, + "rewards/chosen": 1.7764005661010742, + "rewards/margins": 9.954036521911622, + "rewards/rejected": -8.177635955810548, + "step": 1059 + }, + { + "epoch": 0.3913063541137926, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 6.814437380719453e-06, + "logits/chosen": 1706885376.0, + "logits/rejected": 1430523008.0, + "logps/chosen": -338.99603271484375, + "logps/rejected": -497.7025146484375, + "loss": 0.167, + "rewards/chosen": 1.2853343486785889, + "rewards/margins": 9.400489568710327, + "rewards/rejected": -8.115155220031738, + "step": 1060 + }, + { + "epoch": 0.39167551105163584, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 6.808951974417077e-06, + "logits/chosen": 1654338901.3333333, + "logits/rejected": 2470924288.0, + "logps/chosen": -320.02239583333335, + "logps/rejected": -551.1536649816177, + "loss": 0.1571, + "rewards/chosen": 1.0757904052734375, + "rewards/margins": 8.820557538200827, + "rewards/rejected": -7.7447671329273895, + "step": 1061 + }, + { + "epoch": 0.392044667989479, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.803464061834725e-06, + "logits/chosen": 2183733679.1578946, + "logits/rejected": 2154804775.3846154, + "logps/chosen": -259.61973170230266, + "logps/rejected": -475.3329326923077, + "loss": 0.2198, + "rewards/chosen": 0.9142705013877467, + "rewards/margins": 10.85560867371347, + "rewards/rejected": -9.941338172325722, + "step": 1062 + }, + { + "epoch": 0.39241382492732224, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 6.7979736505758264e-06, + "logits/chosen": 1948117248.0, + "logits/rejected": 2163497472.0, + "logps/chosen": -257.1861267089844, + "logps/rejected": -734.1687622070312, + "loss": 0.1596, + "rewards/chosen": 1.0611233711242676, + "rewards/margins": 12.813782215118408, + "rewards/rejected": -11.75265884399414, + "step": 1063 + }, + { + "epoch": 0.3927829818651654, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.792480748247278e-06, + "logits/chosen": 1448152832.0, + "logits/rejected": 1541000064.0, + "logps/chosen": -203.82208251953125, + "logps/rejected": -502.36737060546875, + "loss": 0.188, + "rewards/chosen": 0.7844272255897522, + "rewards/margins": 7.859462797641754, + "rewards/rejected": -7.075035572052002, + "step": 1064 + }, + { + "epoch": 0.39315213880300864, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.786985362459427e-06, + "logits/chosen": 1916399856.9411764, + "logits/rejected": 1841463022.9333334, + "logps/chosen": -308.50281479779414, + "logps/rejected": -474.8029296875, + "loss": 0.2352, + "rewards/chosen": 0.532837475047392, + "rewards/margins": 7.137090099559111, + "rewards/rejected": -6.604252624511719, + "step": 1065 + }, + { + "epoch": 0.3935212957408518, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 6.78148750082606e-06, + "logits/chosen": 2029342720.0, + "logits/rejected": 1566383274.6666667, + "logps/chosen": -267.5640380859375, + "logps/rejected": -443.1219482421875, + "loss": 0.1779, + "rewards/chosen": 1.6598033905029297, + "rewards/margins": 8.844360987345379, + "rewards/rejected": -7.184557596842448, + "step": 1066 + }, + { + "epoch": 0.39389045267869505, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.7759871709643934e-06, + "logits/chosen": 2633280170.6666665, + "logits/rejected": 1716884684.8, + "logps/chosen": -270.5099690755208, + "logps/rejected": -476.720458984375, + "loss": 0.1225, + "rewards/chosen": 1.2277071475982666, + "rewards/margins": 8.100663995742797, + "rewards/rejected": -6.872956848144531, + "step": 1067 + }, + { + "epoch": 0.3942596096165382, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.770484380495064e-06, + "logits/chosen": 1452035449.2631578, + "logits/rejected": 1376616132.9230769, + "logps/chosen": -297.52467105263156, + "logps/rejected": -344.2648737980769, + "loss": 0.1763, + "rewards/chosen": 1.3346168116519326, + "rewards/margins": 7.819943725338832, + "rewards/rejected": -6.485326913686899, + "step": 1068 + }, + { + "epoch": 0.39462876655438145, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 6.76497913704212e-06, + "logits/chosen": 2628989513.142857, + "logits/rejected": 1922882673.7777777, + "logps/chosen": -251.04506138392858, + "logps/rejected": -419.33685980902777, + "loss": 0.1447, + "rewards/chosen": 1.1268036024911063, + "rewards/margins": 8.774278103359162, + "rewards/rejected": -7.647474500868055, + "step": 1069 + }, + { + "epoch": 0.3949979234922246, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.759471448233008e-06, + "logits/chosen": 2149958354.8235292, + "logits/rejected": 2147723400.5333333, + "logps/chosen": -309.3968864889706, + "logps/rejected": -390.9126953125, + "loss": 0.1881, + "rewards/chosen": 1.1064137851490694, + "rewards/margins": 6.825656135409486, + "rewards/rejected": -5.719242350260417, + "step": 1070 + }, + { + "epoch": 0.39536708043006785, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 6.7539613216985555e-06, + "logits/chosen": 1769832334.2222223, + "logits/rejected": 2350473216.0, + "logps/chosen": -332.0930989583333, + "logps/rejected": -547.8415178571429, + "loss": 0.1829, + "rewards/chosen": 1.2105256186591253, + "rewards/margins": 7.974713567703489, + "rewards/rejected": -6.764187949044364, + "step": 1071 + }, + { + "epoch": 0.395736237367911, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 6.748448765072977e-06, + "logits/chosen": 2189282304.0, + "logits/rejected": 1654270208.0, + "logps/chosen": -322.883544921875, + "logps/rejected": -451.36541748046875, + "loss": 0.1766, + "rewards/chosen": 1.4144606590270996, + "rewards/margins": 8.264976024627686, + "rewards/rejected": -6.850515365600586, + "step": 1072 + }, + { + "epoch": 0.39610539430575425, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.742933785993847e-06, + "logits/chosen": 1477750784.0, + "logits/rejected": 1608406471.1111112, + "logps/chosen": -354.78745814732144, + "logps/rejected": -488.7117513020833, + "loss": 0.153, + "rewards/chosen": 0.7984057835170201, + "rewards/margins": 8.426434653145927, + "rewards/rejected": -7.628028869628906, + "step": 1073 + }, + { + "epoch": 0.39647455124359743, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.737416392102101e-06, + "logits/chosen": 1754761728.0, + "logits/rejected": 3110345472.0, + "logps/chosen": -344.156005859375, + "logps/rejected": -457.4859924316406, + "loss": 0.1588, + "rewards/chosen": 1.0564942359924316, + "rewards/margins": 7.976099491119385, + "rewards/rejected": -6.919605255126953, + "step": 1074 + }, + { + "epoch": 0.39684370818144066, + "grad_norm": 11.5, + "kl": 0.10857439041137695, + "learning_rate": 6.731896591042016e-06, + "logits/chosen": 2848242892.8, + "logits/rejected": 1513277098.6666667, + "logps/chosen": -259.1068603515625, + "logps/rejected": -470.3874104817708, + "loss": 0.1405, + "rewards/chosen": 1.8974687576293945, + "rewards/margins": 10.167544873555501, + "rewards/rejected": -8.270076115926107, + "step": 1075 + }, + { + "epoch": 0.39721286511928383, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.72637439046121e-06, + "logits/chosen": 2200587008.0, + "logits/rejected": 2018576384.0, + "logps/chosen": -231.5769805908203, + "logps/rejected": -546.85107421875, + "loss": 0.1844, + "rewards/chosen": 0.7452334761619568, + "rewards/margins": 8.770974099636078, + "rewards/rejected": -8.025740623474121, + "step": 1076 + }, + { + "epoch": 0.39758202205712706, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 6.720849798010618e-06, + "logits/chosen": 1728916736.0, + "logits/rejected": 1202708736.0, + "logps/chosen": -316.3252868652344, + "logps/rejected": -330.36627197265625, + "loss": 0.1605, + "rewards/chosen": 1.364596962928772, + "rewards/margins": 8.332844853401184, + "rewards/rejected": -6.968247890472412, + "step": 1077 + }, + { + "epoch": 0.39795117899497023, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 6.715322821344495e-06, + "logits/chosen": 1815720960.0, + "logits/rejected": 1506452593.7777777, + "logps/chosen": -261.01661900111606, + "logps/rejected": -412.3838161892361, + "loss": 0.1259, + "rewards/chosen": 1.493248394557408, + "rewards/margins": 7.944241402641175, + "rewards/rejected": -6.450993008083767, + "step": 1078 + }, + { + "epoch": 0.39832033593281346, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 6.709793468120395e-06, + "logits/chosen": 2138135130.3529413, + "logits/rejected": 1652556868.2666667, + "logps/chosen": -280.15062040441177, + "logps/rejected": -414.19469401041664, + "loss": 0.1463, + "rewards/chosen": 1.7847302380730123, + "rewards/margins": 9.132459730260512, + "rewards/rejected": -7.3477294921875, + "step": 1079 + }, + { + "epoch": 0.39868949287065664, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.704261745999168e-06, + "logits/chosen": 1866493952.0, + "logits/rejected": 2490449510.4, + "logps/chosen": -253.5846280184659, + "logps/rejected": -545.456640625, + "loss": 0.1757, + "rewards/chosen": 1.9148472872647373, + "rewards/margins": 9.670248898592863, + "rewards/rejected": -7.755401611328125, + "step": 1080 + }, + { + "epoch": 0.39905864980849987, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.698727662644944e-06, + "logits/chosen": 1572910518.857143, + "logits/rejected": 2027579619.5555556, + "logps/chosen": -346.2440708705357, + "logps/rejected": -478.9582248263889, + "loss": 0.1502, + "rewards/chosen": 0.9923221043178013, + "rewards/margins": 9.176119304838636, + "rewards/rejected": -8.183797200520834, + "step": 1081 + }, + { + "epoch": 0.39942780674634304, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 6.693191225725125e-06, + "logits/chosen": 2154774869.3333335, + "logits/rejected": 1602721484.8, + "logps/chosen": -318.2677815755208, + "logps/rejected": -487.480615234375, + "loss": 0.1724, + "rewards/chosen": 0.2615639567375183, + "rewards/margins": 7.575173270702362, + "rewards/rejected": -7.313609313964844, + "step": 1082 + }, + { + "epoch": 0.3997969636841862, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.687652442910375e-06, + "logits/chosen": 2295648665.6, + "logits/rejected": 2079275861.3333333, + "logps/chosen": -308.50146484375, + "logps/rejected": -525.5748291015625, + "loss": 0.1521, + "rewards/chosen": 1.5182968139648438, + "rewards/margins": 8.581021245320638, + "rewards/rejected": -7.062724431355794, + "step": 1083 + }, + { + "epoch": 0.40016612062202944, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.682111321874608e-06, + "logits/chosen": 1779618579.6923077, + "logits/rejected": 2139230854.7368422, + "logps/chosen": -297.64881310096155, + "logps/rejected": -444.95101768092104, + "loss": 0.1467, + "rewards/chosen": 1.1541243333082933, + "rewards/margins": 8.995461838448096, + "rewards/rejected": -7.841337505139802, + "step": 1084 + }, + { + "epoch": 0.4005352775598726, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.6765678702949744e-06, + "logits/chosen": 1722460882.8235295, + "logits/rejected": 2046736520.5333333, + "logps/chosen": -292.67155905330884, + "logps/rejected": -606.8057942708333, + "loss": 0.1613, + "rewards/chosen": 1.4810222176944507, + "rewards/margins": 8.706855856203566, + "rewards/rejected": -7.225833638509115, + "step": 1085 + }, + { + "epoch": 0.40090443449771584, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.671022095851857e-06, + "logits/chosen": 1377177088.0, + "logits/rejected": 1273354880.0, + "logps/chosen": -238.42919921875, + "logps/rejected": -338.3040771484375, + "loss": 0.1703, + "rewards/chosen": 1.0728797912597656, + "rewards/margins": 8.594537734985352, + "rewards/rejected": -7.521657943725586, + "step": 1086 + }, + { + "epoch": 0.401273591435559, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 6.6654740062288555e-06, + "logits/chosen": 1826151243.2941177, + "logits/rejected": 1477414638.9333334, + "logps/chosen": -319.5673828125, + "logps/rejected": -465.1244791666667, + "loss": 0.1747, + "rewards/chosen": 1.2964708664838005, + "rewards/margins": 8.54504264382755, + "rewards/rejected": -7.24857177734375, + "step": 1087 + }, + { + "epoch": 0.40164274837340225, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.65992360911278e-06, + "logits/chosen": 1716000904.5333333, + "logits/rejected": 2282586593.882353, + "logps/chosen": -318.74274088541665, + "logps/rejected": -422.5145048253676, + "loss": 0.2004, + "rewards/chosen": 0.6620027542114257, + "rewards/margins": 7.636162443721996, + "rewards/rejected": -6.97415968951057, + "step": 1088 + }, + { + "epoch": 0.4020119053112454, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.654370912193633e-06, + "logits/chosen": 1798950619.4285715, + "logits/rejected": 1833331598.2222223, + "logps/chosen": -239.75355747767858, + "logps/rejected": -413.25634765625, + "loss": 0.1621, + "rewards/chosen": 1.013793672834124, + "rewards/margins": 8.6400084419856, + "rewards/rejected": -7.626214769151476, + "step": 1089 + }, + { + "epoch": 0.40238106224908865, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.648815923164604e-06, + "logits/chosen": 1368820940.8, + "logits/rejected": 1533021485.1764705, + "logps/chosen": -258.4552734375, + "logps/rejected": -451.1934168198529, + "loss": 0.1343, + "rewards/chosen": 1.4674625396728516, + "rewards/margins": 9.484920389512006, + "rewards/rejected": -8.017457849839154, + "step": 1090 + }, + { + "epoch": 0.4027502191869318, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 6.6432586497220615e-06, + "logits/chosen": 2034860714.6666667, + "logits/rejected": 1664781165.7142856, + "logps/chosen": -329.9111328125, + "logps/rejected": -445.34852818080356, + "loss": 0.1826, + "rewards/chosen": 0.9422001308865018, + "rewards/margins": 8.085133022732204, + "rewards/rejected": -7.142932891845703, + "step": 1091 + }, + { + "epoch": 0.40311937612477505, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.637699099565538e-06, + "logits/chosen": 1102140928.0, + "logits/rejected": 1957234176.0, + "logps/chosen": -234.69979858398438, + "logps/rejected": -398.2297668457031, + "loss": 0.1144, + "rewards/chosen": 2.1419057846069336, + "rewards/margins": 8.355353355407715, + "rewards/rejected": -6.213447570800781, + "step": 1092 + }, + { + "epoch": 0.4034885330626182, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 6.632137280397719e-06, + "logits/chosen": 1345742336.0, + "logits/rejected": 1185961472.0, + "logps/chosen": -221.3385467529297, + "logps/rejected": -418.79638671875, + "loss": 0.1473, + "rewards/chosen": 1.4392707347869873, + "rewards/margins": 9.306103944778442, + "rewards/rejected": -7.866833209991455, + "step": 1093 + }, + { + "epoch": 0.40385769000046146, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 6.626573199924433e-06, + "logits/chosen": 1883031688.5333333, + "logits/rejected": 1826290989.1764705, + "logps/chosen": -256.943408203125, + "logps/rejected": -432.3770392922794, + "loss": 0.2093, + "rewards/chosen": 0.6236879348754882, + "rewards/margins": 8.390810988931095, + "rewards/rejected": -7.767123054055607, + "step": 1094 + }, + { + "epoch": 0.40422684693830463, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 6.621006865854645e-06, + "logits/chosen": 1363621614.9333334, + "logits/rejected": 2099916318.1176472, + "logps/chosen": -264.11180013020834, + "logps/rejected": -482.5666934742647, + "loss": 0.1792, + "rewards/chosen": 1.009035873413086, + "rewards/margins": 9.644103083891029, + "rewards/rejected": -8.635067210477942, + "step": 1095 + }, + { + "epoch": 0.40459600387614786, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 6.6154382859004385e-06, + "logits/chosen": 1671596544.0, + "logits/rejected": 1756755763.2, + "logps/chosen": -300.86431884765625, + "logps/rejected": -456.12861328125, + "loss": 0.0506, + "rewards/chosen": 2.6266072591145835, + "rewards/margins": 11.428383382161458, + "rewards/rejected": -8.801776123046874, + "step": 1096 + }, + { + "epoch": 0.40496516081399103, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.609867467777011e-06, + "logits/chosen": 1575780096.0, + "logits/rejected": 1986036608.0, + "logps/chosen": -247.15362548828125, + "logps/rejected": -409.39166259765625, + "loss": 0.1368, + "rewards/chosen": 1.9149789810180664, + "rewards/margins": 8.450450420379639, + "rewards/rejected": -6.535471439361572, + "step": 1097 + }, + { + "epoch": 0.40533431775183426, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.60429441920266e-06, + "logits/chosen": 1234863581.8666666, + "logits/rejected": 1268639141.6470587, + "logps/chosen": -260.67286783854166, + "logps/rejected": -417.61669921875, + "loss": 0.1488, + "rewards/chosen": 1.2188986460367839, + "rewards/margins": 8.177619582531499, + "rewards/rejected": -6.958720936494715, + "step": 1098 + }, + { + "epoch": 0.40570347468967743, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 6.598719147898773e-06, + "logits/chosen": 2582913570.133333, + "logits/rejected": 2258682578.8235292, + "logps/chosen": -320.8715494791667, + "logps/rejected": -518.5440027573529, + "loss": 0.1175, + "rewards/chosen": 1.4747879028320312, + "rewards/margins": 9.46514793844784, + "rewards/rejected": -7.990360035615809, + "step": 1099 + }, + { + "epoch": 0.40607263162752066, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.593141661589819e-06, + "logits/chosen": 1318332416.0, + "logits/rejected": 1603574272.0, + "logps/chosen": -260.5073547363281, + "logps/rejected": -502.38873291015625, + "loss": 0.155, + "rewards/chosen": 1.172070026397705, + "rewards/margins": 9.341768741607666, + "rewards/rejected": -8.169698715209961, + "step": 1100 + }, + { + "epoch": 0.40644178856536384, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 6.5875619680033334e-06, + "logits/chosen": 1675709312.0, + "logits/rejected": 1772520960.0, + "logps/chosen": -321.2685852050781, + "logps/rejected": -467.0329895019531, + "loss": 0.1751, + "rewards/chosen": 0.9003105163574219, + "rewards/margins": 8.461559295654297, + "rewards/rejected": -7.561248779296875, + "step": 1101 + }, + { + "epoch": 0.40681094550320707, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 6.581980074869911e-06, + "logits/chosen": 1714092544.0, + "logits/rejected": 1687138676.3636363, + "logps/chosen": -249.962890625, + "logps/rejected": -459.38325639204544, + "loss": 0.0833, + "rewards/chosen": 1.6970787048339844, + "rewards/margins": 10.40854887528853, + "rewards/rejected": -8.711470170454545, + "step": 1102 + }, + { + "epoch": 0.40718010244105024, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 6.576395989923193e-06, + "logits/chosen": 2520461702.095238, + "logits/rejected": 3754214865.4545455, + "logps/chosen": -299.0514322916667, + "logps/rejected": -549.5311168323864, + "loss": 0.1738, + "rewards/chosen": 1.3261250995454335, + "rewards/margins": 9.291498745674694, + "rewards/rejected": -7.965373646129262, + "step": 1103 + }, + { + "epoch": 0.40754925937889347, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.57080972089986e-06, + "logits/chosen": 1685218560.0, + "logits/rejected": 2441829376.0, + "logps/chosen": -269.3581237792969, + "logps/rejected": -613.5254516601562, + "loss": 0.1814, + "rewards/chosen": 0.9856081008911133, + "rewards/margins": 9.273841857910156, + "rewards/rejected": -8.288233757019043, + "step": 1104 + }, + { + "epoch": 0.40791841631673664, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.565221275539615e-06, + "logits/chosen": 1720788352.0, + "logits/rejected": 3060720128.0, + "logps/chosen": -287.57244873046875, + "logps/rejected": -524.9459228515625, + "loss": 0.1628, + "rewards/chosen": 1.0667322874069214, + "rewards/margins": 9.874749064445496, + "rewards/rejected": -8.808016777038574, + "step": 1105 + }, + { + "epoch": 0.40828757325457987, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.559630661585179e-06, + "logits/chosen": 2250348544.0, + "logits/rejected": 2399702016.0, + "logps/chosen": -246.15711975097656, + "logps/rejected": -493.6845703125, + "loss": 0.1979, + "rewards/chosen": 0.7520021200180054, + "rewards/margins": 9.300316214561462, + "rewards/rejected": -8.548314094543457, + "step": 1106 + }, + { + "epoch": 0.40865673019242305, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 6.554037886782276e-06, + "logits/chosen": 1768274688.0, + "logits/rejected": 1983123968.0, + "logps/chosen": -282.6739196777344, + "logps/rejected": -639.4232177734375, + "loss": 0.168, + "rewards/chosen": 1.048548936843872, + "rewards/margins": 11.228220224380493, + "rewards/rejected": -10.179671287536621, + "step": 1107 + }, + { + "epoch": 0.4090258871302663, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 6.548442958879624e-06, + "logits/chosen": 1799554779.4285715, + "logits/rejected": 1885458059.6363637, + "logps/chosen": -349.1602492559524, + "logps/rejected": -376.32967862215907, + "loss": 0.1789, + "rewards/chosen": 1.561465127127511, + "rewards/margins": 8.300678550422965, + "rewards/rejected": -6.739213423295454, + "step": 1108 + }, + { + "epoch": 0.40939504406810945, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 6.542845885628926e-06, + "logits/chosen": 2072133427.2, + "logits/rejected": 1951218005.3333333, + "logps/chosen": -334.244482421875, + "logps/rejected": -544.6512858072916, + "loss": 0.2149, + "rewards/chosen": 0.9359611511230469, + "rewards/margins": 7.532897694905599, + "rewards/rejected": -6.596936543782552, + "step": 1109 + }, + { + "epoch": 0.4097642010059527, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.537246674784855e-06, + "logits/chosen": 1867153839.1578948, + "logits/rejected": 2346710567.3846154, + "logps/chosen": -256.1425010279605, + "logps/rejected": -416.83435997596155, + "loss": 0.1947, + "rewards/chosen": 1.641961750231291, + "rewards/margins": 10.085263611334055, + "rewards/rejected": -8.443301861102764, + "step": 1110 + }, + { + "epoch": 0.41013335794379585, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 6.531645334105045e-06, + "logits/chosen": 1257903182.7692308, + "logits/rejected": 1269046864.8421052, + "logps/chosen": -291.4960186298077, + "logps/rejected": -465.91976768092104, + "loss": 0.118, + "rewards/chosen": 1.589037381685697, + "rewards/margins": 8.692923233094003, + "rewards/rejected": -7.103885851408306, + "step": 1111 + }, + { + "epoch": 0.4105025148816391, + "grad_norm": 12.25, + "kl": 0.1269521713256836, + "learning_rate": 6.526041871350086e-06, + "logits/chosen": 2045406617.6, + "logits/rejected": 2009409706.6666667, + "logps/chosen": -269.8741943359375, + "logps/rejected": -413.7782389322917, + "loss": 0.1711, + "rewards/chosen": 1.3508588790893554, + "rewards/margins": 8.308771959940593, + "rewards/rejected": -6.957913080851237, + "step": 1112 + }, + { + "epoch": 0.41087167181948225, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 6.520436294283503e-06, + "logits/chosen": 1492258816.0, + "logits/rejected": 1909271371.2941177, + "logps/chosen": -370.37067057291665, + "logps/rejected": -499.5703699448529, + "loss": 0.1596, + "rewards/chosen": 1.0874961853027343, + "rewards/margins": 9.080233001708985, + "rewards/rejected": -7.99273681640625, + "step": 1113 + }, + { + "epoch": 0.4112408287573255, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.514828610671751e-06, + "logits/chosen": 1138387171.5555556, + "logits/rejected": 1366036480.0, + "logps/chosen": -258.1832682291667, + "logps/rejected": -452.3655482700893, + "loss": 0.1647, + "rewards/chosen": 1.5192832946777344, + "rewards/margins": 8.75993183680943, + "rewards/rejected": -7.240648542131696, + "step": 1114 + }, + { + "epoch": 0.41160998569516866, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.509218828284203e-06, + "logits/chosen": 1768484224.0, + "logits/rejected": 1837243520.0, + "logps/chosen": -242.58375549316406, + "logps/rejected": -434.974365234375, + "loss": 0.2303, + "rewards/chosen": 0.6882071495056152, + "rewards/margins": 8.065849781036377, + "rewards/rejected": -7.377642631530762, + "step": 1115 + }, + { + "epoch": 0.4119791426330119, + "grad_norm": 12.875, + "kl": 1.3077964782714844, + "learning_rate": 6.503606954893143e-06, + "logits/chosen": 2068589146.3529413, + "logits/rejected": 1327237529.6, + "logps/chosen": -286.3373448988971, + "logps/rejected": -469.93639322916664, + "loss": 0.1656, + "rewards/chosen": 1.3158083523021025, + "rewards/margins": 9.355337262621113, + "rewards/rejected": -8.03952891031901, + "step": 1116 + }, + { + "epoch": 0.41234829957085506, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 6.497992998273751e-06, + "logits/chosen": 1952715008.0, + "logits/rejected": 2775755776.0, + "logps/chosen": -292.9701232910156, + "logps/rejected": -535.5264892578125, + "loss": 0.204, + "rewards/chosen": 0.6841689944267273, + "rewards/margins": 9.049020946025848, + "rewards/rejected": -8.364851951599121, + "step": 1117 + }, + { + "epoch": 0.41271745650869823, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.492376966204092e-06, + "logits/chosen": 2344901578.105263, + "logits/rejected": 1948000886.1538463, + "logps/chosen": -270.2896278782895, + "logps/rejected": -490.9521484375, + "loss": 0.1722, + "rewards/chosen": 1.1184669293855365, + "rewards/margins": 10.533515381909575, + "rewards/rejected": -9.415048452524038, + "step": 1118 + }, + { + "epoch": 0.41308661344654146, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 6.486758866465106e-06, + "logits/chosen": 2102126405.8181818, + "logits/rejected": 1895719731.2, + "logps/chosen": -262.9108220880682, + "logps/rejected": -314.0551025390625, + "loss": 0.2529, + "rewards/chosen": 0.9838359139182351, + "rewards/margins": 8.212189032814718, + "rewards/rejected": -7.228353118896484, + "step": 1119 + }, + { + "epoch": 0.41345577038438464, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.4811387068406e-06, + "logits/chosen": 2003318311.3846154, + "logits/rejected": 1924786714.9473684, + "logps/chosen": -337.1915940504808, + "logps/rejected": -437.0299650493421, + "loss": 0.0897, + "rewards/chosen": 2.399751369769757, + "rewards/margins": 9.86688233194081, + "rewards/rejected": -7.467130962171052, + "step": 1120 + }, + { + "epoch": 0.41382492732222786, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.475516495117233e-06, + "logits/chosen": 1753316944.8421052, + "logits/rejected": 1924960413.5384614, + "logps/chosen": -261.00318667763156, + "logps/rejected": -538.9020057091346, + "loss": 0.1668, + "rewards/chosen": 1.6847694798519737, + "rewards/margins": 10.288064439287071, + "rewards/rejected": -8.603294959435097, + "step": 1121 + }, + { + "epoch": 0.41419408426007104, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 6.4698922390845085e-06, + "logits/chosen": 1683193719.4666667, + "logits/rejected": 1687166976.0, + "logps/chosen": -337.6322265625, + "logps/rejected": -532.2019186580883, + "loss": 0.1827, + "rewards/chosen": 1.0114428202311199, + "rewards/margins": 9.718021692014208, + "rewards/rejected": -8.706578871783089, + "step": 1122 + }, + { + "epoch": 0.41456324119791427, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.464265946534762e-06, + "logits/chosen": 1628887040.0, + "logits/rejected": 1815603785.142857, + "logps/chosen": -242.24549696180554, + "logps/rejected": -485.71944754464283, + "loss": 0.1966, + "rewards/chosen": 1.2598820792304144, + "rewards/margins": 8.970887698824443, + "rewards/rejected": -7.711005619594029, + "step": 1123 + }, + { + "epoch": 0.41493239813575744, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 6.4586376252631485e-06, + "logits/chosen": 2302734336.0, + "logits/rejected": 1491904835.368421, + "logps/chosen": -267.0373722956731, + "logps/rejected": -527.2330900493421, + "loss": 0.1008, + "rewards/chosen": 1.6559059436504657, + "rewards/margins": 10.085405519616748, + "rewards/rejected": -8.429499575966283, + "step": 1124 + }, + { + "epoch": 0.41530155507360067, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 6.453007283067638e-06, + "logits/chosen": 1544921646.5454545, + "logits/rejected": 1785599317.3333333, + "logps/chosen": -323.65189985795456, + "logps/rejected": -469.6827101934524, + "loss": 0.1003, + "rewards/chosen": 1.6645911823619495, + "rewards/margins": 9.30596095659, + "rewards/rejected": -7.641369774228051, + "step": 1125 + }, + { + "epoch": 0.41567071201144384, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.447374927748997e-06, + "logits/chosen": 1922283648.0, + "logits/rejected": 2328636672.0, + "logps/chosen": -270.9356689453125, + "logps/rejected": -632.645263671875, + "loss": 0.1432, + "rewards/chosen": 1.3607394695281982, + "rewards/margins": 9.40466856956482, + "rewards/rejected": -8.043929100036621, + "step": 1126 + }, + { + "epoch": 0.4160398689492871, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.4417405671107826e-06, + "logits/chosen": 1603148686.2222223, + "logits/rejected": 1720604672.0, + "logps/chosen": -242.55327690972223, + "logps/rejected": -405.3080357142857, + "loss": 0.2098, + "rewards/chosen": 0.7402637269761827, + "rewards/margins": 7.627523619031149, + "rewards/rejected": -6.887259892054966, + "step": 1127 + }, + { + "epoch": 0.41640902588713025, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 6.4361042089593285e-06, + "logits/chosen": 1081351533.7142856, + "logits/rejected": 1631940608.0, + "logps/chosen": -161.29227120535714, + "logps/rejected": -487.464375, + "loss": 0.0474, + "rewards/chosen": 2.1523145948137556, + "rewards/margins": 9.27088759286063, + "rewards/rejected": -7.118572998046875, + "step": 1128 + }, + { + "epoch": 0.4167781828249735, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 6.43046586110374e-06, + "logits/chosen": 1291870663.1111112, + "logits/rejected": 1146556342.857143, + "logps/chosen": -195.44893391927084, + "logps/rejected": -457.042236328125, + "loss": 0.124, + "rewards/chosen": 1.9835215674506292, + "rewards/margins": 9.076082471817259, + "rewards/rejected": -7.092560904366629, + "step": 1129 + }, + { + "epoch": 0.41714733976281665, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 6.4248255313558735e-06, + "logits/chosen": 2056995498.6666667, + "logits/rejected": 1691546770.2857144, + "logps/chosen": -234.71310763888889, + "logps/rejected": -717.1671316964286, + "loss": 0.1393, + "rewards/chosen": 1.9324864281548395, + "rewards/margins": 12.466210531809974, + "rewards/rejected": -10.533724103655134, + "step": 1130 + }, + { + "epoch": 0.4175164967006599, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 6.419183227530336e-06, + "logits/chosen": 1991164550.7368422, + "logits/rejected": 2730601865.8461537, + "logps/chosen": -287.17655222039474, + "logps/rejected": -388.02700570913464, + "loss": 0.2119, + "rewards/chosen": 0.8276575991981908, + "rewards/margins": 8.972054454479139, + "rewards/rejected": -8.144396855280949, + "step": 1131 + }, + { + "epoch": 0.41788565363850305, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 6.413538957444468e-06, + "logits/chosen": 1639600600.6153846, + "logits/rejected": 1920802600.4210527, + "logps/chosen": -233.87603290264423, + "logps/rejected": -434.2275904605263, + "loss": 0.0978, + "rewards/chosen": 1.6472176771897535, + "rewards/margins": 8.774054353536382, + "rewards/rejected": -7.126836676346628, + "step": 1132 + }, + { + "epoch": 0.4182548105763463, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.407892728918333e-06, + "logits/chosen": 1551009069.1764705, + "logits/rejected": 1465116672.0, + "logps/chosen": -375.33645450367646, + "logps/rejected": -479.93095703125, + "loss": 0.203, + "rewards/chosen": 1.5076521705178654, + "rewards/margins": 8.756058135687136, + "rewards/rejected": -7.248405965169271, + "step": 1133 + }, + { + "epoch": 0.41862396751418945, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.402244549774707e-06, + "logits/chosen": 2049305429.3333333, + "logits/rejected": 1487737856.0, + "logps/chosen": -364.9890543619792, + "logps/rejected": -395.2612548828125, + "loss": 0.1162, + "rewards/chosen": 1.6487247149149578, + "rewards/margins": 8.443416182200114, + "rewards/rejected": -6.794691467285157, + "step": 1134 + }, + { + "epoch": 0.4189931244520327, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 6.396594427839076e-06, + "logits/chosen": 2071104398.2222223, + "logits/rejected": 2137403830.857143, + "logps/chosen": -371.79161241319446, + "logps/rejected": -409.3422154017857, + "loss": 0.1588, + "rewards/chosen": 1.3216544257269964, + "rewards/margins": 8.581192803761315, + "rewards/rejected": -7.2595383780343195, + "step": 1135 + }, + { + "epoch": 0.41936228138987586, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 6.3909423709396054e-06, + "logits/chosen": 1363735732.7058823, + "logits/rejected": 1484259874.1333334, + "logps/chosen": -237.30710018382354, + "logps/rejected": -466.32610677083335, + "loss": 0.176, + "rewards/chosen": 1.174126568962546, + "rewards/margins": 9.750303530225567, + "rewards/rejected": -8.576176961263021, + "step": 1136 + }, + { + "epoch": 0.4197314383277191, + "grad_norm": 14.0625, + "kl": 0.2199411392211914, + "learning_rate": 6.385288386907155e-06, + "logits/chosen": 2198811828.7058825, + "logits/rejected": 1326079590.4, + "logps/chosen": -334.3848230698529, + "logps/rejected": -417.2638346354167, + "loss": 0.1986, + "rewards/chosen": 0.8786048889160156, + "rewards/margins": 7.615520985921224, + "rewards/rejected": -6.736916097005208, + "step": 1137 + }, + { + "epoch": 0.42010059526556226, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 6.379632483575242e-06, + "logits/chosen": 1504016384.0, + "logits/rejected": 1379714944.0, + "logps/chosen": -230.348388671875, + "logps/rejected": -453.11279296875, + "loss": 0.1131, + "rewards/chosen": 2.285276174545288, + "rewards/margins": 9.45160698890686, + "rewards/rejected": -7.166330814361572, + "step": 1138 + }, + { + "epoch": 0.4204697522034055, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 6.373974668780053e-06, + "logits/chosen": 2059143031.4666667, + "logits/rejected": 1624866816.0, + "logps/chosen": -245.61163736979168, + "logps/rejected": -554.9939682904412, + "loss": 0.1344, + "rewards/chosen": 1.384009552001953, + "rewards/margins": 9.36359598496381, + "rewards/rejected": -7.979586432961857, + "step": 1139 + }, + { + "epoch": 0.42083890914124866, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 6.368314950360416e-06, + "logits/chosen": 1493679360.0, + "logits/rejected": 1566018048.0, + "logps/chosen": -250.75241088867188, + "logps/rejected": -399.3016662597656, + "loss": 0.1291, + "rewards/chosen": 1.5947717428207397, + "rewards/margins": 7.962820410728455, + "rewards/rejected": -6.368048667907715, + "step": 1140 + }, + { + "epoch": 0.4212080660790919, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.362653336157798e-06, + "logits/chosen": 1781087310.7692308, + "logits/rejected": 2392368181.894737, + "logps/chosen": -284.09262319711536, + "logps/rejected": -580.8717105263158, + "loss": 0.1431, + "rewards/chosen": 1.3495875138502855, + "rewards/margins": 8.396975691019282, + "rewards/rejected": -7.047388177168997, + "step": 1141 + }, + { + "epoch": 0.42157722301693507, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.356989834016296e-06, + "logits/chosen": 1847291041.6842105, + "logits/rejected": 1232314131.6923077, + "logps/chosen": -417.98812705592104, + "logps/rejected": -385.4050856370192, + "loss": 0.1951, + "rewards/chosen": 1.281013287995991, + "rewards/margins": 9.887054242585835, + "rewards/rejected": -8.606040954589844, + "step": 1142 + }, + { + "epoch": 0.4219463799547783, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.35132445178262e-06, + "logits/chosen": 2133992684.3076923, + "logits/rejected": 1827002044.631579, + "logps/chosen": -349.7681415264423, + "logps/rejected": -473.3071546052632, + "loss": 0.1165, + "rewards/chosen": 1.5746553861177885, + "rewards/margins": 8.605774481769515, + "rewards/rejected": -7.031119095651727, + "step": 1143 + }, + { + "epoch": 0.42231553689262147, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 6.3456571973060835e-06, + "logits/chosen": 1694877696.0, + "logits/rejected": 1631411053.7142856, + "logps/chosen": -258.93755425347223, + "logps/rejected": -392.53634207589283, + "loss": 0.1326, + "rewards/chosen": 1.6761817932128906, + "rewards/margins": 9.116954258510045, + "rewards/rejected": -7.440772465297154, + "step": 1144 + }, + { + "epoch": 0.4226846938304647, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.339988078438597e-06, + "logits/chosen": 1762287988.3636363, + "logits/rejected": 1539779925.3333333, + "logps/chosen": -252.0506258877841, + "logps/rejected": -419.8271019345238, + "loss": 0.1235, + "rewards/chosen": 1.0255888158624822, + "rewards/margins": 7.33895493379403, + "rewards/rejected": -6.3133661179315474, + "step": 1145 + }, + { + "epoch": 0.42305385076830787, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 6.3343171030346525e-06, + "logits/chosen": 2056606900.7058823, + "logits/rejected": 1524163515.7333333, + "logps/chosen": -380.4685489430147, + "logps/rejected": -541.4267252604167, + "loss": 0.1909, + "rewards/chosen": 0.7522559446447036, + "rewards/margins": 8.904037662580901, + "rewards/rejected": -8.151781717936197, + "step": 1146 + }, + { + "epoch": 0.4234230077061511, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 6.3286442789513135e-06, + "logits/chosen": 1846317933.7142856, + "logits/rejected": 1858965048.8888888, + "logps/chosen": -327.10030691964283, + "logps/rejected": -519.5309787326389, + "loss": 0.1448, + "rewards/chosen": 1.3853645324707031, + "rewards/margins": 9.764601389567057, + "rewards/rejected": -8.379236857096354, + "step": 1147 + }, + { + "epoch": 0.4237921646439943, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.322969614048207e-06, + "logits/chosen": 1845080473.6, + "logits/rejected": 2054415540.7058823, + "logps/chosen": -233.98995768229167, + "logps/rejected": -454.52306410845586, + "loss": 0.1709, + "rewards/chosen": 0.8968348185221354, + "rewards/margins": 8.678135980344285, + "rewards/rejected": -7.78130116182215, + "step": 1148 + }, + { + "epoch": 0.4241613215818375, + "grad_norm": 14.5625, + "kl": 0.6815147399902344, + "learning_rate": 6.317293116187508e-06, + "logits/chosen": 1318118741.3333333, + "logits/rejected": 1146506752.0, + "logps/chosen": -325.28130425347223, + "logps/rejected": -378.51283482142856, + "loss": 0.2119, + "rewards/chosen": 0.845975293053521, + "rewards/margins": 7.358902757129972, + "rewards/rejected": -6.5129274640764505, + "step": 1149 + }, + { + "epoch": 0.4245304785196807, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.311614793233932e-06, + "logits/chosen": 2105605120.0, + "logits/rejected": 1956215193.6, + "logps/chosen": -347.881591796875, + "logps/rejected": -528.3490234375, + "loss": 0.1155, + "rewards/chosen": 1.1439013481140137, + "rewards/margins": 9.072905254364013, + "rewards/rejected": -7.92900390625, + "step": 1150 + }, + { + "epoch": 0.4248996354575239, + "grad_norm": 11.5, + "kl": 1.2215385437011719, + "learning_rate": 6.3059346530547245e-06, + "logits/chosen": 1729642837.3333333, + "logits/rejected": 1619941990.4, + "logps/chosen": -352.1038818359375, + "logps/rejected": -416.00341796875, + "loss": 0.124, + "rewards/chosen": 1.104286829630534, + "rewards/margins": 8.16099764506022, + "rewards/rejected": -7.056710815429687, + "step": 1151 + }, + { + "epoch": 0.4252687923953671, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.300252703519647e-06, + "logits/chosen": 1930884573.8666666, + "logits/rejected": 1877269443.764706, + "logps/chosen": -250.41813151041666, + "logps/rejected": -459.25539981617646, + "loss": 0.1654, + "rewards/chosen": 0.9490040461222331, + "rewards/margins": 9.570912794973337, + "rewards/rejected": -8.621908748851103, + "step": 1152 + }, + { + "epoch": 0.4256379493332103, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 6.294568952500968e-06, + "logits/chosen": 2442897863.111111, + "logits/rejected": 1969817600.0, + "logps/chosen": -364.4619954427083, + "logps/rejected": -538.2336077008929, + "loss": 0.206, + "rewards/chosen": 1.1122061411539714, + "rewards/margins": 10.133271989368257, + "rewards/rejected": -9.021065848214286, + "step": 1153 + }, + { + "epoch": 0.4260071062710535, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.288883407873452e-06, + "logits/chosen": 1907285248.0, + "logits/rejected": 1914947840.0, + "logps/chosen": -231.92572021484375, + "logps/rejected": -615.7651977539062, + "loss": 0.1436, + "rewards/chosen": 1.3146599531173706, + "rewards/margins": 10.787593722343445, + "rewards/rejected": -9.472933769226074, + "step": 1154 + }, + { + "epoch": 0.42637626320889666, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.283196077514351e-06, + "logits/chosen": 1970119296.0, + "logits/rejected": 2374567936.0, + "logps/chosen": -249.85861206054688, + "logps/rejected": -459.8196716308594, + "loss": 0.1746, + "rewards/chosen": 1.0710588693618774, + "rewards/margins": 8.319616436958313, + "rewards/rejected": -7.2485575675964355, + "step": 1155 + }, + { + "epoch": 0.4267454201467399, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 6.277506969303387e-06, + "logits/chosen": 1930484224.0, + "logits/rejected": 1529655808.0, + "logps/chosen": -278.8389587402344, + "logps/rejected": -369.96929931640625, + "loss": 0.1074, + "rewards/chosen": 2.15155029296875, + "rewards/margins": 9.867129802703857, + "rewards/rejected": -7.715579509735107, + "step": 1156 + }, + { + "epoch": 0.42711457708458306, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.271816091122748e-06, + "logits/chosen": 1174500273.2307692, + "logits/rejected": 1485946880.0, + "logps/chosen": -267.9584209735577, + "logps/rejected": -403.4807771381579, + "loss": 0.1057, + "rewards/chosen": 1.9342938936673677, + "rewards/margins": 8.65736454025454, + "rewards/rejected": -6.723070646587171, + "step": 1157 + }, + { + "epoch": 0.4274837340224263, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.266123450857071e-06, + "logits/chosen": 1549263872.0, + "logits/rejected": 2493089536.0, + "logps/chosen": -362.7276916503906, + "logps/rejected": -615.659912109375, + "loss": 0.1158, + "rewards/chosen": 1.7271019220352173, + "rewards/margins": 8.561207890510559, + "rewards/rejected": -6.834105968475342, + "step": 1158 + }, + { + "epoch": 0.42785289096026946, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.26042905639344e-06, + "logits/chosen": 2171231393.6842103, + "logits/rejected": 1991832182.1538463, + "logps/chosen": -254.29970189144737, + "logps/rejected": -634.875, + "loss": 0.1962, + "rewards/chosen": 1.0531517831902755, + "rewards/margins": 9.369530449994661, + "rewards/rejected": -8.316378666804386, + "step": 1159 + }, + { + "epoch": 0.4282220478981127, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 6.254732915621365e-06, + "logits/chosen": 1854142122.6666667, + "logits/rejected": 2540871680.0, + "logps/chosen": -186.53206380208334, + "logps/rejected": -459.5015625, + "loss": 0.1267, + "rewards/chosen": 1.4769954681396484, + "rewards/margins": 8.428338241577148, + "rewards/rejected": -6.9513427734375, + "step": 1160 + }, + { + "epoch": 0.42859120483595586, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 6.249035036432776e-06, + "logits/chosen": 1165527040.0, + "logits/rejected": 2775265039.0588236, + "logps/chosen": -279.04212239583336, + "logps/rejected": -345.0897863051471, + "loss": 0.0813, + "rewards/chosen": 2.81498285929362, + "rewards/margins": 8.918571187935623, + "rewards/rejected": -6.103588328642004, + "step": 1161 + }, + { + "epoch": 0.4289603617737991, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.243335426722014e-06, + "logits/chosen": 1389212765.090909, + "logits/rejected": 1344073728.0, + "logps/chosen": -260.4420055042614, + "logps/rejected": -401.3618396577381, + "loss": 0.1538, + "rewards/chosen": 0.5857828747142445, + "rewards/margins": 7.569769138897652, + "rewards/rejected": -6.983986264183407, + "step": 1162 + }, + { + "epoch": 0.42932951871164227, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 6.237634094385814e-06, + "logits/chosen": 1910650790.9565217, + "logits/rejected": 2024341959.1111112, + "logps/chosen": -321.93546195652175, + "logps/rejected": -334.76860894097223, + "loss": 0.2052, + "rewards/chosen": 1.425300764000934, + "rewards/margins": 7.059296244008529, + "rewards/rejected": -5.633995480007595, + "step": 1163 + }, + { + "epoch": 0.4296986756494855, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.2319310473233e-06, + "logits/chosen": 1756033843.2, + "logits/rejected": 2160959969.882353, + "logps/chosen": -270.2632161458333, + "logps/rejected": -507.60403262867646, + "loss": 0.1608, + "rewards/chosen": 1.045746421813965, + "rewards/margins": 8.742213204327753, + "rewards/rejected": -7.696466782513787, + "step": 1164 + }, + { + "epoch": 0.43006783258732867, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.226226293435973e-06, + "logits/chosen": 1725159581.5384614, + "logits/rejected": 1544387745.6842105, + "logps/chosen": -374.47506009615387, + "logps/rejected": -594.3879009046053, + "loss": 0.1383, + "rewards/chosen": 1.0191868268526518, + "rewards/margins": 10.440421760806188, + "rewards/rejected": -9.421234933953537, + "step": 1165 + }, + { + "epoch": 0.4304369895251719, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 6.2205198406276946e-06, + "logits/chosen": 1835970087.3846154, + "logits/rejected": 2041349820.631579, + "logps/chosen": -229.0101036658654, + "logps/rejected": -564.3002158717105, + "loss": 0.0761, + "rewards/chosen": 2.8253951439490685, + "rewards/margins": 12.381373216266091, + "rewards/rejected": -9.555978072317023, + "step": 1166 + }, + { + "epoch": 0.43080614646301507, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.214811696804682e-06, + "logits/chosen": 1432108193.6842105, + "logits/rejected": 1391518326.1538463, + "logps/chosen": -330.2620785361842, + "logps/rejected": -332.88333834134613, + "loss": 0.1739, + "rewards/chosen": 1.5397603888260691, + "rewards/margins": 6.876531840335986, + "rewards/rejected": -5.336771451509916, + "step": 1167 + }, + { + "epoch": 0.4311753034008583, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.2091018698755e-06, + "logits/chosen": 1752609177.6, + "logits/rejected": 1628360192.0, + "logps/chosen": -329.652783203125, + "logps/rejected": -560.4564615885416, + "loss": 0.1757, + "rewards/chosen": 1.4416525840759278, + "rewards/margins": 7.585653591156006, + "rewards/rejected": -6.144001007080078, + "step": 1168 + }, + { + "epoch": 0.4315444603387015, + "grad_norm": 11.5625, + "kl": 0.725778341293335, + "learning_rate": 6.203390367751038e-06, + "logits/chosen": 1748932224.0, + "logits/rejected": 1559972608.0, + "logps/chosen": -222.55104064941406, + "logps/rejected": -576.6209716796875, + "loss": 0.1622, + "rewards/chosen": 1.1698040962219238, + "rewards/margins": 10.918261051177979, + "rewards/rejected": -9.748456954956055, + "step": 1169 + }, + { + "epoch": 0.4319136172765447, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 6.197677198344508e-06, + "logits/chosen": 1562490538.6666667, + "logits/rejected": 1583447478.857143, + "logps/chosen": -336.48046875, + "logps/rejected": -526.939208984375, + "loss": 0.1996, + "rewards/chosen": 1.3562950558132596, + "rewards/margins": 10.80140483190143, + "rewards/rejected": -9.44510977608817, + "step": 1170 + }, + { + "epoch": 0.4322827742143879, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 6.191962369571439e-06, + "logits/chosen": 1714684800.0, + "logits/rejected": 2165829632.0, + "logps/chosen": -200.8190460205078, + "logps/rejected": -616.0302734375, + "loss": 0.1569, + "rewards/chosen": 1.3629053831100464, + "rewards/margins": 10.023276209831238, + "rewards/rejected": -8.660370826721191, + "step": 1171 + }, + { + "epoch": 0.4326519311522311, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.18624588934965e-06, + "logits/chosen": 1825007616.0, + "logits/rejected": 1593575082.6666667, + "logps/chosen": -300.746240234375, + "logps/rejected": -549.9100341796875, + "loss": 0.2117, + "rewards/chosen": 1.0870254516601563, + "rewards/margins": 10.471473185221354, + "rewards/rejected": -9.384447733561197, + "step": 1172 + }, + { + "epoch": 0.4330210880900743, + "grad_norm": 13.0625, + "kl": 1.0147876739501953, + "learning_rate": 6.1805277655992514e-06, + "logits/chosen": 1357554609.2307692, + "logits/rejected": 1241522499.368421, + "logps/chosen": -325.1211688701923, + "logps/rejected": -377.4354697779605, + "loss": 0.1663, + "rewards/chosen": 0.9268924272977389, + "rewards/margins": 7.230636291658348, + "rewards/rejected": -6.303743864360609, + "step": 1173 + }, + { + "epoch": 0.4333902450279175, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.1748080062426345e-06, + "logits/chosen": 1546373658.9473684, + "logits/rejected": 1704660676.9230769, + "logps/chosen": -260.5208675986842, + "logps/rejected": -413.3268479567308, + "loss": 0.1868, + "rewards/chosen": 1.2665455466822575, + "rewards/margins": 8.056643296832497, + "rewards/rejected": -6.79009775015024, + "step": 1174 + }, + { + "epoch": 0.4337594019657607, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.169086619204447e-06, + "logits/chosen": 1653013390.2222223, + "logits/rejected": 1492339273.142857, + "logps/chosen": -289.9823404947917, + "logps/rejected": -521.7072405133929, + "loss": 0.1516, + "rewards/chosen": 1.5092287063598633, + "rewards/margins": 8.2974944795881, + "rewards/rejected": -6.788265773228237, + "step": 1175 + }, + { + "epoch": 0.4341285589036039, + "grad_norm": 11.375, + "kl": 0.17486953735351562, + "learning_rate": 6.1633636124116045e-06, + "logits/chosen": 2221824000.0, + "logits/rejected": 1360264533.3333333, + "logps/chosen": -238.27641950334822, + "logps/rejected": -437.22216796875, + "loss": 0.1297, + "rewards/chosen": 1.7954562050955636, + "rewards/margins": 8.397061090620737, + "rewards/rejected": -6.601604885525173, + "step": 1176 + }, + { + "epoch": 0.4344977158414471, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 6.157638993793257e-06, + "logits/chosen": 2236897280.0, + "logits/rejected": 2706335914.6666665, + "logps/chosen": -266.4365478515625, + "logps/rejected": -463.990478515625, + "loss": 0.22, + "rewards/chosen": 0.9851810455322265, + "rewards/margins": 8.468461227416991, + "rewards/rejected": -7.483280181884766, + "step": 1177 + }, + { + "epoch": 0.4348668727792903, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.15191277128079e-06, + "logits/chosen": 1442499145.142857, + "logits/rejected": 1629059072.0, + "logps/chosen": -277.12874930245533, + "logps/rejected": -392.89252387152777, + "loss": 0.1443, + "rewards/chosen": 1.0982837677001953, + "rewards/margins": 7.683296839396159, + "rewards/rejected": -6.585013071695964, + "step": 1178 + }, + { + "epoch": 0.4352360297171335, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 6.146184952807815e-06, + "logits/chosen": 1754633102.2222223, + "logits/rejected": 1636933778.2857144, + "logps/chosen": -227.54985894097223, + "logps/rejected": -406.6975795200893, + "loss": 0.2225, + "rewards/chosen": 0.820282088385688, + "rewards/margins": 6.5763398276435, + "rewards/rejected": -5.7560577392578125, + "step": 1179 + }, + { + "epoch": 0.4356051866549767, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.140455546310149e-06, + "logits/chosen": 2224008011.2941175, + "logits/rejected": 1498087833.6, + "logps/chosen": -214.51319795496323, + "logps/rejected": -478.83811848958334, + "loss": 0.1217, + "rewards/chosen": 1.7925473381491268, + "rewards/margins": 9.45401075774548, + "rewards/rejected": -7.6614634195963545, + "step": 1180 + }, + { + "epoch": 0.4359743435928199, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 6.134724559725812e-06, + "logits/chosen": 1237875858.2857144, + "logits/rejected": 1739870776.8888888, + "logps/chosen": -181.70908900669642, + "logps/rejected": -591.18359375, + "loss": 0.0739, + "rewards/chosen": 3.107684816632952, + "rewards/margins": 10.792802750118195, + "rewards/rejected": -7.685117933485243, + "step": 1181 + }, + { + "epoch": 0.4363435005306631, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 6.128992000995015e-06, + "logits/chosen": 1452509730.1333334, + "logits/rejected": 1326499237.6470587, + "logps/chosen": -254.52776692708332, + "logps/rejected": -488.2190946691176, + "loss": 0.1279, + "rewards/chosen": 1.8877037048339844, + "rewards/margins": 9.98043046839097, + "rewards/rejected": -8.092726763556986, + "step": 1182 + }, + { + "epoch": 0.4367126574685063, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 6.123257878060146e-06, + "logits/chosen": 1854752229.0526316, + "logits/rejected": 1756491776.0, + "logps/chosen": -320.4297388980263, + "logps/rejected": -609.3537785456731, + "loss": 0.1622, + "rewards/chosen": 1.3992160997892682, + "rewards/margins": 10.522290557984881, + "rewards/rejected": -9.123074458195614, + "step": 1183 + }, + { + "epoch": 0.4370818144063495, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 6.1175221988657555e-06, + "logits/chosen": 1513226555.0769231, + "logits/rejected": 2028709780.2105262, + "logps/chosen": -274.0032489483173, + "logps/rejected": -616.7845394736842, + "loss": 0.0994, + "rewards/chosen": 1.8532689901498647, + "rewards/margins": 11.015977473394107, + "rewards/rejected": -9.162708483244243, + "step": 1184 + }, + { + "epoch": 0.4374509713441927, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.111784971358556e-06, + "logits/chosen": 1832744487.3846154, + "logits/rejected": 1290767629.4736843, + "logps/chosen": -246.79479041466345, + "logps/rejected": -426.1707185444079, + "loss": 0.1309, + "rewards/chosen": 1.108178212092473, + "rewards/margins": 9.519169680020106, + "rewards/rejected": -8.410991467927632, + "step": 1185 + }, + { + "epoch": 0.4378201282820359, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.106046203487406e-06, + "logits/chosen": 1892622982.7368422, + "logits/rejected": 2296552054.1538463, + "logps/chosen": -276.08280222039474, + "logps/rejected": -506.60385366586536, + "loss": 0.1655, + "rewards/chosen": 1.5301130194413035, + "rewards/margins": 9.17572001407021, + "rewards/rejected": -7.645606994628906, + "step": 1186 + }, + { + "epoch": 0.4381892852198791, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.100305903203292e-06, + "logits/chosen": 2090680064.0, + "logits/rejected": 1524462720.0, + "logps/chosen": -303.1239318847656, + "logps/rejected": -413.6661071777344, + "loss": 0.1732, + "rewards/chosen": 0.900115430355072, + "rewards/margins": 7.91779226064682, + "rewards/rejected": -7.017676830291748, + "step": 1187 + }, + { + "epoch": 0.43855844215772233, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 6.094564078459329e-06, + "logits/chosen": 1995865916.952381, + "logits/rejected": 1719082077.090909, + "logps/chosen": -327.9398484002976, + "logps/rejected": -439.54909446022725, + "loss": 0.2154, + "rewards/chosen": 1.2101111639113653, + "rewards/margins": 8.796088198046663, + "rewards/rejected": -7.585977034135298, + "step": 1188 + }, + { + "epoch": 0.4389275990955655, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.08882073721074e-06, + "logits/chosen": 2015229952.0, + "logits/rejected": 2145516066.1333334, + "logps/chosen": -220.79549632352942, + "logps/rejected": -523.5539713541667, + "loss": 0.1503, + "rewards/chosen": 1.4565528420840992, + "rewards/margins": 10.015811875287223, + "rewards/rejected": -8.559259033203125, + "step": 1189 + }, + { + "epoch": 0.4392967560334087, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 6.083075887414854e-06, + "logits/chosen": 2800536576.0, + "logits/rejected": 2805027072.0, + "logps/chosen": -593.7982177734375, + "logps/rejected": -464.11492919921875, + "loss": 0.2057, + "rewards/chosen": 0.6975958347320557, + "rewards/margins": 7.765184640884399, + "rewards/rejected": -7.067588806152344, + "step": 1190 + }, + { + "epoch": 0.4396659129712519, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 6.077329537031087e-06, + "logits/chosen": 1896302592.0, + "logits/rejected": 1878963404.8, + "logps/chosen": -281.3272298177083, + "logps/rejected": -414.824365234375, + "loss": 0.1053, + "rewards/chosen": 2.070777098337809, + "rewards/margins": 9.892878691355387, + "rewards/rejected": -7.822101593017578, + "step": 1191 + }, + { + "epoch": 0.4400350699090951, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 6.071581694020933e-06, + "logits/chosen": 2042647424.0, + "logits/rejected": 2000086912.0, + "logps/chosen": -241.7680206298828, + "logps/rejected": -453.7569274902344, + "loss": 0.1614, + "rewards/chosen": 1.0097142457962036, + "rewards/margins": 8.50803005695343, + "rewards/rejected": -7.498315811157227, + "step": 1192 + }, + { + "epoch": 0.4404042268469383, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 6.0658323663479555e-06, + "logits/chosen": 1755452928.0, + "logits/rejected": 2008418688.0, + "logps/chosen": -343.9593200683594, + "logps/rejected": -417.07623291015625, + "loss": 0.1973, + "rewards/chosen": 0.8431195020675659, + "rewards/margins": 6.240869879722595, + "rewards/rejected": -5.397750377655029, + "step": 1193 + }, + { + "epoch": 0.4407733837847815, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 6.060081561977778e-06, + "logits/chosen": 2562091264.0, + "logits/rejected": 2177996544.0, + "logps/chosen": -266.28472900390625, + "logps/rejected": -485.1036376953125, + "loss": 0.1919, + "rewards/chosen": 0.9154738783836365, + "rewards/margins": 8.303599774837494, + "rewards/rejected": -7.388125896453857, + "step": 1194 + }, + { + "epoch": 0.4411425407226247, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.054329288878062e-06, + "logits/chosen": 1566706619.7333333, + "logits/rejected": 1717107049.4117646, + "logps/chosen": -323.4928385416667, + "logps/rejected": -401.29279641544116, + "loss": 0.1434, + "rewards/chosen": 1.609241739908854, + "rewards/margins": 8.901518458946079, + "rewards/rejected": -7.292276719037225, + "step": 1195 + }, + { + "epoch": 0.4415116976604679, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 6.048575555018512e-06, + "logits/chosen": 1431801582.9333334, + "logits/rejected": 1437397714.8235295, + "logps/chosen": -228.38050130208333, + "logps/rejected": -339.2764246323529, + "loss": 0.1725, + "rewards/chosen": 1.303792953491211, + "rewards/margins": 7.495886297786937, + "rewards/rejected": -6.192093344295726, + "step": 1196 + }, + { + "epoch": 0.4418808545983111, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 6.042820368370854e-06, + "logits/chosen": 2360157735.3846154, + "logits/rejected": 1598738647.5789473, + "logps/chosen": -298.84664212740387, + "logps/rejected": -570.661287006579, + "loss": 0.1197, + "rewards/chosen": 1.3980539762056792, + "rewards/margins": 10.39228374465757, + "rewards/rejected": -8.994229768451891, + "step": 1197 + }, + { + "epoch": 0.4422500115361543, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 6.037063736908822e-06, + "logits/chosen": 2263678603.6363635, + "logits/rejected": 2211574150.095238, + "logps/chosen": -303.0801890980114, + "logps/rejected": -555.9850260416666, + "loss": 0.1034, + "rewards/chosen": 1.2730738032947888, + "rewards/margins": 9.049683678201783, + "rewards/rejected": -7.776609874906994, + "step": 1198 + }, + { + "epoch": 0.4426191684739975, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.03130566860816e-06, + "logits/chosen": 1346461696.0, + "logits/rejected": 1491568298.6666667, + "logps/chosen": -231.18781389508928, + "logps/rejected": -500.9045138888889, + "loss": 0.0867, + "rewards/chosen": 2.0051404408046176, + "rewards/margins": 9.647707545568071, + "rewards/rejected": -7.6425671047634545, + "step": 1199 + }, + { + "epoch": 0.4429883254118407, + "grad_norm": 12.125, + "kl": 0.2620217800140381, + "learning_rate": 6.025546171446599e-06, + "logits/chosen": 1541022606.2222223, + "logits/rejected": 2020823186.2857144, + "logps/chosen": -365.2447916666667, + "logps/rejected": -341.2520228794643, + "loss": 0.1454, + "rewards/chosen": 1.557430585225423, + "rewards/margins": 8.61750280289423, + "rewards/rejected": -7.0600722176688055, + "step": 1200 + }, + { + "epoch": 0.4433574823496839, + "grad_norm": 13.1875, + "kl": 0.7655420303344727, + "learning_rate": 6.019785253403843e-06, + "logits/chosen": 1700520448.0, + "logits/rejected": 1358886741.3333333, + "logps/chosen": -285.6713623046875, + "logps/rejected": -472.9306233723958, + "loss": 0.1732, + "rewards/chosen": 1.501136016845703, + "rewards/margins": 10.205138397216796, + "rewards/rejected": -8.704002380371094, + "step": 1201 + }, + { + "epoch": 0.4437266392875271, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 6.0140229224615765e-06, + "logits/chosen": 1855772069.6470587, + "logits/rejected": 1800440490.6666667, + "logps/chosen": -329.4501378676471, + "logps/rejected": -474.51067708333335, + "loss": 0.1685, + "rewards/chosen": 1.2871527952306412, + "rewards/margins": 8.473191003238455, + "rewards/rejected": -7.186038208007813, + "step": 1202 + }, + { + "epoch": 0.4440957962253703, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 6.008259186603434e-06, + "logits/chosen": 2020252672.0, + "logits/rejected": 1473277824.0, + "logps/chosen": -278.72344970703125, + "logps/rejected": -345.509765625, + "loss": 0.1417, + "rewards/chosen": 1.5811870098114014, + "rewards/margins": 7.500243902206421, + "rewards/rejected": -5.9190568923950195, + "step": 1203 + }, + { + "epoch": 0.4444649531632135, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 6.0024940538149965e-06, + "logits/chosen": 1819806254.5454545, + "logits/rejected": 1489909467.4285715, + "logps/chosen": -251.67808948863637, + "logps/rejected": -482.00320870535717, + "loss": 0.1343, + "rewards/chosen": 1.0286538384177468, + "rewards/margins": 10.236009882642076, + "rewards/rejected": -9.20735604422433, + "step": 1204 + }, + { + "epoch": 0.4448341101010567, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5.996727532083786e-06, + "logits/chosen": 2133358796.8, + "logits/rejected": 2853421614.5454545, + "logps/chosen": -275.001953125, + "logps/rejected": -515.6335671164773, + "loss": 0.1006, + "rewards/chosen": 1.579201889038086, + "rewards/margins": 9.978601386330343, + "rewards/rejected": -8.399399497292258, + "step": 1205 + }, + { + "epoch": 0.4452032670388999, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.990959629399242e-06, + "logits/chosen": 2673478997.3333335, + "logits/rejected": 1972337078.857143, + "logps/chosen": -249.64501953125, + "logps/rejected": -447.3184291294643, + "loss": 0.1886, + "rewards/chosen": 1.1158578660753038, + "rewards/margins": 8.043473440503316, + "rewards/rejected": -6.927615574428013, + "step": 1206 + }, + { + "epoch": 0.4455724239767431, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5.9851903537527225e-06, + "logits/chosen": 2054583637.3333333, + "logits/rejected": 3197068008.7272725, + "logps/chosen": -291.3203590029762, + "logps/rejected": -519.6983309659091, + "loss": 0.1963, + "rewards/chosen": 1.041286831810361, + "rewards/margins": 9.012508755638486, + "rewards/rejected": -7.971221923828125, + "step": 1207 + }, + { + "epoch": 0.4459415809145863, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.979419713137484e-06, + "logits/chosen": 2014196736.0, + "logits/rejected": 1931886796.8, + "logps/chosen": -407.2604166666667, + "logps/rejected": -561.581396484375, + "loss": 0.1302, + "rewards/chosen": 1.230411132176717, + "rewards/margins": 9.07372473080953, + "rewards/rejected": -7.843313598632813, + "step": 1208 + }, + { + "epoch": 0.44631073785242953, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.973647715548676e-06, + "logits/chosen": 1875393792.0, + "logits/rejected": 1684791296.0, + "logps/chosen": -256.8343811035156, + "logps/rejected": -413.5158386230469, + "loss": 0.1744, + "rewards/chosen": 1.0216689109802246, + "rewards/margins": 7.533850193023682, + "rewards/rejected": -6.512181282043457, + "step": 1209 + }, + { + "epoch": 0.4466798947902727, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5.9678743689833284e-06, + "logits/chosen": 1364298330.3529413, + "logits/rejected": 1680835652.2666667, + "logps/chosen": -332.80807674632354, + "logps/rejected": -441.7998372395833, + "loss": 0.1681, + "rewards/chosen": 1.2386263679055607, + "rewards/margins": 11.259316268621706, + "rewards/rejected": -10.020689900716146, + "step": 1210 + }, + { + "epoch": 0.44704905172811593, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.962099681440341e-06, + "logits/chosen": 2271398229.3333335, + "logits/rejected": 2435773440.0, + "logps/chosen": -319.8719482421875, + "logps/rejected": -505.247216796875, + "loss": 0.0856, + "rewards/chosen": 1.6468556722005208, + "rewards/margins": 11.313663228352866, + "rewards/rejected": -9.666807556152344, + "step": 1211 + }, + { + "epoch": 0.4474182086659591, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.9563236609204655e-06, + "logits/chosen": 1256605157.0526316, + "logits/rejected": 1234640659.6923077, + "logps/chosen": -254.10264185855263, + "logps/rejected": -389.03331580528845, + "loss": 0.1772, + "rewards/chosen": 1.2331807989823191, + "rewards/margins": 10.453341063217596, + "rewards/rejected": -9.220160264235277, + "step": 1212 + }, + { + "epoch": 0.44778736560380233, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5.950546315426309e-06, + "logits/chosen": 1574508646.4, + "logits/rejected": 1581717333.3333333, + "logps/chosen": -347.7767578125, + "logps/rejected": -580.7117919921875, + "loss": 0.2246, + "rewards/chosen": 0.8461013793945312, + "rewards/margins": 9.856765365600586, + "rewards/rejected": -9.010663986206055, + "step": 1213 + }, + { + "epoch": 0.4481565225416455, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5.944767652962309e-06, + "logits/chosen": 2185421902.769231, + "logits/rejected": 1601608218.9473684, + "logps/chosen": -295.25940880408655, + "logps/rejected": -493.65244654605266, + "loss": 0.1338, + "rewards/chosen": 1.616997792170598, + "rewards/margins": 9.221936500024217, + "rewards/rejected": -7.604938707853618, + "step": 1214 + }, + { + "epoch": 0.44852567947948874, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.938987681534729e-06, + "logits/chosen": 2318612889.6, + "logits/rejected": 2851488466.8235292, + "logps/chosen": -284.07607421875, + "logps/rejected": -543.5099379595588, + "loss": 0.1784, + "rewards/chosen": 0.6491394678751627, + "rewards/margins": 9.000372557546578, + "rewards/rejected": -8.351233089671416, + "step": 1215 + }, + { + "epoch": 0.4488948364173319, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5.933206409151646e-06, + "logits/chosen": 2260613864.7272725, + "logits/rejected": 1400607695.2380953, + "logps/chosen": -260.8787730823864, + "logps/rejected": -366.06719680059524, + "loss": 0.1068, + "rewards/chosen": 1.41098031130704, + "rewards/margins": 8.437018344928692, + "rewards/rejected": -7.026038033621652, + "step": 1216 + }, + { + "epoch": 0.44926399335517514, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.92742384382294e-06, + "logits/chosen": 1553157324.8, + "logits/rejected": 1821359646.1176472, + "logps/chosen": -244.955078125, + "logps/rejected": -444.84593290441177, + "loss": 0.1391, + "rewards/chosen": 1.2986845652262369, + "rewards/margins": 9.09821190179563, + "rewards/rejected": -7.799527336569393, + "step": 1217 + }, + { + "epoch": 0.4496331502930183, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.92163999356028e-06, + "logits/chosen": 1328993673.8461537, + "logits/rejected": 2256255299.368421, + "logps/chosen": -232.06120417668268, + "logps/rejected": -557.3095703125, + "loss": 0.0339, + "rewards/chosen": 3.691144503079928, + "rewards/margins": 13.032181651003448, + "rewards/rejected": -9.34103714792352, + "step": 1218 + }, + { + "epoch": 0.45000230723086154, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5.91585486637712e-06, + "logits/chosen": 1228726994.8235295, + "logits/rejected": 1830348800.0, + "logps/chosen": -292.02780330882354, + "logps/rejected": -344.55579427083336, + "loss": 0.1261, + "rewards/chosen": 1.8638019561767578, + "rewards/margins": 8.369182205200195, + "rewards/rejected": -6.505380249023437, + "step": 1219 + }, + { + "epoch": 0.4503714641687047, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.910068470288677e-06, + "logits/chosen": 1787162624.0, + "logits/rejected": 1427194311.1111112, + "logps/chosen": -252.86805943080358, + "logps/rejected": -470.22157118055554, + "loss": 0.154, + "rewards/chosen": 0.7780662264142718, + "rewards/margins": 8.091146060398646, + "rewards/rejected": -7.313079833984375, + "step": 1220 + }, + { + "epoch": 0.45074062110654795, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5.90428081331193e-06, + "logits/chosen": 1609466948.2666667, + "logits/rejected": 2006268626.8235295, + "logps/chosen": -263.45774739583334, + "logps/rejected": -443.23733340992646, + "loss": 0.1032, + "rewards/chosen": 2.535178629557292, + "rewards/margins": 10.641681386910234, + "rewards/rejected": -8.106502757352942, + "step": 1221 + }, + { + "epoch": 0.4511097780443911, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.898491903465607e-06, + "logits/chosen": 2119557632.0, + "logits/rejected": 2010020864.0, + "logps/chosen": -302.6974182128906, + "logps/rejected": -505.8480224609375, + "loss": 0.1362, + "rewards/chosen": 1.3110463619232178, + "rewards/margins": 9.3225839138031, + "rewards/rejected": -8.011537551879883, + "step": 1222 + }, + { + "epoch": 0.45147893498223435, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 5.892701748770165e-06, + "logits/chosen": 1501518994.2857144, + "logits/rejected": 1429623921.7777777, + "logps/chosen": -336.3925083705357, + "logps/rejected": -448.51752387152777, + "loss": 0.1696, + "rewards/chosen": 1.1946919304983956, + "rewards/margins": 9.690975567651174, + "rewards/rejected": -8.496283637152779, + "step": 1223 + }, + { + "epoch": 0.4518480919200775, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.886910357247792e-06, + "logits/chosen": 1445423513.6, + "logits/rejected": 1527701504.0, + "logps/chosen": -281.47978515625, + "logps/rejected": -417.58289292279414, + "loss": 0.1531, + "rewards/chosen": 1.0559834798177083, + "rewards/margins": 8.95943124808517, + "rewards/rejected": -7.903447768267463, + "step": 1224 + }, + { + "epoch": 0.45221724885792075, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5.8811177369223895e-06, + "logits/chosen": 1963946456.6153846, + "logits/rejected": 2457097701.0526314, + "logps/chosen": -239.3133826622596, + "logps/rejected": -444.90316611842104, + "loss": 0.1389, + "rewards/chosen": 1.289617685171274, + "rewards/margins": 7.403787960407705, + "rewards/rejected": -6.114170275236431, + "step": 1225 + }, + { + "epoch": 0.4525864057957639, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.875323895819554e-06, + "logits/chosen": 1622376155.4285715, + "logits/rejected": 1542382819.5555556, + "logps/chosen": -342.06675502232144, + "logps/rejected": -461.17632378472223, + "loss": 0.143, + "rewards/chosen": 1.1816917146955217, + "rewards/margins": 8.13701101333376, + "rewards/rejected": -6.955319298638238, + "step": 1226 + }, + { + "epoch": 0.4529555627336071, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.869528841966583e-06, + "logits/chosen": 1349430067.2, + "logits/rejected": 1569057008.9411764, + "logps/chosen": -242.981396484375, + "logps/rejected": -439.3377470128676, + "loss": 0.1575, + "rewards/chosen": 1.3296928405761719, + "rewards/margins": 9.372089834774242, + "rewards/rejected": -8.04239699419807, + "step": 1227 + }, + { + "epoch": 0.4533247196714503, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.8637325833924494e-06, + "logits/chosen": 1174395562.6666667, + "logits/rejected": 1660332393.4117646, + "logps/chosen": -212.13362630208334, + "logps/rejected": -518.6707835477941, + "loss": 0.1155, + "rewards/chosen": 2.1738690694173175, + "rewards/margins": 10.203831945681104, + "rewards/rejected": -8.029962876263786, + "step": 1228 + }, + { + "epoch": 0.4536938766092935, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.857935128127793e-06, + "logits/chosen": 2365410123.2941175, + "logits/rejected": 2166744951.4666667, + "logps/chosen": -343.05193014705884, + "logps/rejected": -352.9173177083333, + "loss": 0.1634, + "rewards/chosen": 1.4876788644229664, + "rewards/margins": 8.633831106447706, + "rewards/rejected": -7.14615224202474, + "step": 1229 + }, + { + "epoch": 0.45406303354713673, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5.852136484204918e-06, + "logits/chosen": 2396326912.0, + "logits/rejected": 2090340352.0, + "logps/chosen": -288.4384765625, + "logps/rejected": -531.9009399414062, + "loss": 0.0932, + "rewards/chosen": 1.9437556266784668, + "rewards/margins": 11.144826412200928, + "rewards/rejected": -9.201070785522461, + "step": 1230 + }, + { + "epoch": 0.4544321904849799, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.8463366596577706e-06, + "logits/chosen": 2382004614.095238, + "logits/rejected": 2396348788.3636365, + "logps/chosen": -180.46914527529762, + "logps/rejected": -475.4094904119318, + "loss": 0.1904, + "rewards/chosen": 1.9556323460170202, + "rewards/margins": 9.508001897242162, + "rewards/rejected": -7.552369551225142, + "step": 1231 + }, + { + "epoch": 0.45480134742282313, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.8405356625219335e-06, + "logits/chosen": 2065018441.142857, + "logits/rejected": 1884366620.4444444, + "logps/chosen": -293.97073800223217, + "logps/rejected": -516.9618055555555, + "loss": 0.1156, + "rewards/chosen": 2.0288102286202565, + "rewards/margins": 9.391472649952721, + "rewards/rejected": -7.362662421332465, + "step": 1232 + }, + { + "epoch": 0.4551705043606663, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5.834733500834615e-06, + "logits/chosen": 1148140701.5384614, + "logits/rejected": 1146724244.2105262, + "logps/chosen": -203.06114783653845, + "logps/rejected": -367.587890625, + "loss": 0.1449, + "rewards/chosen": 1.4231803600604718, + "rewards/margins": 8.270568376610637, + "rewards/rejected": -6.847388016550164, + "step": 1233 + }, + { + "epoch": 0.45553966129850954, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5.8289301826346375e-06, + "logits/chosen": 1455251335.5294118, + "logits/rejected": 1382165708.8, + "logps/chosen": -287.36506204044116, + "logps/rejected": -507.770703125, + "loss": 0.1952, + "rewards/chosen": 0.7624001222498277, + "rewards/margins": 8.872617916032379, + "rewards/rejected": -8.110217793782551, + "step": 1234 + }, + { + "epoch": 0.4559088182363527, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.823125715962421e-06, + "logits/chosen": 2696668081.230769, + "logits/rejected": 2132366389.8947368, + "logps/chosen": -260.568359375, + "logps/rejected": -358.1229954769737, + "loss": 0.1323, + "rewards/chosen": 1.1942630914541392, + "rewards/margins": 7.4920162749193935, + "rewards/rejected": -6.297753183465255, + "step": 1235 + }, + { + "epoch": 0.45627797517419594, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.817320108859984e-06, + "logits/chosen": 1886683750.4, + "logits/rejected": 2053242135.2727273, + "logps/chosen": -309.898876953125, + "logps/rejected": -390.2923029119318, + "loss": 0.1443, + "rewards/chosen": 1.429996395111084, + "rewards/margins": 6.617664241790772, + "rewards/rejected": -5.1876678466796875, + "step": 1236 + }, + { + "epoch": 0.4566471321120391, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.811513369370921e-06, + "logits/chosen": 2279487624.5333333, + "logits/rejected": 2077135811.764706, + "logps/chosen": -285.45192057291666, + "logps/rejected": -455.3132755055147, + "loss": 0.1544, + "rewards/chosen": 1.2030244191487631, + "rewards/margins": 9.618088613771924, + "rewards/rejected": -8.415064194623161, + "step": 1237 + }, + { + "epoch": 0.45701628904988234, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5.805705505540392e-06, + "logits/chosen": 1523144396.8, + "logits/rejected": 1777177258.6666667, + "logps/chosen": -337.6244140625, + "logps/rejected": -541.5087076822916, + "loss": 0.1981, + "rewards/chosen": 0.952875804901123, + "rewards/margins": 10.259911886850992, + "rewards/rejected": -9.30703608194987, + "step": 1238 + }, + { + "epoch": 0.4573854459877255, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.799896525415124e-06, + "logits/chosen": 2006574501.6470587, + "logits/rejected": 2102981973.3333333, + "logps/chosen": -292.20703125, + "logps/rejected": -433.98570963541664, + "loss": 0.195, + "rewards/chosen": 1.1579180324778837, + "rewards/margins": 9.019636273851582, + "rewards/rejected": -7.861718241373698, + "step": 1239 + }, + { + "epoch": 0.45775460292556874, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.7940864370433825e-06, + "logits/chosen": 1898868893.5384614, + "logits/rejected": 1820311336.4210527, + "logps/chosen": -249.96574519230768, + "logps/rejected": -336.3924496299342, + "loss": 0.158, + "rewards/chosen": 0.8757619124192458, + "rewards/margins": 7.235195538293012, + "rewards/rejected": -6.359433625873766, + "step": 1240 + }, + { + "epoch": 0.4581237598634119, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5.78827524847497e-06, + "logits/chosen": 2071726552.6153846, + "logits/rejected": 2597189524.2105265, + "logps/chosen": -244.2867713341346, + "logps/rejected": -572.7776521381579, + "loss": 0.1228, + "rewards/chosen": 1.2823998377873347, + "rewards/margins": 10.17870431969523, + "rewards/rejected": -8.896304481907896, + "step": 1241 + }, + { + "epoch": 0.45849291680125515, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5.782462967761217e-06, + "logits/chosen": 2230663770.352941, + "logits/rejected": 2650438587.733333, + "logps/chosen": -345.81554457720586, + "logps/rejected": -415.34873046875, + "loss": 0.1491, + "rewards/chosen": 1.3631730921128218, + "rewards/margins": 9.286865922516467, + "rewards/rejected": -7.923692830403646, + "step": 1242 + }, + { + "epoch": 0.4588620737390983, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.776649602954963e-06, + "logits/chosen": 1526717952.0, + "logits/rejected": 1852887552.0, + "logps/chosen": -316.0617370605469, + "logps/rejected": -470.9912109375, + "loss": 0.1509, + "rewards/chosen": 1.6883612871170044, + "rewards/margins": 9.559767365455627, + "rewards/rejected": -7.871406078338623, + "step": 1243 + }, + { + "epoch": 0.45923123067694155, + "grad_norm": 11.5, + "kl": 0.3478884696960449, + "learning_rate": 5.770835162110551e-06, + "logits/chosen": 1603625164.8, + "logits/rejected": 1446146730.6666667, + "logps/chosen": -239.4867431640625, + "logps/rejected": -413.8687744140625, + "loss": 0.1515, + "rewards/chosen": 1.8164600372314452, + "rewards/margins": 9.336744689941407, + "rewards/rejected": -7.520284652709961, + "step": 1244 + }, + { + "epoch": 0.4596003876147847, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.765019653283814e-06, + "logits/chosen": 1460917833.142857, + "logits/rejected": 1950867000.8888888, + "logps/chosen": -244.91615513392858, + "logps/rejected": -525.5714518229166, + "loss": 0.1437, + "rewards/chosen": 1.304199491228376, + "rewards/margins": 9.710113797869, + "rewards/rejected": -8.405914306640625, + "step": 1245 + }, + { + "epoch": 0.45996954455262795, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.759203084532068e-06, + "logits/chosen": 1415099099.4285715, + "logits/rejected": 1731776967.1111112, + "logps/chosen": -253.03949846540178, + "logps/rejected": -424.7610134548611, + "loss": 0.1277, + "rewards/chosen": 1.9732042040143694, + "rewards/margins": 8.159210538107251, + "rewards/rejected": -6.186006334092882, + "step": 1246 + }, + { + "epoch": 0.4603387014904711, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.753385463914094e-06, + "logits/chosen": 1865517909.3333333, + "logits/rejected": 2402443776.0, + "logps/chosen": -281.33013916015625, + "logps/rejected": -470.059814453125, + "loss": 0.2223, + "rewards/chosen": 1.663313388824463, + "rewards/margins": 9.450936794281006, + "rewards/rejected": -7.787623405456543, + "step": 1247 + }, + { + "epoch": 0.46070785842831435, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.7475667994901316e-06, + "logits/chosen": 1754968320.0, + "logits/rejected": 1428333312.0, + "logps/chosen": -214.22906494140625, + "logps/rejected": -447.44293212890625, + "loss": 0.1045, + "rewards/chosen": 2.797363519668579, + "rewards/margins": 9.79190468788147, + "rewards/rejected": -6.994541168212891, + "step": 1248 + }, + { + "epoch": 0.4610770153661575, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5.741747099321866e-06, + "logits/chosen": 2795557683.2, + "logits/rejected": 1755072512.0, + "logps/chosen": -347.5180419921875, + "logps/rejected": -503.2632242838542, + "loss": 0.1784, + "rewards/chosen": 1.4398449897766112, + "rewards/margins": 7.826426410675049, + "rewards/rejected": -6.3865814208984375, + "step": 1249 + }, + { + "epoch": 0.46144617230400076, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.735926371472418e-06, + "logits/chosen": 1943041792.0, + "logits/rejected": 1800609664.0, + "logps/chosen": -308.3397521972656, + "logps/rejected": -465.0818786621094, + "loss": 0.1018, + "rewards/chosen": 2.063084125518799, + "rewards/margins": 9.08430004119873, + "rewards/rejected": -7.021215915679932, + "step": 1250 + }, + { + "epoch": 0.46181532924184393, + "grad_norm": 12.75, + "kl": 0.058286190032958984, + "learning_rate": 5.730104624006333e-06, + "logits/chosen": 1651552870.4, + "logits/rejected": 1679456597.3333333, + "logps/chosen": -311.655517578125, + "logps/rejected": -654.9146728515625, + "loss": 0.1777, + "rewards/chosen": 1.484128761291504, + "rewards/margins": 12.667819150288901, + "rewards/rejected": -11.183690388997396, + "step": 1251 + }, + { + "epoch": 0.46218448617968716, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.724281864989567e-06, + "logits/chosen": 979742720.0, + "logits/rejected": 1431876065.8823528, + "logps/chosen": -234.73251953125, + "logps/rejected": -518.0363625919117, + "loss": 0.1292, + "rewards/chosen": 1.6141777038574219, + "rewards/margins": 8.364383697509766, + "rewards/rejected": -6.750205993652344, + "step": 1252 + }, + { + "epoch": 0.46255364311753033, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5.718458102489479e-06, + "logits/chosen": 1938567649.8823528, + "logits/rejected": 2226235801.6, + "logps/chosen": -279.6455078125, + "logps/rejected": -441.65758463541664, + "loss": 0.17, + "rewards/chosen": 1.2799157535328585, + "rewards/margins": 8.008850995232077, + "rewards/rejected": -6.728935241699219, + "step": 1253 + }, + { + "epoch": 0.46292280005537356, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.712633344574816e-06, + "logits/chosen": 1788218761.8461537, + "logits/rejected": 1813510251.7894738, + "logps/chosen": -309.4136305588942, + "logps/rejected": -405.37181332236844, + "loss": 0.1603, + "rewards/chosen": 0.821734721844013, + "rewards/margins": 7.634694091704211, + "rewards/rejected": -6.812959369860198, + "step": 1254 + }, + { + "epoch": 0.46329195699321674, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5.70680759931571e-06, + "logits/chosen": 1841186048.0, + "logits/rejected": 1521164160.0, + "logps/chosen": -328.46795654296875, + "logps/rejected": -596.1806030273438, + "loss": 0.1781, + "rewards/chosen": 0.8562191128730774, + "rewards/margins": 9.154949963092804, + "rewards/rejected": -8.298730850219727, + "step": 1255 + }, + { + "epoch": 0.46366111393105996, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.7009808747836546e-06, + "logits/chosen": 1675446954.6666667, + "logits/rejected": 1797146503.5294118, + "logps/chosen": -275.03134765625, + "logps/rejected": -457.6252872242647, + "loss": 0.1342, + "rewards/chosen": 1.3551958719889323, + "rewards/margins": 8.358873240152995, + "rewards/rejected": -7.0036773681640625, + "step": 1256 + }, + { + "epoch": 0.46403027086890314, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.6951531790515045e-06, + "logits/chosen": 2454933504.0, + "logits/rejected": 1391053312.0, + "logps/chosen": -283.8570556640625, + "logps/rejected": -606.6972045898438, + "loss": 0.1267, + "rewards/chosen": 1.6672395467758179, + "rewards/margins": 10.889033675193787, + "rewards/rejected": -9.221794128417969, + "step": 1257 + }, + { + "epoch": 0.46439942780674637, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5.689324520193455e-06, + "logits/chosen": 1202447360.0, + "logits/rejected": 1310785536.0, + "logps/chosen": -173.88551330566406, + "logps/rejected": -469.436279296875, + "loss": 0.1231, + "rewards/chosen": 1.9846092462539673, + "rewards/margins": 8.776886582374573, + "rewards/rejected": -6.7922773361206055, + "step": 1258 + }, + { + "epoch": 0.46476858474458954, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.68349490628504e-06, + "logits/chosen": 3062858956.8, + "logits/rejected": 1835020660.3636363, + "logps/chosen": -273.6923583984375, + "logps/rejected": -526.5743075284091, + "loss": 0.1315, + "rewards/chosen": 0.42178821563720703, + "rewards/margins": 8.822230165654963, + "rewards/rejected": -8.400441950017756, + "step": 1259 + }, + { + "epoch": 0.46513774168243277, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5.677664345403118e-06, + "logits/chosen": 1590345932.8, + "logits/rejected": 2145810090.6666667, + "logps/chosen": -304.1717041015625, + "logps/rejected": -397.7779947916667, + "loss": 0.2242, + "rewards/chosen": 1.4158255577087402, + "rewards/margins": 7.893369706471761, + "rewards/rejected": -6.4775441487630205, + "step": 1260 + }, + { + "epoch": 0.46550689862027594, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.671832845625853e-06, + "logits/chosen": 1742845006.7692308, + "logits/rejected": 1210948338.5263157, + "logps/chosen": -241.88185471754807, + "logps/rejected": -407.84385279605266, + "loss": 0.1442, + "rewards/chosen": 1.04372684772198, + "rewards/margins": 8.31325729462782, + "rewards/rejected": -7.269530446905839, + "step": 1261 + }, + { + "epoch": 0.4658760555581192, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.6660004150327175e-06, + "logits/chosen": 1553163520.0, + "logits/rejected": 1601211776.0, + "logps/chosen": -268.818115234375, + "logps/rejected": -409.6429748535156, + "loss": 0.157, + "rewards/chosen": 1.1948740482330322, + "rewards/margins": 8.582369089126587, + "rewards/rejected": -7.387495040893555, + "step": 1262 + }, + { + "epoch": 0.46624521249596235, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.660167061704467e-06, + "logits/chosen": 1322750267.0769231, + "logits/rejected": 2040750080.0, + "logps/chosen": -293.79306265024036, + "logps/rejected": -525.7649568256579, + "loss": 0.1466, + "rewards/chosen": 0.8343367209801307, + "rewards/margins": 9.179957930375691, + "rewards/rejected": -8.34562120939556, + "step": 1263 + }, + { + "epoch": 0.4666143694338055, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5.654332793723141e-06, + "logits/chosen": 1423350169.6, + "logits/rejected": 1356341930.6666667, + "logps/chosen": -275.570361328125, + "logps/rejected": -490.2438151041667, + "loss": 0.1845, + "rewards/chosen": 1.3467785835266113, + "rewards/margins": 8.474871031443278, + "rewards/rejected": -7.128092447916667, + "step": 1264 + }, + { + "epoch": 0.46698352637164875, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5.648497619172042e-06, + "logits/chosen": 2446392064.0, + "logits/rejected": 3096707072.0, + "logps/chosen": -338.63818359375, + "logps/rejected": -539.504150390625, + "loss": 0.1357, + "rewards/chosen": 1.4695488214492798, + "rewards/margins": 9.217888474464417, + "rewards/rejected": -7.748339653015137, + "step": 1265 + }, + { + "epoch": 0.4673526833094919, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5.6426615461357305e-06, + "logits/chosen": 2088707072.0, + "logits/rejected": 2162203648.0, + "logps/chosen": -287.1134033203125, + "logps/rejected": -495.737548828125, + "loss": 0.221, + "rewards/chosen": 0.8323314666748047, + "rewards/margins": 10.71656239827474, + "rewards/rejected": -9.884230931599935, + "step": 1266 + }, + { + "epoch": 0.46772184024733515, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.636824582700012e-06, + "logits/chosen": 1491187126.857143, + "logits/rejected": 1557911324.4444444, + "logps/chosen": -249.90961565290178, + "logps/rejected": -452.10956488715277, + "loss": 0.1323, + "rewards/chosen": 1.403160231454032, + "rewards/margins": 9.604744941469223, + "rewards/rejected": -8.20158471001519, + "step": 1267 + }, + { + "epoch": 0.4680909971851783, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 5.630986736951925e-06, + "logits/chosen": 1372629811.2, + "logits/rejected": 1716016640.0, + "logps/chosen": -351.32822265625, + "logps/rejected": -441.597412109375, + "loss": 0.2024, + "rewards/chosen": 1.0433027267456054, + "rewards/margins": 9.956344159444173, + "rewards/rejected": -8.913041432698568, + "step": 1268 + }, + { + "epoch": 0.46846015412302155, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.625148016979731e-06, + "logits/chosen": 1915670714.1818182, + "logits/rejected": 1738449091.047619, + "logps/chosen": -230.6058016690341, + "logps/rejected": -579.5094401041666, + "loss": 0.1025, + "rewards/chosen": 2.1428780989213423, + "rewards/margins": 10.755178426767324, + "rewards/rejected": -8.612300327845983, + "step": 1269 + }, + { + "epoch": 0.46882931106086473, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.619308430872902e-06, + "logits/chosen": 1799578443.2941177, + "logits/rejected": 1814912614.4, + "logps/chosen": -277.30224609375, + "logps/rejected": -407.60078125, + "loss": 0.1613, + "rewards/chosen": 1.3436546325683594, + "rewards/margins": 8.783315785725911, + "rewards/rejected": -7.4396611531575525, + "step": 1270 + }, + { + "epoch": 0.46919846799870796, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.613467986722109e-06, + "logits/chosen": 2037561929.142857, + "logits/rejected": 2009167872.0, + "logps/chosen": -258.8886195591518, + "logps/rejected": -476.23285590277777, + "loss": 0.1343, + "rewards/chosen": 1.25894410269601, + "rewards/margins": 8.651989316183423, + "rewards/rejected": -7.393045213487413, + "step": 1271 + }, + { + "epoch": 0.46956762493655113, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.607626692619216e-06, + "logits/chosen": 2358964224.0, + "logits/rejected": 1602792594.2857144, + "logps/chosen": -276.76267311789775, + "logps/rejected": -418.0455729166667, + "loss": 0.1301, + "rewards/chosen": 1.0925772406838157, + "rewards/margins": 8.456655580760081, + "rewards/rejected": -7.364078340076265, + "step": 1272 + }, + { + "epoch": 0.46993678187439436, + "grad_norm": 13.625, + "kl": 0.2142777442932129, + "learning_rate": 5.601784556657259e-06, + "logits/chosen": 2541459348.2105265, + "logits/rejected": 2678162668.3076925, + "logps/chosen": -265.8229337993421, + "logps/rejected": -533.1594050480769, + "loss": 0.2165, + "rewards/chosen": 0.7064057902285927, + "rewards/margins": 9.181478797665491, + "rewards/rejected": -8.4750730074369, + "step": 1273 + }, + { + "epoch": 0.47030593881223753, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5.5959415869304445e-06, + "logits/chosen": 1596286020.2666667, + "logits/rejected": 1490923399.5294118, + "logps/chosen": -297.73675130208335, + "logps/rejected": -429.6953699448529, + "loss": 0.1372, + "rewards/chosen": 1.4699593861897786, + "rewards/margins": 8.12140511157466, + "rewards/rejected": -6.651445725384881, + "step": 1274 + }, + { + "epoch": 0.47067509575008076, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5.590097791534132e-06, + "logits/chosen": 1849802069.3333333, + "logits/rejected": 1509565440.0, + "logps/chosen": -302.30287000868054, + "logps/rejected": -445.26834542410717, + "loss": 0.1945, + "rewards/chosen": 1.0815956327650282, + "rewards/margins": 8.595228270878868, + "rewards/rejected": -7.513632638113839, + "step": 1275 + }, + { + "epoch": 0.47104425268792394, + "grad_norm": 15.9375, + "kl": 0.12620162963867188, + "learning_rate": 5.584253178564829e-06, + "logits/chosen": 1605073578.6666667, + "logits/rejected": 2175478603.2941175, + "logps/chosen": -286.10393880208335, + "logps/rejected": -437.28972311580884, + "loss": 0.2047, + "rewards/chosen": 0.8126660664876302, + "rewards/margins": 7.0792249941358385, + "rewards/rejected": -6.266558927648208, + "step": 1276 + }, + { + "epoch": 0.47141340962576717, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5.578407756120167e-06, + "logits/chosen": 1571761883.4285715, + "logits/rejected": 2657991111.111111, + "logps/chosen": -224.30203683035714, + "logps/rejected": -430.33241102430554, + "loss": 0.114, + "rewards/chosen": 1.7509636197771346, + "rewards/margins": 7.661468278794061, + "rewards/rejected": -5.910504659016927, + "step": 1277 + }, + { + "epoch": 0.47178256656361034, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5.57256153229891e-06, + "logits/chosen": 1833593124.5714285, + "logits/rejected": 1975439173.8181818, + "logps/chosen": -383.9208054315476, + "logps/rejected": -541.19091796875, + "loss": 0.2369, + "rewards/chosen": 1.0536855061848958, + "rewards/margins": 8.431841994776871, + "rewards/rejected": -7.378156488591975, + "step": 1278 + }, + { + "epoch": 0.47215172350145357, + "grad_norm": 13.625, + "kl": 0.18763256072998047, + "learning_rate": 5.566714515200924e-06, + "logits/chosen": 1614522210.4615386, + "logits/rejected": 1795443442.5263157, + "logps/chosen": -394.5290339543269, + "logps/rejected": -424.8228824013158, + "loss": 0.1401, + "rewards/chosen": 0.8593849035409781, + "rewards/margins": 8.170134961363757, + "rewards/rejected": -7.31075005782278, + "step": 1279 + }, + { + "epoch": 0.47252088043929674, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.560866712927176e-06, + "logits/chosen": 2259810196.2105265, + "logits/rejected": 2241767896.6153846, + "logps/chosen": -394.91719777960526, + "logps/rejected": -518.0201322115385, + "loss": 0.1656, + "rewards/chosen": 1.5932235717773438, + "rewards/margins": 9.567390441894531, + "rewards/rejected": -7.9741668701171875, + "step": 1280 + }, + { + "epoch": 0.47289003737713997, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.555018133579723e-06, + "logits/chosen": 1708401956.5714285, + "logits/rejected": 1525711416.8888888, + "logps/chosen": -248.92014857700892, + "logps/rejected": -515.7660047743055, + "loss": 0.1157, + "rewards/chosen": 1.7379063197544642, + "rewards/margins": 11.426573617117745, + "rewards/rejected": -9.688667297363281, + "step": 1281 + }, + { + "epoch": 0.47325919431498314, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.549168785261698e-06, + "logits/chosen": 1891424392.5333333, + "logits/rejected": 1580815781.6470587, + "logps/chosen": -317.3435546875, + "logps/rejected": -582.0832950367648, + "loss": 0.1386, + "rewards/chosen": 1.375164794921875, + "rewards/margins": 9.339936738855698, + "rewards/rejected": -7.964771943933823, + "step": 1282 + }, + { + "epoch": 0.4736283512528264, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.543318676077297e-06, + "logits/chosen": 1642259200.0, + "logits/rejected": 1401176576.0, + "logps/chosen": -239.35415649414062, + "logps/rejected": -343.9851989746094, + "loss": 0.1466, + "rewards/chosen": 1.2199809551239014, + "rewards/margins": 8.215405225753784, + "rewards/rejected": -6.995424270629883, + "step": 1283 + }, + { + "epoch": 0.47399750819066955, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.537467814131774e-06, + "logits/chosen": 1456396416.0, + "logits/rejected": 1336012032.0, + "logps/chosen": -319.8012390136719, + "logps/rejected": -421.53668212890625, + "loss": 0.1376, + "rewards/chosen": 1.9294440746307373, + "rewards/margins": 10.163841009140015, + "rewards/rejected": -8.234396934509277, + "step": 1284 + }, + { + "epoch": 0.4743666651285128, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.531616207531423e-06, + "logits/chosen": 1745274265.6, + "logits/rejected": 1146377642.6666667, + "logps/chosen": -177.51326904296874, + "logps/rejected": -386.1200358072917, + "loss": 0.1912, + "rewards/chosen": 1.6444686889648437, + "rewards/margins": 9.141820017496745, + "rewards/rejected": -7.497351328531901, + "step": 1285 + }, + { + "epoch": 0.47473582206635595, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.525763864383571e-06, + "logits/chosen": 2345455143.3846154, + "logits/rejected": 1758702969.2631578, + "logps/chosen": -331.0238506610577, + "logps/rejected": -540.4753289473684, + "loss": 0.1455, + "rewards/chosen": 1.3506861466627855, + "rewards/margins": 7.973228276982481, + "rewards/rejected": -6.622542130319696, + "step": 1286 + }, + { + "epoch": 0.4751049790041992, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.519910792796565e-06, + "logits/chosen": 1299959567.0588236, + "logits/rejected": 1713693491.2, + "logps/chosen": -207.71896541819854, + "logps/rejected": -577.3069010416667, + "loss": 0.1454, + "rewards/chosen": 1.6057474472943474, + "rewards/margins": 9.96106310077742, + "rewards/rejected": -8.355315653483073, + "step": 1287 + }, + { + "epoch": 0.47547413594204235, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5.514057000879759e-06, + "logits/chosen": 1685478130.5263157, + "logits/rejected": 2120633895.3846154, + "logps/chosen": -324.87232730263156, + "logps/rejected": -483.63773287259613, + "loss": 0.1718, + "rewards/chosen": 1.189930564478824, + "rewards/margins": 7.751122416754966, + "rewards/rejected": -6.561191852276142, + "step": 1288 + }, + { + "epoch": 0.4758432928798856, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 5.508202496743511e-06, + "logits/chosen": 2014744576.0, + "logits/rejected": 2203625676.8, + "logps/chosen": -240.30138050426137, + "logps/rejected": -398.94638671875, + "loss": 0.236, + "rewards/chosen": 0.8383747447620739, + "rewards/margins": 8.29025386463512, + "rewards/rejected": -7.4518791198730465, + "step": 1289 + }, + { + "epoch": 0.47621244981772876, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.50234728849916e-06, + "logits/chosen": 1520803840.0, + "logits/rejected": 1430438249.4117646, + "logps/chosen": -198.38883463541666, + "logps/rejected": -488.4974149816176, + "loss": 0.1659, + "rewards/chosen": 1.2136229197184245, + "rewards/margins": 8.781379123762543, + "rewards/rejected": -7.567756204044118, + "step": 1290 + }, + { + "epoch": 0.476581606755572, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5.496491384259022e-06, + "logits/chosen": 1379639003.4285715, + "logits/rejected": 1674884747.6363637, + "logps/chosen": -294.07652064732144, + "logps/rejected": -446.74027876420456, + "loss": 0.1901, + "rewards/chosen": 1.2130085173107328, + "rewards/margins": 8.974377702324936, + "rewards/rejected": -7.761369185014204, + "step": 1291 + }, + { + "epoch": 0.47695076369341516, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.49063479213638e-06, + "logits/chosen": 1584521697.8823528, + "logits/rejected": 2190001220.266667, + "logps/chosen": -265.82536764705884, + "logps/rejected": -509.8167317708333, + "loss": 0.1717, + "rewards/chosen": 1.22653725567986, + "rewards/margins": 8.364065738752776, + "rewards/rejected": -7.137528483072916, + "step": 1292 + }, + { + "epoch": 0.4773199206312584, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.484777520245467e-06, + "logits/chosen": 1614214212.2666667, + "logits/rejected": 1890231838.1176472, + "logps/chosen": -249.16663411458333, + "logps/rejected": -458.64332490808823, + "loss": 0.1394, + "rewards/chosen": 1.3955904642740886, + "rewards/margins": 8.646675184661266, + "rewards/rejected": -7.251084720387178, + "step": 1293 + }, + { + "epoch": 0.47768907756910156, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.478919576701459e-06, + "logits/chosen": 1857916131.5555556, + "logits/rejected": 1529139053.7142856, + "logps/chosen": -395.26117621527777, + "logps/rejected": -455.662353515625, + "loss": 0.1677, + "rewards/chosen": 1.2377773920694988, + "rewards/margins": 9.08638840629941, + "rewards/rejected": -7.848611014229911, + "step": 1294 + }, + { + "epoch": 0.4780582345069448, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5.473060969620462e-06, + "logits/chosen": 1986763224.6153846, + "logits/rejected": 1775314620.631579, + "logps/chosen": -254.94807316706732, + "logps/rejected": -524.9236225328947, + "loss": 0.1526, + "rewards/chosen": 0.8864560494056115, + "rewards/margins": 8.332916974056104, + "rewards/rejected": -7.446460924650493, + "step": 1295 + }, + { + "epoch": 0.47842739144478796, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5.467201707119501e-06, + "logits/chosen": 1848791255.5789473, + "logits/rejected": 1642036617.8461537, + "logps/chosen": -291.47499486019734, + "logps/rejected": -449.3918644831731, + "loss": 0.1626, + "rewards/chosen": 1.54037134270919, + "rewards/margins": 8.333451599244647, + "rewards/rejected": -6.793080256535457, + "step": 1296 + }, + { + "epoch": 0.4787965483826312, + "grad_norm": 13.3125, + "kl": 3.539304733276367, + "learning_rate": 5.46134179731651e-06, + "logits/chosen": 1823378162.5263157, + "logits/rejected": 2818908790.1538463, + "logps/chosen": -294.7251747532895, + "logps/rejected": -612.2462439903846, + "loss": 0.169, + "rewards/chosen": 1.6242385663484271, + "rewards/margins": 12.370448517895904, + "rewards/rejected": -10.746209951547476, + "step": 1297 + }, + { + "epoch": 0.47916570532047437, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.455481248330322e-06, + "logits/chosen": 1261615616.0, + "logits/rejected": 1412735744.0, + "logps/chosen": -264.7585144042969, + "logps/rejected": -418.4679260253906, + "loss": 0.1403, + "rewards/chosen": 1.4340113401412964, + "rewards/margins": 9.135201334953308, + "rewards/rejected": -7.701189994812012, + "step": 1298 + }, + { + "epoch": 0.47953486225831754, + "grad_norm": 13.6875, + "kl": 1.9575705528259277, + "learning_rate": 5.44962006828065e-06, + "logits/chosen": 1970439054.2222223, + "logits/rejected": 1931296036.5714285, + "logps/chosen": -267.701416015625, + "logps/rejected": -353.58754185267856, + "loss": 0.1906, + "rewards/chosen": 1.2216451432969835, + "rewards/margins": 6.979108326018803, + "rewards/rejected": -5.7574631827218195, + "step": 1299 + }, + { + "epoch": 0.47990401919616077, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.443758265288086e-06, + "logits/chosen": 2252687633.0666666, + "logits/rejected": 2124215476.7058823, + "logps/chosen": -314.2978190104167, + "logps/rejected": -446.45407284007354, + "loss": 0.1466, + "rewards/chosen": 1.422787602742513, + "rewards/margins": 8.494847682878083, + "rewards/rejected": -7.07206008013557, + "step": 1300 + }, + { + "epoch": 0.48027317613400394, + "grad_norm": 11.25, + "kl": 0.14725112915039062, + "learning_rate": 5.4378958474740826e-06, + "logits/chosen": 1790022314.6666667, + "logits/rejected": 1539240417.8823528, + "logps/chosen": -227.241259765625, + "logps/rejected": -506.6637752757353, + "loss": 0.1558, + "rewards/chosen": 1.1646222432454427, + "rewards/margins": 8.25747376984241, + "rewards/rejected": -7.0928515265969665, + "step": 1301 + }, + { + "epoch": 0.48064233307184717, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5.4320328229609475e-06, + "logits/chosen": 2051667429.0526316, + "logits/rejected": 2811045730.4615383, + "logps/chosen": -335.99794407894734, + "logps/rejected": -583.3707932692307, + "loss": 0.2024, + "rewards/chosen": 1.106044468126799, + "rewards/margins": 11.199666529049274, + "rewards/rejected": -10.093622060922476, + "step": 1302 + }, + { + "epoch": 0.48101149000969035, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.426169199871824e-06, + "logits/chosen": 1664647987.2, + "logits/rejected": 1353760286.1176472, + "logps/chosen": -268.83429361979165, + "logps/rejected": -493.99718520220586, + "loss": 0.156, + "rewards/chosen": 1.0748116811116537, + "rewards/margins": 8.194945024976544, + "rewards/rejected": -7.1201333438648895, + "step": 1303 + }, + { + "epoch": 0.4813806469475336, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5.42030498633069e-06, + "logits/chosen": 1515598279.1111112, + "logits/rejected": 1268363410.2857144, + "logps/chosen": -314.759033203125, + "logps/rejected": -418.04921177455356, + "loss": 0.1764, + "rewards/chosen": 1.0811715655856662, + "rewards/margins": 9.45944893549359, + "rewards/rejected": -8.378277369907924, + "step": 1304 + }, + { + "epoch": 0.48174980388537675, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.414440190462336e-06, + "logits/chosen": 2176743833.6, + "logits/rejected": 2097408843.2941177, + "logps/chosen": -308.03782552083334, + "logps/rejected": -498.0602022058824, + "loss": 0.1053, + "rewards/chosen": 1.671562703450521, + "rewards/margins": 9.62218885234758, + "rewards/rejected": -7.950626148897059, + "step": 1305 + }, + { + "epoch": 0.48211896082322, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.408574820392364e-06, + "logits/chosen": 1815151104.0, + "logits/rejected": 2396517376.0, + "logps/chosen": -259.60369873046875, + "logps/rejected": -427.76171875, + "loss": 0.1642, + "rewards/chosen": 1.3711602687835693, + "rewards/margins": 6.6565539836883545, + "rewards/rejected": -5.285393714904785, + "step": 1306 + }, + { + "epoch": 0.48248811776106315, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.402708884247169e-06, + "logits/chosen": 1595960805.0526316, + "logits/rejected": 2386163239.3846154, + "logps/chosen": -224.62705592105263, + "logps/rejected": -451.2565730168269, + "loss": 0.1514, + "rewards/chosen": 1.7443267420718545, + "rewards/margins": 8.520897575718188, + "rewards/rejected": -6.776570833646334, + "step": 1307 + }, + { + "epoch": 0.4828572746989064, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5.39684239015393e-06, + "logits/chosen": 1928967964.4444444, + "logits/rejected": 1452820187.4285715, + "logps/chosen": -209.58490668402777, + "logps/rejected": -451.6669921875, + "loss": 0.1656, + "rewards/chosen": 1.1935692893134222, + "rewards/margins": 8.97541220225985, + "rewards/rejected": -7.781842912946429, + "step": 1308 + }, + { + "epoch": 0.48322643163674955, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.390975346240602e-06, + "logits/chosen": 2107858156.3076923, + "logits/rejected": 1809082906.9473684, + "logps/chosen": -253.60445462740384, + "logps/rejected": -498.1011513157895, + "loss": 0.1147, + "rewards/chosen": 1.3990399287297175, + "rewards/margins": 9.54690241331031, + "rewards/rejected": -8.147862484580592, + "step": 1309 + }, + { + "epoch": 0.4835955885745928, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5.3851077606359e-06, + "logits/chosen": 1897576106.6666667, + "logits/rejected": 1946648576.0, + "logps/chosen": -258.45717366536456, + "logps/rejected": -470.247314453125, + "loss": 0.0966, + "rewards/chosen": 1.734079360961914, + "rewards/margins": 9.707441329956055, + "rewards/rejected": -7.973361968994141, + "step": 1310 + }, + { + "epoch": 0.48396474551243596, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.3792396414692895e-06, + "logits/chosen": 1775919585.8823528, + "logits/rejected": 2503379899.733333, + "logps/chosen": -246.5421932444853, + "logps/rejected": -598.6277994791667, + "loss": 0.1629, + "rewards/chosen": 1.3088937647202437, + "rewards/margins": 9.692184882070505, + "rewards/rejected": -8.383291117350261, + "step": 1311 + }, + { + "epoch": 0.4843339024502792, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5.373370996870972e-06, + "logits/chosen": 1622649059.5555556, + "logits/rejected": 2106296466.2857144, + "logps/chosen": -262.3650716145833, + "logps/rejected": -441.1934291294643, + "loss": 0.1849, + "rewards/chosen": 1.3190646701388888, + "rewards/margins": 8.220709240625775, + "rewards/rejected": -6.901644570486886, + "step": 1312 + }, + { + "epoch": 0.48470305938812236, + "grad_norm": 12.5, + "kl": 0.2811737060546875, + "learning_rate": 5.367501834971882e-06, + "logits/chosen": 1692248907.2941177, + "logits/rejected": 1264151210.6666667, + "logps/chosen": -261.3942440257353, + "logps/rejected": -392.6326171875, + "loss": 0.1545, + "rewards/chosen": 1.4590644836425781, + "rewards/margins": 9.312414805094402, + "rewards/rejected": -7.853350321451823, + "step": 1313 + }, + { + "epoch": 0.4850722163259656, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.3616321639036685e-06, + "logits/chosen": 2033010414.9333334, + "logits/rejected": 2227374200.470588, + "logps/chosen": -239.54557291666666, + "logps/rejected": -505.8941865808824, + "loss": 0.1624, + "rewards/chosen": 0.873802121480306, + "rewards/margins": 7.979135337530398, + "rewards/rejected": -7.1053332160500915, + "step": 1314 + }, + { + "epoch": 0.48544137326380876, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.355761991798688e-06, + "logits/chosen": 1659141120.0, + "logits/rejected": 1465181952.0, + "logps/chosen": -228.775390625, + "logps/rejected": -525.4037475585938, + "loss": 0.1364, + "rewards/chosen": 1.44304358959198, + "rewards/margins": 8.97947871685028, + "rewards/rejected": -7.536435127258301, + "step": 1315 + }, + { + "epoch": 0.485810530201652, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.3498913267899864e-06, + "logits/chosen": 1621960476.4444444, + "logits/rejected": 1757743981.7142856, + "logps/chosen": -343.6247829861111, + "logps/rejected": -511.4305943080357, + "loss": 0.1218, + "rewards/chosen": 1.9471244812011719, + "rewards/margins": 9.948751722063337, + "rewards/rejected": -8.001627240862165, + "step": 1316 + }, + { + "epoch": 0.48617968713949516, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.344020177011297e-06, + "logits/chosen": 2415370671.1578946, + "logits/rejected": 1842721713.2307692, + "logps/chosen": -259.13987972861844, + "logps/rejected": -688.4924128605769, + "loss": 0.184, + "rewards/chosen": 1.2633466218647205, + "rewards/margins": 8.736241143724696, + "rewards/rejected": -7.472894521859976, + "step": 1317 + }, + { + "epoch": 0.4865488440773384, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.3381485505970235e-06, + "logits/chosen": 1950052547.047619, + "logits/rejected": 1725277835.6363637, + "logps/chosen": -236.72470238095238, + "logps/rejected": -436.8209339488636, + "loss": 0.1326, + "rewards/chosen": 2.096772693452381, + "rewards/margins": 9.318574451264881, + "rewards/rejected": -7.2218017578125, + "step": 1318 + }, + { + "epoch": 0.48691800101518157, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.3322764556822296e-06, + "logits/chosen": 1750813696.0, + "logits/rejected": 1646877568.0, + "logps/chosen": -296.6430969238281, + "logps/rejected": -605.7150268554688, + "loss": 0.1054, + "rewards/chosen": 1.9307348728179932, + "rewards/margins": 10.04166054725647, + "rewards/rejected": -8.110925674438477, + "step": 1319 + }, + { + "epoch": 0.4872871579530248, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.326403900402627e-06, + "logits/chosen": 2010110537.142857, + "logits/rejected": 1866685440.0, + "logps/chosen": -327.74204799107144, + "logps/rejected": -416.50157335069446, + "loss": 0.1185, + "rewards/chosen": 1.6507712772914342, + "rewards/margins": 8.491193589710054, + "rewards/rejected": -6.84042231241862, + "step": 1320 + }, + { + "epoch": 0.48765631489086797, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.3205308928945676e-06, + "logits/chosen": 1597392896.0, + "logits/rejected": 1690978490.1818182, + "logps/chosen": -269.39183407738096, + "logps/rejected": -560.1590465198864, + "loss": 0.1775, + "rewards/chosen": 1.6533410208565849, + "rewards/margins": 9.039959895146357, + "rewards/rejected": -7.3866188742897725, + "step": 1321 + }, + { + "epoch": 0.4880254718287112, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5.314657441295028e-06, + "logits/chosen": 2454639023.1578946, + "logits/rejected": 2170288443.076923, + "logps/chosen": -312.21548622532896, + "logps/rejected": -450.6037785456731, + "loss": 0.1787, + "rewards/chosen": 1.2701740264892578, + "rewards/margins": 7.924861761239859, + "rewards/rejected": -6.654687734750601, + "step": 1322 + }, + { + "epoch": 0.4883946287665544, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.308783553741602e-06, + "logits/chosen": 2362375168.0, + "logits/rejected": 2158749696.0, + "logps/chosen": -309.65386962890625, + "logps/rejected": -571.8545532226562, + "loss": 0.133, + "rewards/chosen": 1.2727434635162354, + "rewards/margins": 10.859785318374634, + "rewards/rejected": -9.587041854858398, + "step": 1323 + }, + { + "epoch": 0.4887637857043976, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5.302909238372485e-06, + "logits/chosen": 1591755044.5714285, + "logits/rejected": 2099575694.2222223, + "logps/chosen": -263.10934012276783, + "logps/rejected": -456.1965060763889, + "loss": 0.1151, + "rewards/chosen": 1.5688112803867884, + "rewards/margins": 8.858124702695816, + "rewards/rejected": -7.289313422309028, + "step": 1324 + }, + { + "epoch": 0.4891329426422408, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.297034503326466e-06, + "logits/chosen": 2792816128.0, + "logits/rejected": 2558228224.0, + "logps/chosen": -331.14752197265625, + "logps/rejected": -460.96673583984375, + "loss": 0.111, + "rewards/chosen": 2.3612864017486572, + "rewards/margins": 9.325124502182007, + "rewards/rejected": -6.96383810043335, + "step": 1325 + }, + { + "epoch": 0.489502099580084, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5.291159356742918e-06, + "logits/chosen": 1155911439.0588236, + "logits/rejected": 1325107609.6, + "logps/chosen": -281.9095818014706, + "logps/rejected": -457.8368815104167, + "loss": 0.1638, + "rewards/chosen": 1.524407330681296, + "rewards/margins": 8.071737551221661, + "rewards/rejected": -6.547330220540364, + "step": 1326 + }, + { + "epoch": 0.4898712565179272, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5.285283806761778e-06, + "logits/chosen": 1676260124.4444444, + "logits/rejected": 1350570569.142857, + "logps/chosen": -307.65087890625, + "logps/rejected": -648.8137555803571, + "loss": 0.208, + "rewards/chosen": 0.8008493847317166, + "rewards/margins": 10.45610429370214, + "rewards/rejected": -9.655254908970424, + "step": 1327 + }, + { + "epoch": 0.4902404134557704, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.27940786152355e-06, + "logits/chosen": 2108298854.4, + "logits/rejected": 2100041848.4705882, + "logps/chosen": -242.38282877604166, + "logps/rejected": -528.9017693014706, + "loss": 0.1083, + "rewards/chosen": 2.044091796875, + "rewards/margins": 10.404807775160846, + "rewards/rejected": -8.360715978285846, + "step": 1328 + }, + { + "epoch": 0.4906095703936136, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5.27353152916928e-06, + "logits/chosen": 2099923382.857143, + "logits/rejected": 2052806428.4444444, + "logps/chosen": -201.33091517857142, + "logps/rejected": -517.92578125, + "loss": 0.1399, + "rewards/chosen": 2.0609978267124722, + "rewards/margins": 10.126928511120024, + "rewards/rejected": -8.065930684407553, + "step": 1329 + }, + { + "epoch": 0.4909787273314568, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.267654817840552e-06, + "logits/chosen": 1372736307.2, + "logits/rejected": 1720474453.3333333, + "logps/chosen": -216.452001953125, + "logps/rejected": -536.033203125, + "loss": 0.1723, + "rewards/chosen": 1.4906314849853515, + "rewards/margins": 8.892738087972004, + "rewards/rejected": -7.402106602986653, + "step": 1330 + }, + { + "epoch": 0.4913478842693, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.261777735679472e-06, + "logits/chosen": 1934565504.0, + "logits/rejected": 1465056768.0, + "logps/chosen": -330.0137023925781, + "logps/rejected": -426.4817810058594, + "loss": 0.1562, + "rewards/chosen": 1.5575343370437622, + "rewards/margins": 9.330515027046204, + "rewards/rejected": -7.772980690002441, + "step": 1331 + }, + { + "epoch": 0.4917170412071432, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.255900290828666e-06, + "logits/chosen": 1822771386.1818182, + "logits/rejected": 2649400466.285714, + "logps/chosen": -279.80051491477275, + "logps/rejected": -627.8981584821429, + "loss": 0.1246, + "rewards/chosen": 0.8953801935369318, + "rewards/margins": 10.463937932794744, + "rewards/rejected": -9.568557739257812, + "step": 1332 + }, + { + "epoch": 0.4920861981449864, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.250022491431259e-06, + "logits/chosen": 2551811614.117647, + "logits/rejected": 1603687219.2, + "logps/chosen": -254.9582950367647, + "logps/rejected": -582.7176432291667, + "loss": 0.1716, + "rewards/chosen": 0.9330510532154757, + "rewards/margins": 10.603259793449851, + "rewards/rejected": -9.670208740234376, + "step": 1333 + }, + { + "epoch": 0.4924553550828296, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.2441443456308665e-06, + "logits/chosen": 2742068480.0, + "logits/rejected": 2128558336.0, + "logps/chosen": -348.3977355957031, + "logps/rejected": -369.847900390625, + "loss": 0.1403, + "rewards/chosen": 1.4834749698638916, + "rewards/margins": 7.628100633621216, + "rewards/rejected": -6.144625663757324, + "step": 1334 + }, + { + "epoch": 0.4928245120206728, + "grad_norm": 11.625, + "kl": 0.07767105102539062, + "learning_rate": 5.238265861571585e-06, + "logits/chosen": 1710151680.0, + "logits/rejected": 2514321408.0, + "logps/chosen": -290.6258544921875, + "logps/rejected": -567.0692274305555, + "loss": 0.1354, + "rewards/chosen": 1.4652857099260603, + "rewards/margins": 10.232826535663907, + "rewards/rejected": -8.767540825737846, + "step": 1335 + }, + { + "epoch": 0.49319366895851596, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.232387047397979e-06, + "logits/chosen": 1729505664.0, + "logits/rejected": 1448064384.0, + "logps/chosen": -295.0443115234375, + "logps/rejected": -538.0873413085938, + "loss": 0.0943, + "rewards/chosen": 2.2136785984039307, + "rewards/margins": 11.146042108535767, + "rewards/rejected": -8.932363510131836, + "step": 1336 + }, + { + "epoch": 0.4935628258963592, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5.226507911255071e-06, + "logits/chosen": 2066705314.909091, + "logits/rejected": 1949240368.7619047, + "logps/chosen": -277.5948597301136, + "logps/rejected": -467.56854538690476, + "loss": 0.0581, + "rewards/chosen": 2.2151366147128018, + "rewards/margins": 9.279992281100451, + "rewards/rejected": -7.064855666387649, + "step": 1337 + }, + { + "epoch": 0.49393198283420237, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5.22062846128833e-06, + "logits/chosen": 2080055705.6, + "logits/rejected": 1303627655.5294118, + "logps/chosen": -286.93860677083336, + "logps/rejected": -413.72351792279414, + "loss": 0.1567, + "rewards/chosen": 1.5726119995117187, + "rewards/margins": 8.59171869614545, + "rewards/rejected": -7.019106696633732, + "step": 1338 + }, + { + "epoch": 0.4943011397720456, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 5.214748705643659e-06, + "logits/chosen": 2291966674.8235292, + "logits/rejected": 2666770158.9333334, + "logps/chosen": -286.4226505055147, + "logps/rejected": -495.48606770833334, + "loss": 0.1613, + "rewards/chosen": 1.468365052167107, + "rewards/margins": 7.12423714282466, + "rewards/rejected": -5.655872090657552, + "step": 1339 + }, + { + "epoch": 0.49467029670988877, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.208868652467385e-06, + "logits/chosen": 2536419913.142857, + "logits/rejected": 1855157703.1111112, + "logps/chosen": -287.57125418526783, + "logps/rejected": -547.8064778645834, + "loss": 0.1029, + "rewards/chosen": 1.8387325831821986, + "rewards/margins": 10.348413437131851, + "rewards/rejected": -8.509680853949654, + "step": 1340 + }, + { + "epoch": 0.495039453647732, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.202988309906246e-06, + "logits/chosen": 2133601757.8666666, + "logits/rejected": 2206402439.529412, + "logps/chosen": -291.4091471354167, + "logps/rejected": -535.5203929227941, + "loss": 0.1385, + "rewards/chosen": 1.5718788146972655, + "rewards/margins": 8.556912635354434, + "rewards/rejected": -6.985033820657169, + "step": 1341 + }, + { + "epoch": 0.49540861058557517, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.1971076861073825e-06, + "logits/chosen": 2214473142.857143, + "logits/rejected": 1831108380.4444444, + "logps/chosen": -376.65464564732144, + "logps/rejected": -551.3812934027778, + "loss": 0.1348, + "rewards/chosen": 1.209993634905134, + "rewards/margins": 9.563559274824838, + "rewards/rejected": -8.353565639919704, + "step": 1342 + }, + { + "epoch": 0.4957777675234184, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.1912267892183245e-06, + "logits/chosen": 1305046528.0, + "logits/rejected": 1471241984.0, + "logps/chosen": -227.49436950683594, + "logps/rejected": -444.3511657714844, + "loss": 0.1576, + "rewards/chosen": 1.2611172199249268, + "rewards/margins": 8.324685335159302, + "rewards/rejected": -7.063568115234375, + "step": 1343 + }, + { + "epoch": 0.4961469244612616, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5.1853456273869794e-06, + "logits/chosen": 1759601868.8, + "logits/rejected": 2462648199.529412, + "logps/chosen": -254.67054036458333, + "logps/rejected": -407.61345358455884, + "loss": 0.0916, + "rewards/chosen": 2.173112233479818, + "rewards/margins": 8.280657525156059, + "rewards/rejected": -6.1075452916762405, + "step": 1344 + }, + { + "epoch": 0.4965160813991048, + "grad_norm": 13.0625, + "kl": 0.4599893093109131, + "learning_rate": 5.179464208761622e-06, + "logits/chosen": 2120657498.3529413, + "logits/rejected": 1712831146.6666667, + "logps/chosen": -273.67503446691177, + "logps/rejected": -375.01930338541666, + "loss": 0.2057, + "rewards/chosen": 1.2168242510627298, + "rewards/margins": 8.43655793433096, + "rewards/rejected": -7.219733683268229, + "step": 1345 + }, + { + "epoch": 0.496885238336948, + "grad_norm": 10.25, + "kl": 0.060272216796875, + "learning_rate": 5.173582541490886e-06, + "logits/chosen": 1636279296.0, + "logits/rejected": 1574219264.0, + "logps/chosen": -261.7189636230469, + "logps/rejected": -407.4404602050781, + "loss": 0.1238, + "rewards/chosen": 1.6984626054763794, + "rewards/margins": 8.87115204334259, + "rewards/rejected": -7.172689437866211, + "step": 1346 + }, + { + "epoch": 0.4972543952747912, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.167700633723742e-06, + "logits/chosen": 1963524577.8823528, + "logits/rejected": 1844099754.6666667, + "logps/chosen": -344.1368049172794, + "logps/rejected": -596.7352213541667, + "loss": 0.1501, + "rewards/chosen": 1.524704989264993, + "rewards/margins": 10.832813509772805, + "rewards/rejected": -9.308108520507812, + "step": 1347 + }, + { + "epoch": 0.4976235522126344, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.1618184936095e-06, + "logits/chosen": 1494571331.368421, + "logits/rejected": 2322222316.3076925, + "logps/chosen": -261.13998252467104, + "logps/rejected": -608.8333458533654, + "loss": 0.1837, + "rewards/chosen": 1.244883185938785, + "rewards/margins": 12.026020505650322, + "rewards/rejected": -10.781137319711538, + "step": 1348 + }, + { + "epoch": 0.4979927091504776, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.1559361292977915e-06, + "logits/chosen": 1926492979.2, + "logits/rejected": 1670428792.4705882, + "logps/chosen": -249.45784505208334, + "logps/rejected": -443.8468232996324, + "loss": 0.1419, + "rewards/chosen": 1.5241971333821616, + "rewards/margins": 7.23117093852922, + "rewards/rejected": -5.706973805147059, + "step": 1349 + }, + { + "epoch": 0.4983618660883208, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5.150053548938557e-06, + "logits/chosen": 1448998326.857143, + "logits/rejected": 1743995562.6666667, + "logps/chosen": -291.0453404017857, + "logps/rejected": -548.1700303819445, + "loss": 0.098, + "rewards/chosen": 1.7653214590890067, + "rewards/margins": 10.375164395286923, + "rewards/rejected": -8.609842936197916, + "step": 1350 + }, + { + "epoch": 0.498731023026164, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5.1441707606820365e-06, + "logits/chosen": 1626287250.2857144, + "logits/rejected": 1895019613.090909, + "logps/chosen": -256.7495814732143, + "logps/rejected": -423.4954279119318, + "loss": 0.1837, + "rewards/chosen": 1.4540764944893974, + "rewards/margins": 8.532113409661628, + "rewards/rejected": -7.07803691517223, + "step": 1351 + }, + { + "epoch": 0.4991001799640072, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.138287772678759e-06, + "logits/chosen": 1596020508.4444444, + "logits/rejected": 1569261568.0, + "logps/chosen": -310.40757921006946, + "logps/rejected": -401.31009347098217, + "loss": 0.1367, + "rewards/chosen": 1.4865263832939997, + "rewards/margins": 9.287470575362917, + "rewards/rejected": -7.8009441920689175, + "step": 1352 + }, + { + "epoch": 0.4994693369018504, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.132404593079531e-06, + "logits/chosen": 1452183311.0588236, + "logits/rejected": 1540505053.8666666, + "logps/chosen": -243.78079044117646, + "logps/rejected": -455.63525390625, + "loss": 0.1294, + "rewards/chosen": 1.6914053524241728, + "rewards/margins": 9.131033953498392, + "rewards/rejected": -7.439628601074219, + "step": 1353 + }, + { + "epoch": 0.4998384938396936, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5.1265212300354205e-06, + "logits/chosen": 2254913536.0, + "logits/rejected": 1557009521.7777777, + "logps/chosen": -321.76224190848217, + "logps/rejected": -428.58930121527777, + "loss": 0.1698, + "rewards/chosen": 0.6745338439941406, + "rewards/margins": 8.375309838189018, + "rewards/rejected": -7.700775994194879, + "step": 1354 + }, + { + "epoch": 0.5002076507775368, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.120637691697753e-06, + "logits/chosen": 1679704064.0, + "logits/rejected": 1817905639.6190476, + "logps/chosen": -338.9024547230114, + "logps/rejected": -450.47251674107144, + "loss": 0.1163, + "rewards/chosen": 1.8095938942649148, + "rewards/margins": 9.150026197557326, + "rewards/rejected": -7.340432303292411, + "step": 1355 + }, + { + "epoch": 0.50057680771538, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5.114753986218095e-06, + "logits/chosen": 1849207969.6842105, + "logits/rejected": 2175581420.3076925, + "logps/chosen": -303.9397615131579, + "logps/rejected": -423.8016826923077, + "loss": 0.1618, + "rewards/chosen": 1.355708172446803, + "rewards/margins": 8.5512642261953, + "rewards/rejected": -7.195556053748498, + "step": 1356 + }, + { + "epoch": 0.50057680771538, + "eval_kl": 0.0, + "eval_logits/chosen": 3569700766.009569, + "eval_logits/rejected": 3597665958.233766, + "eval_logps/chosen": -292.00513980263156, + "eval_logps/rejected": -479.0124458874459, + "eval_loss": 0.13304804265499115, + "eval_rewards/chosen": 1.500964297061902, + "eval_rewards/margins": 9.62916666110411, + "eval_rewards/rejected": -8.128202364042208, + "eval_runtime": 401.703, + "eval_samples_per_second": 2.181, + "eval_steps_per_second": 0.137, + "step": 1356 + }, + { + "epoch": 0.5009459646532232, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.108870121748248e-06, + "logits/chosen": 1708571170.1333334, + "logits/rejected": 2492717899.2941175, + "logps/chosen": -296.89593098958335, + "logps/rejected": -442.7273380055147, + "loss": 0.1633, + "rewards/chosen": 2.1285452524820965, + "rewards/margins": 7.641365776809992, + "rewards/rejected": -5.512820524327895, + "step": 1357 + }, + { + "epoch": 0.5013151215910664, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 5.102986106440232e-06, + "logits/chosen": 2297040384.0, + "logits/rejected": 1364611072.0, + "logps/chosen": -355.4040832519531, + "logps/rejected": -512.2871704101562, + "loss": 0.1972, + "rewards/chosen": 0.7476255893707275, + "rewards/margins": 8.912567377090454, + "rewards/rejected": -8.164941787719727, + "step": 1358 + }, + { + "epoch": 0.5016842785289096, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.097101948446272e-06, + "logits/chosen": 1470544145.0666666, + "logits/rejected": 1684527104.0, + "logps/chosen": -255.585693359375, + "logps/rejected": -475.0417911305147, + "loss": 0.1791, + "rewards/chosen": 0.6892091115315755, + "rewards/margins": 7.746090735641181, + "rewards/rejected": -7.056881624109605, + "step": 1359 + }, + { + "epoch": 0.5020534354667529, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.091217655918797e-06, + "logits/chosen": 1037983822.7692307, + "logits/rejected": 1280012934.7368422, + "logps/chosen": -274.22432767427887, + "logps/rejected": -480.6950041118421, + "loss": 0.1345, + "rewards/chosen": 1.2961546090932994, + "rewards/margins": 9.015977542892642, + "rewards/rejected": -7.7198229337993425, + "step": 1360 + }, + { + "epoch": 0.502422592404596, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.085333237010418e-06, + "logits/chosen": 1589388902.4, + "logits/rejected": 2149247819.2941175, + "logps/chosen": -330.70989583333335, + "logps/rejected": -339.75330307904414, + "loss": 0.1368, + "rewards/chosen": 1.6708353678385417, + "rewards/margins": 7.435390906240427, + "rewards/rejected": -5.7645555384018845, + "step": 1361 + }, + { + "epoch": 0.5027917493424392, + "grad_norm": 41.5, + "kl": 0.0, + "learning_rate": 5.0794486998739235e-06, + "logits/chosen": 2756770304.0, + "logits/rejected": 2808556288.0, + "logps/chosen": -364.5640563964844, + "logps/rejected": -524.7753295898438, + "loss": 0.1311, + "rewards/chosen": 1.3919353485107422, + "rewards/margins": 9.954026222229004, + "rewards/rejected": -8.562090873718262, + "step": 1362 + }, + { + "epoch": 0.5031609062802824, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.073564052662265e-06, + "logits/chosen": 1823336584.5333333, + "logits/rejected": 1809577502.1176472, + "logps/chosen": -291.08782552083335, + "logps/rejected": -520.1584329044117, + "loss": 0.1449, + "rewards/chosen": 1.0799120585123698, + "rewards/margins": 8.604228285247205, + "rewards/rejected": -7.524316226734834, + "step": 1363 + }, + { + "epoch": 0.5035300632181257, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 5.067679303528546e-06, + "logits/chosen": 1713337290.1052632, + "logits/rejected": 1906020352.0, + "logps/chosen": -253.4304070723684, + "logps/rejected": -294.43241060697113, + "loss": 0.1918, + "rewards/chosen": 1.4982317874306126, + "rewards/margins": 6.0882433841103, + "rewards/rejected": -4.5900115966796875, + "step": 1364 + }, + { + "epoch": 0.5038992201559688, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5.061794460626012e-06, + "logits/chosen": 1449940361.8461537, + "logits/rejected": 1818585842.5263157, + "logps/chosen": -229.9532752403846, + "logps/rejected": -329.3118832236842, + "loss": 0.071, + "rewards/chosen": 2.4629135131835938, + "rewards/margins": 8.538191544382196, + "rewards/rejected": -6.075278031198602, + "step": 1365 + }, + { + "epoch": 0.504268377093812, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.055909532108038e-06, + "logits/chosen": 1640449303.2727273, + "logits/rejected": 1789124998.0952382, + "logps/chosen": -239.6365633877841, + "logps/rejected": -468.0928664434524, + "loss": 0.1127, + "rewards/chosen": 1.3634721582586116, + "rewards/margins": 8.921247019912258, + "rewards/rejected": -7.5577748616536455, + "step": 1366 + }, + { + "epoch": 0.5046375340316552, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5.050024526128118e-06, + "logits/chosen": 1801570304.0, + "logits/rejected": 1707582976.0, + "logps/chosen": -253.68760681152344, + "logps/rejected": -388.6904296875, + "loss": 0.1824, + "rewards/chosen": 0.8950899839401245, + "rewards/margins": 8.397847056388855, + "rewards/rejected": -7.5027570724487305, + "step": 1367 + }, + { + "epoch": 0.5050066909694985, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.044139450839851e-06, + "logits/chosen": 2438525513.142857, + "logits/rejected": 1365052416.0, + "logps/chosen": -261.3468017578125, + "logps/rejected": -437.6296657986111, + "loss": 0.1274, + "rewards/chosen": 1.3345531736101424, + "rewards/margins": 9.100198518662225, + "rewards/rejected": -7.765645345052083, + "step": 1368 + }, + { + "epoch": 0.5053758479073416, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.038254314396936e-06, + "logits/chosen": 1629306092.3076923, + "logits/rejected": 2086887639.5789473, + "logps/chosen": -369.1198918269231, + "logps/rejected": -507.81625205592104, + "loss": 0.1016, + "rewards/chosen": 2.131616445688101, + "rewards/margins": 11.282267273196325, + "rewards/rejected": -9.150650827508224, + "step": 1369 + }, + { + "epoch": 0.5057450048451848, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5.032369124953156e-06, + "logits/chosen": 2128884916.7058823, + "logits/rejected": 1889470464.0, + "logps/chosen": -258.9426700367647, + "logps/rejected": -402.4298828125, + "loss": 0.1718, + "rewards/chosen": 1.0645378337186926, + "rewards/margins": 8.451542431700464, + "rewards/rejected": -7.387004597981771, + "step": 1370 + }, + { + "epoch": 0.506114161783028, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5.026483890662363e-06, + "logits/chosen": 1666061165.7142856, + "logits/rejected": 1908977208.8888888, + "logps/chosen": -279.53304617745533, + "logps/rejected": -396.72596571180554, + "loss": 0.1196, + "rewards/chosen": 1.664065224783761, + "rewards/margins": 8.468740856836712, + "rewards/rejected": -6.804675632052952, + "step": 1371 + }, + { + "epoch": 0.5064833187208712, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.020598619678478e-06, + "logits/chosen": 2059210511.0588236, + "logits/rejected": 1953174459.7333333, + "logps/chosen": -356.96223000919116, + "logps/rejected": -431.6189453125, + "loss": 0.124, + "rewards/chosen": 1.9929383221794577, + "rewards/margins": 10.404800504796645, + "rewards/rejected": -8.411862182617188, + "step": 1372 + }, + { + "epoch": 0.5068524756587144, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5.014713320155464e-06, + "logits/chosen": 1702997120.0, + "logits/rejected": 1744341120.0, + "logps/chosen": -283.8160095214844, + "logps/rejected": -505.6544494628906, + "loss": 0.178, + "rewards/chosen": 1.0596870183944702, + "rewards/margins": 10.216323256492615, + "rewards/rejected": -9.156636238098145, + "step": 1373 + }, + { + "epoch": 0.5072216325965576, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5.008828000247335e-06, + "logits/chosen": 1612466176.0, + "logits/rejected": 1711053824.0, + "logps/chosen": -144.10527692522322, + "logps/rejected": -497.80718315972223, + "loss": 0.1347, + "rewards/chosen": 1.6243998663766044, + "rewards/margins": 10.220217432294573, + "rewards/rejected": -8.595817565917969, + "step": 1374 + }, + { + "epoch": 0.5075907895344008, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.002942668108121e-06, + "logits/chosen": 2469739724.8, + "logits/rejected": 2950154053.818182, + "logps/chosen": -341.517626953125, + "logps/rejected": -601.5669389204545, + "loss": 0.1156, + "rewards/chosen": 1.15643253326416, + "rewards/margins": 11.40103993849321, + "rewards/rejected": -10.244607405229049, + "step": 1375 + }, + { + "epoch": 0.507959946472244, + "grad_norm": 13.4375, + "kl": 0.29839181900024414, + "learning_rate": 4.99705733189188e-06, + "logits/chosen": 1261077969.4545455, + "logits/rejected": 1565554995.2, + "logps/chosen": -268.02767666903407, + "logps/rejected": -397.9034912109375, + "loss": 0.1932, + "rewards/chosen": 1.5445053794167258, + "rewards/margins": 8.553530190207741, + "rewards/rejected": -7.009024810791016, + "step": 1376 + }, + { + "epoch": 0.5083291034100872, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.991171999752668e-06, + "logits/chosen": 1741281109.3333333, + "logits/rejected": 2636795084.8, + "logps/chosen": -254.6208292643229, + "logps/rejected": -452.489306640625, + "loss": 0.1366, + "rewards/chosen": 1.190843105316162, + "rewards/margins": 9.23163137435913, + "rewards/rejected": -8.040788269042968, + "step": 1377 + }, + { + "epoch": 0.5086982603479304, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.985286679844537e-06, + "logits/chosen": 1156296817.7777777, + "logits/rejected": 1690448603.4285715, + "logps/chosen": -210.05013020833334, + "logps/rejected": -345.50767299107144, + "loss": 0.1475, + "rewards/chosen": 1.8024141523573134, + "rewards/margins": 8.829753542703296, + "rewards/rejected": -7.027339390345982, + "step": 1378 + }, + { + "epoch": 0.5090674172857736, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 4.979401380321525e-06, + "logits/chosen": 2018895872.0, + "logits/rejected": 1418307015.1111112, + "logps/chosen": -253.76613071986608, + "logps/rejected": -428.7705078125, + "loss": 0.1299, + "rewards/chosen": 1.3202810287475586, + "rewards/margins": 10.538873354593912, + "rewards/rejected": -9.218592325846354, + "step": 1379 + }, + { + "epoch": 0.5094365742236168, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.973516109337639e-06, + "logits/chosen": 1707406131.2, + "logits/rejected": 1570416158.1176472, + "logps/chosen": -319.6482747395833, + "logps/rejected": -434.53601792279414, + "loss": 0.1126, + "rewards/chosen": 2.076096216837565, + "rewards/margins": 9.586554770376168, + "rewards/rejected": -7.510458553538603, + "step": 1380 + }, + { + "epoch": 0.50980573116146, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 4.967630875046845e-06, + "logits/chosen": 2137378973.5384614, + "logits/rejected": 2513626812.631579, + "logps/chosen": -255.63371394230768, + "logps/rejected": -499.2210629111842, + "loss": 0.1174, + "rewards/chosen": 1.8149519700270433, + "rewards/margins": 9.690565533966188, + "rewards/rejected": -7.875613563939145, + "step": 1381 + }, + { + "epoch": 0.5101748880993032, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.961745685603065e-06, + "logits/chosen": 1761174674.2857144, + "logits/rejected": 1701462926.2222223, + "logps/chosen": -300.18582589285717, + "logps/rejected": -544.3199869791666, + "loss": 0.1087, + "rewards/chosen": 1.6407533373151506, + "rewards/margins": 10.89885405888633, + "rewards/rejected": -9.25810072157118, + "step": 1382 + }, + { + "epoch": 0.5105440450371465, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 4.95586054916015e-06, + "logits/chosen": 1832999808.0, + "logits/rejected": 2329924864.0, + "logps/chosen": -319.49322509765625, + "logps/rejected": -593.6857299804688, + "loss": 0.1955, + "rewards/chosen": 0.8516303300857544, + "rewards/margins": 8.880295872688293, + "rewards/rejected": -8.028665542602539, + "step": 1383 + }, + { + "epoch": 0.5109132019749896, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 4.9499754738718835e-06, + "logits/chosen": 1686918371.5555556, + "logits/rejected": 1853349010.2857144, + "logps/chosen": -289.6108127170139, + "logps/rejected": -518.3342982700893, + "loss": 0.1601, + "rewards/chosen": 1.4505570729573567, + "rewards/margins": 7.727079573131743, + "rewards/rejected": -6.276522500174386, + "step": 1384 + }, + { + "epoch": 0.5112823589128328, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 4.944090467891963e-06, + "logits/chosen": 1728718848.0, + "logits/rejected": 2496982528.0, + "logps/chosen": -163.17221069335938, + "logps/rejected": -462.03289794921875, + "loss": 0.1177, + "rewards/chosen": 1.7627646923065186, + "rewards/margins": 10.388445615768433, + "rewards/rejected": -8.625680923461914, + "step": 1385 + }, + { + "epoch": 0.511651515850676, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 4.938205539373989e-06, + "logits/chosen": 1664008192.0, + "logits/rejected": 2397762121.142857, + "logps/chosen": -198.081298828125, + "logps/rejected": -507.88988095238096, + "loss": 0.0884, + "rewards/chosen": 1.798139051957564, + "rewards/margins": 10.058235284053918, + "rewards/rejected": -8.260096232096354, + "step": 1386 + }, + { + "epoch": 0.5120206727885193, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 4.932320696471455e-06, + "logits/chosen": 2049011419.4285715, + "logits/rejected": 2053183488.0, + "logps/chosen": -259.80848911830356, + "logps/rejected": -545.0476345486111, + "loss": 0.1103, + "rewards/chosen": 1.5186733518327986, + "rewards/margins": 8.367955510578458, + "rewards/rejected": -6.84928215874566, + "step": 1387 + }, + { + "epoch": 0.5123898297263624, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.926435947337736e-06, + "logits/chosen": 1915282090.6666667, + "logits/rejected": 1759349760.0, + "logps/chosen": -276.49188910590277, + "logps/rejected": -474.2200404575893, + "loss": 0.1594, + "rewards/chosen": 1.355020523071289, + "rewards/margins": 9.293698719569615, + "rewards/rejected": -7.9386781964983255, + "step": 1388 + }, + { + "epoch": 0.5127589866642056, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 4.920551300126077e-06, + "logits/chosen": 1825071513.6, + "logits/rejected": 2048856576.0, + "logps/chosen": -283.5252685546875, + "logps/rejected": -569.6129557291666, + "loss": 0.2083, + "rewards/chosen": 0.9883529663085937, + "rewards/margins": 9.903301747639974, + "rewards/rejected": -8.91494878133138, + "step": 1389 + }, + { + "epoch": 0.5131281436020488, + "grad_norm": 12.1875, + "kl": 0.06120014190673828, + "learning_rate": 4.9146667629895836e-06, + "logits/chosen": 2556223943.111111, + "logits/rejected": 1382639762.2857144, + "logps/chosen": -305.0973849826389, + "logps/rejected": -506.61181640625, + "loss": 0.1819, + "rewards/chosen": 1.5714367760552301, + "rewards/margins": 11.588053597344292, + "rewards/rejected": -10.016616821289062, + "step": 1390 + }, + { + "epoch": 0.5134973005398921, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.908782344081204e-06, + "logits/chosen": 1672991507.6923077, + "logits/rejected": 1505695744.0, + "logps/chosen": -269.7762920673077, + "logps/rejected": -568.1571751644736, + "loss": 0.13, + "rewards/chosen": 1.087497564462515, + "rewards/margins": 10.717353071761035, + "rewards/rejected": -9.62985550729852, + "step": 1391 + }, + { + "epoch": 0.5138664574777352, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 4.902898051553729e-06, + "logits/chosen": 2090065009.7777777, + "logits/rejected": 1559079204.5714285, + "logps/chosen": -288.529541015625, + "logps/rejected": -400.70068359375, + "loss": 0.2381, + "rewards/chosen": 0.5939667489793565, + "rewards/margins": 6.242858197953966, + "rewards/rejected": -5.648891448974609, + "step": 1392 + }, + { + "epoch": 0.5142356144155784, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 4.897013893559771e-06, + "logits/chosen": 1513899446.857143, + "logits/rejected": 1407059740.4444444, + "logps/chosen": -306.88424246651783, + "logps/rejected": -529.9427083333334, + "loss": 0.1447, + "rewards/chosen": 1.045412472316197, + "rewards/margins": 9.94919987330361, + "rewards/rejected": -8.903787400987413, + "step": 1393 + }, + { + "epoch": 0.5146047713534216, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 4.891129878251754e-06, + "logits/chosen": 2160160768.0, + "logits/rejected": 1821494067.2, + "logps/chosen": -241.5140935724432, + "logps/rejected": -532.5248046875, + "loss": 0.2109, + "rewards/chosen": 1.37207707491788, + "rewards/margins": 11.632340136441318, + "rewards/rejected": -10.260263061523437, + "step": 1394 + }, + { + "epoch": 0.5149739282912649, + "grad_norm": 13.1875, + "kl": 0.2821195125579834, + "learning_rate": 4.8852460137819065e-06, + "logits/chosen": 2231519339.7894735, + "logits/rejected": 2193698500.923077, + "logps/chosen": -308.1941560444079, + "logps/rejected": -395.08597506009613, + "loss": 0.178, + "rewards/chosen": 1.4088260249087685, + "rewards/margins": 8.7738731863045, + "rewards/rejected": -7.365047161395733, + "step": 1395 + }, + { + "epoch": 0.515343085229108, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 4.87936230830225e-06, + "logits/chosen": 1664255692.8, + "logits/rejected": 1711868648.7272727, + "logps/chosen": -221.592822265625, + "logps/rejected": -365.97037020596593, + "loss": 0.0677, + "rewards/chosen": 2.407558250427246, + "rewards/margins": 8.972099460255016, + "rewards/rejected": -6.56454120982777, + "step": 1396 + }, + { + "epoch": 0.5157122421669512, + "grad_norm": 13.5625, + "kl": 0.4003629684448242, + "learning_rate": 4.873478769964583e-06, + "logits/chosen": 1824946443.1304348, + "logits/rejected": 1951624305.7777777, + "logps/chosen": -271.5974864130435, + "logps/rejected": -372.54584418402777, + "loss": 0.2411, + "rewards/chosen": 1.0185732634171196, + "rewards/margins": 8.05117366624915, + "rewards/rejected": -7.032600402832031, + "step": 1397 + }, + { + "epoch": 0.5160813991047944, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 4.867595406920471e-06, + "logits/chosen": 2103437604.5714285, + "logits/rejected": 2325588650.6666665, + "logps/chosen": -273.76913016183033, + "logps/rejected": -373.53583441840277, + "loss": 0.0937, + "rewards/chosen": 2.1265632084437778, + "rewards/margins": 8.560105671958318, + "rewards/rejected": -6.43354246351454, + "step": 1398 + }, + { + "epoch": 0.5164505560426377, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 4.8617122273212414e-06, + "logits/chosen": 1786842550.857143, + "logits/rejected": 1627642042.1818182, + "logps/chosen": -255.01499720982142, + "logps/rejected": -681.2910600142045, + "loss": 0.1685, + "rewards/chosen": 1.28282592410133, + "rewards/margins": 11.207118748586415, + "rewards/rejected": -9.924292824485086, + "step": 1399 + }, + { + "epoch": 0.5168197129804808, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 4.855829239317964e-06, + "logits/chosen": 1711635353.6, + "logits/rejected": 1817411754.6666667, + "logps/chosen": -259.377001953125, + "logps/rejected": -531.5544026692709, + "loss": 0.1809, + "rewards/chosen": 1.2825864791870116, + "rewards/margins": 10.971911176045737, + "rewards/rejected": -9.689324696858725, + "step": 1400 + }, + { + "epoch": 0.517188869918324, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 4.849946451061444e-06, + "logits/chosen": 1569087590.4, + "logits/rejected": 1539717632.0, + "logps/chosen": -325.5746337890625, + "logps/rejected": -550.0177408854166, + "loss": 0.1615, + "rewards/chosen": 1.7510732650756835, + "rewards/margins": 9.194550895690918, + "rewards/rejected": -7.443477630615234, + "step": 1401 + }, + { + "epoch": 0.5175580268561673, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 4.84406387070221e-06, + "logits/chosen": 2571946738.5263157, + "logits/rejected": 2057901292.3076923, + "logps/chosen": -325.7138414884868, + "logps/rejected": -446.6569260817308, + "loss": 0.1627, + "rewards/chosen": 1.4801131800601357, + "rewards/margins": 8.438973724118128, + "rewards/rejected": -6.958860544057993, + "step": 1402 + }, + { + "epoch": 0.5179271837940105, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 4.838181506390501e-06, + "logits/chosen": 1443071853.7142856, + "logits/rejected": 1831897770.6666667, + "logps/chosen": -335.36282784598217, + "logps/rejected": -417.0901150173611, + "loss": 0.1797, + "rewards/chosen": 1.080009869166783, + "rewards/margins": 8.08827080802312, + "rewards/rejected": -7.008260938856337, + "step": 1403 + }, + { + "epoch": 0.5182963407318536, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 4.83229936627626e-06, + "logits/chosen": 1464151680.0, + "logits/rejected": 1658211456.0, + "logps/chosen": -231.91091918945312, + "logps/rejected": -451.02716064453125, + "loss": 0.1427, + "rewards/chosen": 1.5101016759872437, + "rewards/margins": 8.606352686882019, + "rewards/rejected": -7.096251010894775, + "step": 1404 + }, + { + "epoch": 0.5186654976696968, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 4.826417458509116e-06, + "logits/chosen": 1676629085.090909, + "logits/rejected": 1739917312.0, + "logps/chosen": -287.32204367897725, + "logps/rejected": -427.9384765625, + "loss": 0.2037, + "rewards/chosen": 1.386567549272017, + "rewards/margins": 8.783190779252486, + "rewards/rejected": -7.396623229980468, + "step": 1405 + }, + { + "epoch": 0.5190346546075401, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 4.8205357912383785e-06, + "logits/chosen": 1577459985.0666666, + "logits/rejected": 2132379045.6470587, + "logps/chosen": -372.59482421875, + "logps/rejected": -457.2965877757353, + "loss": 0.1443, + "rewards/chosen": 1.5372666676839193, + "rewards/margins": 8.891722503362917, + "rewards/rejected": -7.354455835678998, + "step": 1406 + }, + { + "epoch": 0.5194038115453832, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.8146543726130205e-06, + "logits/chosen": 1406500736.0, + "logits/rejected": 1865520128.0, + "logps/chosen": -253.35317993164062, + "logps/rejected": -424.6146240234375, + "loss": 0.1764, + "rewards/chosen": 0.794506311416626, + "rewards/margins": 9.008854150772095, + "rewards/rejected": -8.214347839355469, + "step": 1407 + }, + { + "epoch": 0.5197729684832264, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 4.8087732107816755e-06, + "logits/chosen": 1823062388.3636363, + "logits/rejected": 1914201497.6, + "logps/chosen": -306.27712180397725, + "logps/rejected": -514.962841796875, + "loss": 0.1956, + "rewards/chosen": 1.388677250255238, + "rewards/margins": 8.265829502452503, + "rewards/rejected": -6.8771522521972654, + "step": 1408 + }, + { + "epoch": 0.5201421254210696, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 4.802892313892618e-06, + "logits/chosen": 2008164713.4117646, + "logits/rejected": 2046371976.5333333, + "logps/chosen": -226.42112821691177, + "logps/rejected": -594.8378255208333, + "loss": 0.1658, + "rewards/chosen": 1.1232201071346508, + "rewards/margins": 9.822852878944547, + "rewards/rejected": -8.699632771809895, + "step": 1409 + }, + { + "epoch": 0.5205112823589129, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 4.797011690093756e-06, + "logits/chosen": 1796606848.0, + "logits/rejected": 1923427840.0, + "logps/chosen": -269.7610168457031, + "logps/rejected": -532.26123046875, + "loss": 0.1458, + "rewards/chosen": 1.4457528591156006, + "rewards/margins": 10.157224416732788, + "rewards/rejected": -8.711471557617188, + "step": 1410 + }, + { + "epoch": 0.520880439296756, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 4.791131347532619e-06, + "logits/chosen": 1840623820.8, + "logits/rejected": 1980183210.6666667, + "logps/chosen": -307.8463623046875, + "logps/rejected": -421.0480550130208, + "loss": 0.207, + "rewards/chosen": 1.1085908889770508, + "rewards/margins": 8.773933855692546, + "rewards/rejected": -7.665342966715495, + "step": 1411 + }, + { + "epoch": 0.5212495962345992, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 4.785251294356343e-06, + "logits/chosen": 1881812992.0, + "logits/rejected": 1609817027.764706, + "logps/chosen": -324.88108723958334, + "logps/rejected": -483.82933134191177, + "loss": 0.12, + "rewards/chosen": 1.648119862874349, + "rewards/margins": 10.262921606325635, + "rewards/rejected": -8.614801743451286, + "step": 1412 + }, + { + "epoch": 0.5216187531724424, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 4.779371538711672e-06, + "logits/chosen": 1653994951.1111112, + "logits/rejected": 1765494198.857143, + "logps/chosen": -295.8528103298611, + "logps/rejected": -443.99832589285717, + "loss": 0.2035, + "rewards/chosen": 1.053530799018012, + "rewards/margins": 8.507890126061817, + "rewards/rejected": -7.4543593270438055, + "step": 1413 + }, + { + "epoch": 0.5219879101102857, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.773492088744932e-06, + "logits/chosen": 1988843047.3846154, + "logits/rejected": 1694094605.4736843, + "logps/chosen": -394.0676832932692, + "logps/rejected": -459.86836965460526, + "loss": 0.1143, + "rewards/chosen": 1.5669627556434045, + "rewards/margins": 9.323186735392582, + "rewards/rejected": -7.756223979749177, + "step": 1414 + }, + { + "epoch": 0.5223570670481288, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 4.767612952602023e-06, + "logits/chosen": 1433912186.4347825, + "logits/rejected": 1430870584.8888888, + "logps/chosen": -217.52466881793478, + "logps/rejected": -348.85986328125, + "loss": 0.2027, + "rewards/chosen": 1.656420500382133, + "rewards/margins": 9.573937420683782, + "rewards/rejected": -7.917516920301649, + "step": 1415 + }, + { + "epoch": 0.522726223985972, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 4.761734138428417e-06, + "logits/chosen": 2092173604.5714285, + "logits/rejected": 1868136561.7777777, + "logps/chosen": -330.31947544642856, + "logps/rejected": -449.990478515625, + "loss": 0.1528, + "rewards/chosen": 0.9597234044756208, + "rewards/margins": 8.53897331631373, + "rewards/rejected": -7.579249911838108, + "step": 1416 + }, + { + "epoch": 0.5230953809238152, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 4.755855654369136e-06, + "logits/chosen": 2159153152.0, + "logits/rejected": 2459781412.571429, + "logps/chosen": -267.8249782986111, + "logps/rejected": -541.9461146763393, + "loss": 0.1124, + "rewards/chosen": 1.9520674811469183, + "rewards/margins": 10.826644594707187, + "rewards/rejected": -8.874577113560267, + "step": 1417 + }, + { + "epoch": 0.5234645378616585, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.749977508568742e-06, + "logits/chosen": 2001678080.0, + "logits/rejected": 2185899520.0, + "logps/chosen": -321.5350646972656, + "logps/rejected": -546.3653564453125, + "loss": 0.1395, + "rewards/chosen": 1.6793115139007568, + "rewards/margins": 9.83783221244812, + "rewards/rejected": -8.158520698547363, + "step": 1418 + }, + { + "epoch": 0.5238336947995016, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.744099709171335e-06, + "logits/chosen": 1588806528.0, + "logits/rejected": 1770794880.0, + "logps/chosen": -268.134033203125, + "logps/rejected": -469.5840759277344, + "loss": 0.1404, + "rewards/chosen": 1.5338326692581177, + "rewards/margins": 9.786189198493958, + "rewards/rejected": -8.25235652923584, + "step": 1419 + }, + { + "epoch": 0.5242028517373448, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 4.738222264320529e-06, + "logits/chosen": 2052451091.6923077, + "logits/rejected": 1964917813.8947368, + "logps/chosen": -242.90208082932693, + "logps/rejected": -457.88106496710526, + "loss": 0.1203, + "rewards/chosen": 2.232639899620643, + "rewards/margins": 9.13725319372015, + "rewards/rejected": -6.904613294099507, + "step": 1420 + }, + { + "epoch": 0.524572008675188, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 4.73234518215945e-06, + "logits/chosen": 2202666910.4761906, + "logits/rejected": 1515821521.4545455, + "logps/chosen": -320.7586263020833, + "logps/rejected": -325.79518821022725, + "loss": 0.1797, + "rewards/chosen": 1.3404091426304408, + "rewards/margins": 8.92747856734635, + "rewards/rejected": -7.587069424715909, + "step": 1421 + }, + { + "epoch": 0.5249411656130313, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.726468470830721e-06, + "logits/chosen": 1696424960.0, + "logits/rejected": 1799905393.7777777, + "logps/chosen": -263.8677978515625, + "logps/rejected": -383.4665256076389, + "loss": 0.1256, + "rewards/chosen": 1.7041067395891463, + "rewards/margins": 8.518162288363017, + "rewards/rejected": -6.814055548773871, + "step": 1422 + }, + { + "epoch": 0.5253103225508744, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 4.720592138476451e-06, + "logits/chosen": 1650887443.6923077, + "logits/rejected": 1635246080.0, + "logps/chosen": -268.5458420973558, + "logps/rejected": -532.8623046875, + "loss": 0.1219, + "rewards/chosen": 1.6093780811016376, + "rewards/margins": 10.577880728099991, + "rewards/rejected": -8.968502646998354, + "step": 1423 + }, + { + "epoch": 0.5256794794887176, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.714716193238221e-06, + "logits/chosen": 2029891102.1176472, + "logits/rejected": 1861177207.4666667, + "logps/chosen": -227.9993106617647, + "logps/rejected": -489.42102864583336, + "loss": 0.1708, + "rewards/chosen": 1.4098624061135685, + "rewards/margins": 8.540153645534142, + "rewards/rejected": -7.130291239420573, + "step": 1424 + }, + { + "epoch": 0.5260486364265609, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 4.708840643257083e-06, + "logits/chosen": 1894940672.0, + "logits/rejected": 1751252992.0, + "logps/chosen": -330.6879111842105, + "logps/rejected": -530.7499248798077, + "loss": 0.1626, + "rewards/chosen": 1.378687206067537, + "rewards/margins": 10.57488647646267, + "rewards/rejected": -9.196199270395132, + "step": 1425 + }, + { + "epoch": 0.5264177933644041, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 4.702965496673534e-06, + "logits/chosen": 1517030478.7692308, + "logits/rejected": 1964421120.0, + "logps/chosen": -270.16483248197113, + "logps/rejected": -511.1123046875, + "loss": 0.105, + "rewards/chosen": 1.2975108806903546, + "rewards/margins": 10.71217971678205, + "rewards/rejected": -9.414668836091694, + "step": 1426 + }, + { + "epoch": 0.5267869503022472, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 4.697090761627515e-06, + "logits/chosen": 1193216546.1333334, + "logits/rejected": 1381573692.235294, + "logps/chosen": -232.88701171875, + "logps/rejected": -486.94898897058823, + "loss": 0.1289, + "rewards/chosen": 1.4455071767171224, + "rewards/margins": 9.555281597960228, + "rewards/rejected": -8.109774421243106, + "step": 1427 + }, + { + "epoch": 0.5271561072400904, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 4.691216446258401e-06, + "logits/chosen": 1821479480.8888888, + "logits/rejected": 1826636214.857143, + "logps/chosen": -233.36089409722223, + "logps/rejected": -457.5027553013393, + "loss": 0.1749, + "rewards/chosen": 1.0776160558064778, + "rewards/margins": 8.537740480332147, + "rewards/rejected": -7.460124424525669, + "step": 1428 + }, + { + "epoch": 0.5275252641779337, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 4.685342558704973e-06, + "logits/chosen": 1403169450.6666667, + "logits/rejected": 2321237606.4, + "logps/chosen": -278.70501708984375, + "logps/rejected": -527.39619140625, + "loss": 0.1089, + "rewards/chosen": 1.4509318669637044, + "rewards/margins": 9.906332715352377, + "rewards/rejected": -8.455400848388672, + "step": 1429 + }, + { + "epoch": 0.5278944211157769, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 4.679469107105435e-06, + "logits/chosen": 2110140620.8, + "logits/rejected": 1746566997.3333333, + "logps/chosen": -289.9317138671875, + "logps/rejected": -419.6753336588542, + "loss": 0.1914, + "rewards/chosen": 1.163582992553711, + "rewards/margins": 10.841928354899087, + "rewards/rejected": -9.678345362345377, + "step": 1430 + }, + { + "epoch": 0.52826357805362, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.673596099597376e-06, + "logits/chosen": 1926557240.8888888, + "logits/rejected": 1587650852.5714285, + "logps/chosen": -279.46405707465277, + "logps/rejected": -545.5976911272321, + "loss": 0.1499, + "rewards/chosen": 1.5892397562662761, + "rewards/margins": 10.368248349144345, + "rewards/rejected": -8.77900859287807, + "step": 1431 + }, + { + "epoch": 0.5286327349914632, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 4.667723544317773e-06, + "logits/chosen": 2399223808.0, + "logits/rejected": 1898839244.8, + "logps/chosen": -346.62913602941177, + "logps/rejected": -495.0713216145833, + "loss": 0.1511, + "rewards/chosen": 1.2812078139361214, + "rewards/margins": 9.904663624482996, + "rewards/rejected": -8.623455810546876, + "step": 1432 + }, + { + "epoch": 0.5290018919293065, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 4.661851449402978e-06, + "logits/chosen": 1747558213.8181818, + "logits/rejected": 1599981080.3809524, + "logps/chosen": -291.42702414772725, + "logps/rejected": -452.5171130952381, + "loss": 0.102, + "rewards/chosen": 2.0229649977250532, + "rewards/margins": 9.820333365238074, + "rewards/rejected": -7.7973683675130205, + "step": 1433 + }, + { + "epoch": 0.5293710488671497, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.655979822988705e-06, + "logits/chosen": 1360292810.1052632, + "logits/rejected": 1893902808.6153846, + "logps/chosen": -271.3414884868421, + "logps/rejected": -503.37759164663464, + "loss": 0.1901, + "rewards/chosen": 1.062995810257761, + "rewards/margins": 11.168824902430236, + "rewards/rejected": -10.105829092172476, + "step": 1434 + }, + { + "epoch": 0.5297402058049928, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 4.650108673210014e-06, + "logits/chosen": 1977760256.0, + "logits/rejected": 2284163072.0, + "logps/chosen": -260.1033121744792, + "logps/rejected": -377.1365966796875, + "loss": 0.1326, + "rewards/chosen": 1.157500982284546, + "rewards/margins": 7.93215765953064, + "rewards/rejected": -6.774656677246094, + "step": 1435 + }, + { + "epoch": 0.530109362742836, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.644238008201314e-06, + "logits/chosen": 1453135992.4705882, + "logits/rejected": 1801064174.9333334, + "logps/chosen": -238.40090762867646, + "logps/rejected": -459.2556640625, + "loss": 0.1615, + "rewards/chosen": 1.2209557925953585, + "rewards/margins": 9.242313011019837, + "rewards/rejected": -8.021357218424479, + "step": 1436 + }, + { + "epoch": 0.5304785196806793, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 4.638367836096332e-06, + "logits/chosen": 1719710659.764706, + "logits/rejected": 1671939686.4, + "logps/chosen": -295.8189338235294, + "logps/rejected": -573.5707682291667, + "loss": 0.1645, + "rewards/chosen": 1.1836756537942326, + "rewards/margins": 10.20469819611194, + "rewards/rejected": -9.021022542317708, + "step": 1437 + }, + { + "epoch": 0.5308476766185225, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.632498165028119e-06, + "logits/chosen": 1994081621.3333333, + "logits/rejected": 1208616301.7142856, + "logps/chosen": -241.40432400173611, + "logps/rejected": -420.91064453125, + "loss": 0.1665, + "rewards/chosen": 1.5803714328342013, + "rewards/margins": 8.861786009773375, + "rewards/rejected": -7.2814145769391745, + "step": 1438 + }, + { + "epoch": 0.5312168335563656, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 4.6266290031290295e-06, + "logits/chosen": 1219375755.6363637, + "logits/rejected": 992931157.3333334, + "logps/chosen": -331.24007901278407, + "logps/rejected": -422.4540550595238, + "loss": 0.1497, + "rewards/chosen": 0.37084436416625977, + "rewards/margins": 9.24749957947504, + "rewards/rejected": -8.87665521530878, + "step": 1439 + }, + { + "epoch": 0.5315859904942088, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 4.620760358530713e-06, + "logits/chosen": 1943800900.2666667, + "logits/rejected": 1827827471.0588236, + "logps/chosen": -219.910302734375, + "logps/rejected": -503.73650045955884, + "loss": 0.1246, + "rewards/chosen": 1.4599599202473958, + "rewards/margins": 9.831312172085632, + "rewards/rejected": -8.371352251838236, + "step": 1440 + }, + { + "epoch": 0.5319551474320521, + "grad_norm": 11.8125, + "kl": 0.1007537841796875, + "learning_rate": 4.6148922393641e-06, + "logits/chosen": 1652104923.4285715, + "logits/rejected": 1538912824.8888888, + "logps/chosen": -309.37147739955356, + "logps/rejected": -359.60967339409723, + "loss": 0.1452, + "rewards/chosen": 1.227393695286342, + "rewards/margins": 6.849587773519849, + "rewards/rejected": -5.622194078233507, + "step": 1441 + }, + { + "epoch": 0.5323243043698953, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 4.609024653759398e-06, + "logits/chosen": 1771593728.0, + "logits/rejected": 1799871556.2666667, + "logps/chosen": -287.2710822610294, + "logps/rejected": -510.52063802083336, + "loss": 0.1317, + "rewards/chosen": 1.770153606639189, + "rewards/margins": 10.597934775258981, + "rewards/rejected": -8.827781168619792, + "step": 1442 + }, + { + "epoch": 0.5326934613077384, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 4.60315760984607e-06, + "logits/chosen": 1498198454.857143, + "logits/rejected": 2062542392.8888888, + "logps/chosen": -331.66796875, + "logps/rejected": -488.5354817708333, + "loss": 0.1126, + "rewards/chosen": 1.7394790649414062, + "rewards/margins": 9.290953742133247, + "rewards/rejected": -7.55147467719184, + "step": 1443 + }, + { + "epoch": 0.5330626182455817, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 4.597291115752832e-06, + "logits/chosen": 1577736071.5294118, + "logits/rejected": 1863935453.8666666, + "logps/chosen": -366.49192899816177, + "logps/rejected": -430.9873372395833, + "loss": 0.1822, + "rewards/chosen": 1.006417891558479, + "rewards/margins": 7.304639734006395, + "rewards/rejected": -6.298221842447917, + "step": 1444 + }, + { + "epoch": 0.5334317751834249, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 4.591425179607639e-06, + "logits/chosen": 1896802048.0, + "logits/rejected": 1895141248.0, + "logps/chosen": -351.8583984375, + "logps/rejected": -379.3516540527344, + "loss": 0.1718, + "rewards/chosen": 1.0213924646377563, + "rewards/margins": 8.53551733493805, + "rewards/rejected": -7.514124870300293, + "step": 1445 + }, + { + "epoch": 0.533800932121268, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.585559809537666e-06, + "logits/chosen": 1637218048.0, + "logits/rejected": 1617030016.0, + "logps/chosen": -278.0539855957031, + "logps/rejected": -507.04876708984375, + "loss": 0.1326, + "rewards/chosen": 1.7828631401062012, + "rewards/margins": 9.677094459533691, + "rewards/rejected": -7.89423131942749, + "step": 1446 + }, + { + "epoch": 0.5341700890591112, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.579695013669313e-06, + "logits/chosen": 1901786989.7142856, + "logits/rejected": 2463229952.0, + "logps/chosen": -235.5845947265625, + "logps/rejected": -522.2452256944445, + "loss": 0.1307, + "rewards/chosen": 1.420327595302037, + "rewards/margins": 10.949780449034677, + "rewards/rejected": -9.52945285373264, + "step": 1447 + }, + { + "epoch": 0.5345392459969545, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 4.573830800128178e-06, + "logits/chosen": 1755860764.4444444, + "logits/rejected": 1443150701.7142856, + "logps/chosen": -292.6238064236111, + "logps/rejected": -501.6510532924107, + "loss": 0.2088, + "rewards/chosen": 0.7587308883666992, + "rewards/margins": 9.21528366633824, + "rewards/rejected": -8.45655277797154, + "step": 1448 + }, + { + "epoch": 0.5349084029347977, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.567967177039054e-06, + "logits/chosen": 1385183505.0666666, + "logits/rejected": 1409949575.5294118, + "logps/chosen": -237.41066080729166, + "logps/rejected": -558.1147748161765, + "loss": 0.1318, + "rewards/chosen": 1.608899180094401, + "rewards/margins": 9.966396451463886, + "rewards/rejected": -8.357497271369486, + "step": 1449 + }, + { + "epoch": 0.5352775598726408, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.562104152525918e-06, + "logits/chosen": 1643928969.8461537, + "logits/rejected": 1656203587.368421, + "logps/chosen": -305.5281325120192, + "logps/rejected": -563.079255756579, + "loss": 0.1176, + "rewards/chosen": 1.30950194138747, + "rewards/margins": 10.946758764475463, + "rewards/rejected": -9.637256823087993, + "step": 1450 + }, + { + "epoch": 0.535646716810484, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 4.556241734711916e-06, + "logits/chosen": 1804572525.7142856, + "logits/rejected": 2158903296.0, + "logps/chosen": -312.6543666294643, + "logps/rejected": -530.3146158854166, + "loss": 0.1065, + "rewards/chosen": 1.6161805561610632, + "rewards/margins": 9.781767239646307, + "rewards/rejected": -8.165586683485243, + "step": 1451 + }, + { + "epoch": 0.5360158737483273, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 4.550379931719351e-06, + "logits/chosen": 2253960760.888889, + "logits/rejected": 1825305161.142857, + "logps/chosen": -266.5562337239583, + "logps/rejected": -525.4776436941964, + "loss": 0.1694, + "rewards/chosen": 1.1425045861138239, + "rewards/margins": 9.386477909390887, + "rewards/rejected": -8.243973323277064, + "step": 1452 + }, + { + "epoch": 0.5363850306861705, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 4.54451875166968e-06, + "logits/chosen": 1826293760.0, + "logits/rejected": 2601397248.0, + "logps/chosen": -275.9594482421875, + "logps/rejected": -595.2775065104166, + "loss": 0.1711, + "rewards/chosen": 1.55506591796875, + "rewards/margins": 10.312832641601563, + "rewards/rejected": -8.757766723632812, + "step": 1453 + }, + { + "epoch": 0.5367541876240136, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 4.53865820268349e-06, + "logits/chosen": 2149638599.111111, + "logits/rejected": 2448222061.714286, + "logps/chosen": -278.7743326822917, + "logps/rejected": -359.1007603236607, + "loss": 0.1496, + "rewards/chosen": 1.3224700291951497, + "rewards/margins": 7.822601908729189, + "rewards/rejected": -6.50013187953404, + "step": 1454 + }, + { + "epoch": 0.5371233445618568, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 4.532798292880499e-06, + "logits/chosen": 2089357312.0, + "logits/rejected": 1817274231.4666667, + "logps/chosen": -336.1497012867647, + "logps/rejected": -515.6412760416666, + "loss": 0.1193, + "rewards/chosen": 2.5888099670410156, + "rewards/margins": 10.57049077351888, + "rewards/rejected": -7.981680806477865, + "step": 1455 + }, + { + "epoch": 0.5374925014997001, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 4.5269390303795395e-06, + "logits/chosen": 1713035894.1538463, + "logits/rejected": 1581956904.4210527, + "logps/chosen": -325.36226712740387, + "logps/rejected": -331.0112818667763, + "loss": 0.1495, + "rewards/chosen": 0.9372053879957932, + "rewards/margins": 6.6871824998122, + "rewards/rejected": -5.749977111816406, + "step": 1456 + }, + { + "epoch": 0.5378616584375433, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 4.521080423298543e-06, + "logits/chosen": 1567173875.8095238, + "logits/rejected": 1956353675.6363637, + "logps/chosen": -240.74439639136904, + "logps/rejected": -435.1345880681818, + "loss": 0.1578, + "rewards/chosen": 1.8365929013206845, + "rewards/margins": 8.002209023479775, + "rewards/rejected": -6.165616122159091, + "step": 1457 + }, + { + "epoch": 0.5382308153753864, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.515222479754534e-06, + "logits/chosen": 1653897938.8235295, + "logits/rejected": 2136335155.2, + "logps/chosen": -247.7194393382353, + "logps/rejected": -559.3908854166667, + "loss": 0.1589, + "rewards/chosen": 1.231106814216165, + "rewards/margins": 11.619471478929707, + "rewards/rejected": -10.388364664713542, + "step": 1458 + }, + { + "epoch": 0.5385999723132296, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 4.5093652078636205e-06, + "logits/chosen": 1807508626.2857144, + "logits/rejected": 1749679559.1111112, + "logps/chosen": -260.2492152622768, + "logps/rejected": -489.9402126736111, + "loss": 0.1334, + "rewards/chosen": 1.2756057466779436, + "rewards/margins": 8.696853440905373, + "rewards/rejected": -7.42124769422743, + "step": 1459 + }, + { + "epoch": 0.5389691292510729, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 4.503508615740978e-06, + "logits/chosen": 1606513225.142857, + "logits/rejected": 2099600042.6666667, + "logps/chosen": -261.83571079799106, + "logps/rejected": -521.4605034722222, + "loss": 0.0849, + "rewards/chosen": 2.078976903642927, + "rewards/margins": 10.60254495106046, + "rewards/rejected": -8.523568047417534, + "step": 1460 + }, + { + "epoch": 0.5393382861889161, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 4.497652711500841e-06, + "logits/chosen": 1306968545.8823528, + "logits/rejected": 1934416554.6666667, + "logps/chosen": -302.8660098805147, + "logps/rejected": -461.331640625, + "loss": 0.1557, + "rewards/chosen": 1.5460037904627182, + "rewards/margins": 9.071607529883291, + "rewards/rejected": -7.525603739420573, + "step": 1461 + }, + { + "epoch": 0.5397074431267592, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.491797503256492e-06, + "logits/chosen": 1549414400.0, + "logits/rejected": 1591964233.142857, + "logps/chosen": -284.616455078125, + "logps/rejected": -522.4395926339286, + "loss": 0.146, + "rewards/chosen": 1.474055078294542, + "rewards/margins": 11.097156448969766, + "rewards/rejected": -9.623101370675224, + "step": 1462 + }, + { + "epoch": 0.5400766000646025, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 4.485942999120243e-06, + "logits/chosen": 1761832576.0, + "logits/rejected": 1471649920.0, + "logps/chosen": -317.7290954589844, + "logps/rejected": -447.3445129394531, + "loss": 0.1841, + "rewards/chosen": 1.1034822463989258, + "rewards/margins": 8.616630554199219, + "rewards/rejected": -7.513148307800293, + "step": 1463 + }, + { + "epoch": 0.5404457570024457, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 4.480089207203438e-06, + "logits/chosen": 2133212081.2307692, + "logits/rejected": 2074348490.1052632, + "logps/chosen": -366.17728365384613, + "logps/rejected": -491.7290810032895, + "loss": 0.1582, + "rewards/chosen": 0.6257171630859375, + "rewards/margins": 8.724038375051398, + "rewards/rejected": -8.09832121196546, + "step": 1464 + }, + { + "epoch": 0.5408149139402889, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 4.474236135616431e-06, + "logits/chosen": 1765224634.1818182, + "logits/rejected": 1367064678.4, + "logps/chosen": -329.997802734375, + "logps/rejected": -317.63291015625, + "loss": 0.2025, + "rewards/chosen": 1.5821515863591975, + "rewards/margins": 7.939817601984197, + "rewards/rejected": -6.357666015625, + "step": 1465 + }, + { + "epoch": 0.541184070878132, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 4.468383792468578e-06, + "logits/chosen": 1355165891.047619, + "logits/rejected": 1551809070.5454545, + "logps/chosen": -231.71149553571428, + "logps/rejected": -307.3352716619318, + "loss": 0.1507, + "rewards/chosen": 1.8695397149948847, + "rewards/margins": 8.870921328986363, + "rewards/rejected": -7.0013816139914775, + "step": 1466 + }, + { + "epoch": 0.5415532278159753, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.462532185868228e-06, + "logits/chosen": 2911437763.7647057, + "logits/rejected": 1890200644.2666667, + "logps/chosen": -274.34495634191177, + "logps/rejected": -487.87034505208334, + "loss": 0.1001, + "rewards/chosen": 1.9854206758386947, + "rewards/margins": 8.522816916073069, + "rewards/rejected": -6.537396240234375, + "step": 1467 + }, + { + "epoch": 0.5419223847538185, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 4.4566813239227045e-06, + "logits/chosen": 1671418777.6, + "logits/rejected": 2262627048.7272725, + "logps/chosen": -235.95556640625, + "logps/rejected": -529.3631036931819, + "loss": 0.0847, + "rewards/chosen": 1.7940912246704102, + "rewards/margins": 9.356569030068137, + "rewards/rejected": -7.5624778053977275, + "step": 1468 + }, + { + "epoch": 0.5422915416916617, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 4.450831214738303e-06, + "logits/chosen": 1534200410.3529413, + "logits/rejected": 1918327057.0666666, + "logps/chosen": -287.41360294117646, + "logps/rejected": -564.4870442708333, + "loss": 0.1743, + "rewards/chosen": 1.2012242709889132, + "rewards/margins": 9.978249620923808, + "rewards/rejected": -8.777025349934895, + "step": 1469 + }, + { + "epoch": 0.5426606986295048, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 4.444981866420278e-06, + "logits/chosen": 1673902336.0, + "logits/rejected": 1485098240.0, + "logps/chosen": -264.2222595214844, + "logps/rejected": -346.92144775390625, + "loss": 0.1791, + "rewards/chosen": 1.2769495248794556, + "rewards/margins": 8.044173836708069, + "rewards/rejected": -6.767224311828613, + "step": 1470 + }, + { + "epoch": 0.5430298555673481, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 4.439133287072826e-06, + "logits/chosen": 1720729190.4, + "logits/rejected": 1335062698.6666667, + "logps/chosen": -270.82158203125, + "logps/rejected": -492.3553059895833, + "loss": 0.196, + "rewards/chosen": 1.2434823989868165, + "rewards/margins": 9.650897789001466, + "rewards/rejected": -8.407415390014648, + "step": 1471 + }, + { + "epoch": 0.5433990125051913, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 4.433285484799077e-06, + "logits/chosen": 1470970337.8823528, + "logits/rejected": 1745568426.6666667, + "logps/chosen": -246.0095645680147, + "logps/rejected": -519.95634765625, + "loss": 0.1566, + "rewards/chosen": 1.4881111593807446, + "rewards/margins": 9.685113824582569, + "rewards/rejected": -8.197002665201824, + "step": 1472 + }, + { + "epoch": 0.5437681694430345, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 4.427438467701091e-06, + "logits/chosen": 1383862869.3333333, + "logits/rejected": 1709180211.2, + "logps/chosen": -319.6613362630208, + "logps/rejected": -501.716357421875, + "loss": 0.1156, + "rewards/chosen": 1.2142019271850586, + "rewards/margins": 10.17126750946045, + "rewards/rejected": -8.95706558227539, + "step": 1473 + }, + { + "epoch": 0.5441373263808776, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 4.4215922438798335e-06, + "logits/chosen": 1507560155.4285715, + "logits/rejected": 1450874538.6666667, + "logps/chosen": -237.57373046875, + "logps/rejected": -575.5750868055555, + "loss": 0.1038, + "rewards/chosen": 1.9481274741036552, + "rewards/margins": 11.109157713632735, + "rewards/rejected": -9.16103023952908, + "step": 1474 + }, + { + "epoch": 0.5445064833187209, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 4.415746821435172e-06, + "logits/chosen": 1382379776.0, + "logits/rejected": 1576045952.0, + "logps/chosen": -251.30772399902344, + "logps/rejected": -433.93133544921875, + "loss": 0.1221, + "rewards/chosen": 1.5408787727355957, + "rewards/margins": 7.948005676269531, + "rewards/rejected": -6.4071269035339355, + "step": 1475 + }, + { + "epoch": 0.5448756402565641, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 4.409902208465867e-06, + "logits/chosen": 1669020160.0, + "logits/rejected": 1747459925.3333333, + "logps/chosen": -298.6027099609375, + "logps/rejected": -547.7362874348959, + "loss": 0.2075, + "rewards/chosen": 1.1856998443603515, + "rewards/margins": 9.916599655151368, + "rewards/rejected": -8.730899810791016, + "step": 1476 + }, + { + "epoch": 0.5452447971944073, + "grad_norm": 11.3125, + "kl": 0.28092288970947266, + "learning_rate": 4.404058413069556e-06, + "logits/chosen": 1707597824.0, + "logits/rejected": 1816523183.1578948, + "logps/chosen": -249.11495267427884, + "logps/rejected": -508.8456517269737, + "loss": 0.1497, + "rewards/chosen": 0.8711577928983248, + "rewards/margins": 8.148487087203423, + "rewards/rejected": -7.2773292943050985, + "step": 1477 + }, + { + "epoch": 0.5456139541322504, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.398215443342741e-06, + "logits/chosen": 1422307012.9230769, + "logits/rejected": 1789060688.8421052, + "logps/chosen": -228.1471228966346, + "logps/rejected": -562.1901212993421, + "loss": 0.1295, + "rewards/chosen": 1.5018250392033503, + "rewards/margins": 9.868335530825473, + "rewards/rejected": -8.366510491622122, + "step": 1478 + }, + { + "epoch": 0.5459831110700937, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 4.3923733073807865e-06, + "logits/chosen": 2464211171.5555553, + "logits/rejected": 1745810139.4285715, + "logps/chosen": -282.0102810329861, + "logps/rejected": -547.8933803013393, + "loss": 0.127, + "rewards/chosen": 2.016949759589301, + "rewards/margins": 10.021918675256153, + "rewards/rejected": -8.004968915666852, + "step": 1479 + }, + { + "epoch": 0.5463522680079369, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 4.386532013277892e-06, + "logits/chosen": 1883093447.1111112, + "logits/rejected": 1761656978.2857144, + "logps/chosen": -268.12543402777777, + "logps/rejected": -404.05210658482144, + "loss": 0.114, + "rewards/chosen": 2.04677242702908, + "rewards/margins": 9.613088759164961, + "rewards/rejected": -7.566316332135882, + "step": 1480 + }, + { + "epoch": 0.54672142494578, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 4.3806915691271e-06, + "logits/chosen": 1499017079.4666667, + "logits/rejected": 1783177697.8823528, + "logps/chosen": -280.53587239583334, + "logps/rejected": -475.8573644301471, + "loss": 0.1428, + "rewards/chosen": 2.047152837117513, + "rewards/margins": 11.429894428627165, + "rewards/rejected": -9.382741591509651, + "step": 1481 + }, + { + "epoch": 0.5470905818836233, + "grad_norm": 13.6875, + "kl": 0.4540982246398926, + "learning_rate": 4.374851983020271e-06, + "logits/chosen": 2212230680.3809524, + "logits/rejected": 1703514298.1818182, + "logps/chosen": -269.91143508184524, + "logps/rejected": -480.63911576704544, + "loss": 0.1879, + "rewards/chosen": 1.6791009448823475, + "rewards/margins": 8.985698567840444, + "rewards/rejected": -7.306597622958097, + "step": 1482 + }, + { + "epoch": 0.5474597388214665, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.369013263048075e-06, + "logits/chosen": 1964579297.8823528, + "logits/rejected": 1878112665.6, + "logps/chosen": -213.126220703125, + "logps/rejected": -470.93665364583336, + "loss": 0.1309, + "rewards/chosen": 1.5103795668658089, + "rewards/margins": 8.141958139456955, + "rewards/rejected": -6.631578572591146, + "step": 1483 + }, + { + "epoch": 0.5478288957593097, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.363175417299989e-06, + "logits/chosen": 2018170518.5882354, + "logits/rejected": 1661600563.2, + "logps/chosen": -297.4990808823529, + "logps/rejected": -443.88746744791666, + "loss": 0.1434, + "rewards/chosen": 1.4014373106115006, + "rewards/margins": 9.546189304426605, + "rewards/rejected": -8.144751993815104, + "step": 1484 + }, + { + "epoch": 0.5481980526971528, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 4.357338453864271e-06, + "logits/chosen": 1578967313.0666666, + "logits/rejected": 1302581488.9411764, + "logps/chosen": -249.084912109375, + "logps/rejected": -397.32057100183823, + "loss": 0.1676, + "rewards/chosen": 1.0990994771321614, + "rewards/margins": 7.466604718974992, + "rewards/rejected": -6.367505241842831, + "step": 1485 + }, + { + "epoch": 0.5485672096349961, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 4.351502380827959e-06, + "logits/chosen": 1345211014.7368422, + "logits/rejected": 1321274289.2307692, + "logps/chosen": -193.30090974506578, + "logps/rejected": -479.8366887019231, + "loss": 0.2035, + "rewards/chosen": 1.225894426044665, + "rewards/margins": 9.178522928523632, + "rewards/rejected": -7.952628502478967, + "step": 1486 + }, + { + "epoch": 0.5489363665728393, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 4.345667206276861e-06, + "logits/chosen": 1953570406.4, + "logits/rejected": 2091716437.3333333, + "logps/chosen": -238.9734619140625, + "logps/rejected": -548.07080078125, + "loss": 0.1778, + "rewards/chosen": 1.4068713188171387, + "rewards/margins": 11.394570191701254, + "rewards/rejected": -9.987698872884115, + "step": 1487 + }, + { + "epoch": 0.5493055235106825, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 4.339832938295534e-06, + "logits/chosen": 1377903138.1333334, + "logits/rejected": 1721211120.9411764, + "logps/chosen": -229.88395182291666, + "logps/rejected": -326.32111672794116, + "loss": 0.1308, + "rewards/chosen": 1.7766478220621744, + "rewards/margins": 7.195669428507487, + "rewards/rejected": -5.4190216064453125, + "step": 1488 + }, + { + "epoch": 0.5496746804485256, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.333999584967284e-06, + "logits/chosen": 2269870535.111111, + "logits/rejected": 1919376822.857143, + "logps/chosen": -247.18712022569446, + "logps/rejected": -439.80311802455356, + "loss": 0.174, + "rewards/chosen": 1.3265101114908855, + "rewards/margins": 8.803978329613095, + "rewards/rejected": -7.47746821812221, + "step": 1489 + }, + { + "epoch": 0.5500438373863689, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 4.3281671543741476e-06, + "logits/chosen": 1784439076.5714285, + "logits/rejected": 2674367829.3333335, + "logps/chosen": -242.3653564453125, + "logps/rejected": -557.9398871527778, + "loss": 0.1212, + "rewards/chosen": 1.5368749073573522, + "rewards/margins": 9.08118812621586, + "rewards/rejected": -7.544313218858507, + "step": 1490 + }, + { + "epoch": 0.5504129943242121, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 4.322335654596884e-06, + "logits/chosen": 1495506944.0, + "logits/rejected": 1481317861.0526316, + "logps/chosen": -333.2764423076923, + "logps/rejected": -499.1869346217105, + "loss": 0.1182, + "rewards/chosen": 1.432045423067533, + "rewards/margins": 9.755431364422385, + "rewards/rejected": -8.323385941354852, + "step": 1491 + }, + { + "epoch": 0.5507821512620553, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 4.31650509371496e-06, + "logits/chosen": 1536035089.0666666, + "logits/rejected": 1842944602.3529413, + "logps/chosen": -298.66578776041666, + "logps/rejected": -394.1620232077206, + "loss": 0.1433, + "rewards/chosen": 1.494311777750651, + "rewards/margins": 8.388100283753637, + "rewards/rejected": -6.893788506002987, + "step": 1492 + }, + { + "epoch": 0.5511513081998984, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 4.310675479806546e-06, + "logits/chosen": 2172385792.0, + "logits/rejected": 1938272768.0, + "logps/chosen": -323.8974304199219, + "logps/rejected": -525.1587524414062, + "loss": 0.1447, + "rewards/chosen": 1.519779920578003, + "rewards/margins": 8.205929040908813, + "rewards/rejected": -6.6861491203308105, + "step": 1493 + }, + { + "epoch": 0.5515204651377417, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 4.304846820948497e-06, + "logits/chosen": 1642010487.4666667, + "logits/rejected": 1584346172.235294, + "logps/chosen": -287.9642578125, + "logps/rejected": -488.2645909926471, + "loss": 0.1941, + "rewards/chosen": 0.5894980748494466, + "rewards/margins": 7.121195965187222, + "rewards/rejected": -6.531697890337775, + "step": 1494 + }, + { + "epoch": 0.5518896220755849, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 4.2990191252163446e-06, + "logits/chosen": 1470411025.0666666, + "logits/rejected": 1638684792.4705882, + "logps/chosen": -251.858837890625, + "logps/rejected": -457.31491268382354, + "loss": 0.1175, + "rewards/chosen": 1.659793217976888, + "rewards/margins": 7.765438790414847, + "rewards/rejected": -6.105645572437959, + "step": 1495 + }, + { + "epoch": 0.5522587790134281, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 4.293192400684289e-06, + "logits/chosen": 2238322215.3846154, + "logits/rejected": 2111181231.1578948, + "logps/chosen": -236.1609825721154, + "logps/rejected": -536.1135896381579, + "loss": 0.1326, + "rewards/chosen": 1.30474486717811, + "rewards/margins": 9.45837035545936, + "rewards/rejected": -8.15362548828125, + "step": 1496 + }, + { + "epoch": 0.5526279359512712, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 4.287366655425185e-06, + "logits/chosen": 1629804690.2857144, + "logits/rejected": 1533564359.1111112, + "logps/chosen": -279.15907505580356, + "logps/rejected": -343.8982204861111, + "loss": 0.1228, + "rewards/chosen": 1.3151551655360632, + "rewards/margins": 7.353617486499605, + "rewards/rejected": -6.038462320963542, + "step": 1497 + }, + { + "epoch": 0.5529970928891145, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 4.281541897510524e-06, + "logits/chosen": 1363318637.7142856, + "logits/rejected": 1781351196.4444444, + "logps/chosen": -199.489501953125, + "logps/rejected": -455.03382703993054, + "loss": 0.1043, + "rewards/chosen": 1.6260534014020647, + "rewards/margins": 10.398053184388175, + "rewards/rejected": -8.77199978298611, + "step": 1498 + }, + { + "epoch": 0.5533662498269577, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 4.275718135010435e-06, + "logits/chosen": 1463660397.7142856, + "logits/rejected": 1665628273.7777777, + "logps/chosen": -270.25612095424106, + "logps/rejected": -428.3843044704861, + "loss": 0.0963, + "rewards/chosen": 2.0366314479282925, + "rewards/margins": 10.87552237132239, + "rewards/rejected": -8.838890923394096, + "step": 1499 + }, + { + "epoch": 0.5537354067648009, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.269895375993668e-06, + "logits/chosen": 1567494144.0, + "logits/rejected": 1746440664.6153846, + "logps/chosen": -284.0376233552632, + "logps/rejected": -368.6242112379808, + "loss": 0.1509, + "rewards/chosen": 1.6094994795949835, + "rewards/margins": 9.344933791681822, + "rewards/rejected": -7.735434312086839, + "step": 1500 + }, + { + "epoch": 0.554104563702644, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.264073628527583e-06, + "logits/chosen": 2340107685.647059, + "logits/rejected": 1923982267.7333333, + "logps/chosen": -253.06235638786765, + "logps/rejected": -423.40227864583335, + "loss": 0.1717, + "rewards/chosen": 1.0740675084731157, + "rewards/margins": 9.481107913746554, + "rewards/rejected": -8.407040405273438, + "step": 1501 + }, + { + "epoch": 0.5544737206404873, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.258252900678136e-06, + "logits/chosen": 1469816100.5714285, + "logits/rejected": 1350089045.3333333, + "logps/chosen": -287.37147739955356, + "logps/rejected": -391.86553276909723, + "loss": 0.1133, + "rewards/chosen": 1.840660640171596, + "rewards/margins": 10.60423841930571, + "rewards/rejected": -8.763577779134115, + "step": 1502 + }, + { + "epoch": 0.5548428775783305, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 4.252433200509869e-06, + "logits/chosen": 1465183085.7142856, + "logits/rejected": 1565000817.7777777, + "logps/chosen": -146.40105329241072, + "logps/rejected": -503.46912977430554, + "loss": 0.1104, + "rewards/chosen": 2.1404414858136858, + "rewards/margins": 9.823080713786776, + "rewards/rejected": -7.68263922797309, + "step": 1503 + }, + { + "epoch": 0.5552120345161737, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 4.2466145360859064e-06, + "logits/chosen": 1637285780.2105262, + "logits/rejected": 1670220878.7692308, + "logps/chosen": -377.98817845394734, + "logps/rejected": -463.47847806490387, + "loss": 0.1529, + "rewards/chosen": 1.5725767236006887, + "rewards/margins": 10.572464043312227, + "rewards/rejected": -8.999887319711538, + "step": 1504 + }, + { + "epoch": 0.5555811914540169, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 4.240796915467933e-06, + "logits/chosen": 1664880412.4444444, + "logits/rejected": 2175275008.0, + "logps/chosen": -300.136962890625, + "logps/rejected": -554.5899832589286, + "loss": 0.1861, + "rewards/chosen": 1.016314188639323, + "rewards/margins": 10.598569960821242, + "rewards/rejected": -9.58225577218192, + "step": 1505 + }, + { + "epoch": 0.5559503483918601, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.2349803467161864e-06, + "logits/chosen": 1625791829.3333333, + "logits/rejected": 1382109915.4285715, + "logps/chosen": -226.60230848524304, + "logps/rejected": -472.71309988839283, + "loss": 0.1505, + "rewards/chosen": 2.0374103122287326, + "rewards/margins": 9.792203206864615, + "rewards/rejected": -7.754792894635882, + "step": 1506 + }, + { + "epoch": 0.5563195053297033, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 4.229164837889451e-06, + "logits/chosen": 1808827596.8, + "logits/rejected": 1905062260.3636363, + "logps/chosen": -292.28173828125, + "logps/rejected": -489.1585138494318, + "loss": 0.1161, + "rewards/chosen": 1.0214149475097656, + "rewards/margins": 9.367843558571554, + "rewards/rejected": -8.34642861106179, + "step": 1507 + }, + { + "epoch": 0.5566886622675465, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.22335039704504e-06, + "logits/chosen": 2638852096.0, + "logits/rejected": 2019021255.1111112, + "logps/chosen": -256.4872349330357, + "logps/rejected": -456.43462456597223, + "loss": 0.1384, + "rewards/chosen": 1.2492584500994002, + "rewards/margins": 8.661818807087247, + "rewards/rejected": -7.412560356987847, + "step": 1508 + }, + { + "epoch": 0.5570578192053897, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 4.217537032238784e-06, + "logits/chosen": 2162638848.0, + "logits/rejected": 2556953344.0, + "logps/chosen": -241.213623046875, + "logps/rejected": -347.8073425292969, + "loss": 0.1735, + "rewards/chosen": 1.0778923034667969, + "rewards/margins": 7.886093616485596, + "rewards/rejected": -6.808201313018799, + "step": 1509 + }, + { + "epoch": 0.5574269761432329, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 4.21172475152503e-06, + "logits/chosen": 1376016256.0, + "logits/rejected": 1438879616.0, + "logps/chosen": -295.63714599609375, + "logps/rejected": -470.5392150878906, + "loss": 0.1669, + "rewards/chosen": 1.0859800577163696, + "rewards/margins": 9.749186158180237, + "rewards/rejected": -8.663206100463867, + "step": 1510 + }, + { + "epoch": 0.5577961330810761, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 4.205913562956619e-06, + "logits/chosen": 1190929709.1764705, + "logits/rejected": 1279852544.0, + "logps/chosen": -258.5443761488971, + "logps/rejected": -482.1638671875, + "loss": 0.1537, + "rewards/chosen": 1.3696163401884192, + "rewards/margins": 9.197394457050398, + "rewards/rejected": -7.827778116861979, + "step": 1511 + }, + { + "epoch": 0.5581652900189193, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 4.200103474584877e-06, + "logits/chosen": 2668650868.3636365, + "logits/rejected": 2199238363.428571, + "logps/chosen": -263.77066317471593, + "logps/rejected": -491.51283482142856, + "loss": 0.1393, + "rewards/chosen": 0.9577839591286399, + "rewards/margins": 8.081058262746572, + "rewards/rejected": -7.123274303617931, + "step": 1512 + }, + { + "epoch": 0.5585344469567625, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 4.194294494459607e-06, + "logits/chosen": 1860674258.8235295, + "logits/rejected": 1657595494.4, + "logps/chosen": -288.3760627297794, + "logps/rejected": -515.6396158854167, + "loss": 0.1533, + "rewards/chosen": 1.331414503209731, + "rewards/margins": 8.592055474075616, + "rewards/rejected": -7.260640970865885, + "step": 1513 + }, + { + "epoch": 0.5589036038946057, + "grad_norm": 12.625, + "kl": 0.6254892349243164, + "learning_rate": 4.188486630629082e-06, + "logits/chosen": 3019441220.266667, + "logits/rejected": 2114047759.0588236, + "logps/chosen": -306.73235677083335, + "logps/rejected": -495.38982077205884, + "loss": 0.1797, + "rewards/chosen": 0.9567485173543294, + "rewards/margins": 8.294745953877767, + "rewards/rejected": -7.3379974365234375, + "step": 1514 + }, + { + "epoch": 0.5592727608324489, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.1826798911400186e-06, + "logits/chosen": 1688798916.9230769, + "logits/rejected": 1738730118.7368422, + "logps/chosen": -176.29332557091345, + "logps/rejected": -570.9263466282895, + "loss": 0.0567, + "rewards/chosen": 2.8182584322415867, + "rewards/margins": 11.744903811559022, + "rewards/rejected": -8.926645379317435, + "step": 1515 + }, + { + "epoch": 0.559641917770292, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 4.176874284037581e-06, + "logits/chosen": 1985175070.1176472, + "logits/rejected": 1643904204.8, + "logps/chosen": -343.3421415441176, + "logps/rejected": -509.4632161458333, + "loss": 0.1244, + "rewards/chosen": 1.9928618038401884, + "rewards/margins": 10.8578306759105, + "rewards/rejected": -8.864968872070312, + "step": 1516 + }, + { + "epoch": 0.5600110747081353, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 4.171069817365365e-06, + "logits/chosen": 1836184917.3333333, + "logits/rejected": 2750208000.0, + "logps/chosen": -235.40033637152777, + "logps/rejected": -453.9974888392857, + "loss": 0.1851, + "rewards/chosen": 1.062021255493164, + "rewards/margins": 7.664433888026646, + "rewards/rejected": -6.602412632533482, + "step": 1517 + }, + { + "epoch": 0.5603802316459785, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.165266499165387e-06, + "logits/chosen": 2010859520.0, + "logits/rejected": 2540120356.571429, + "logps/chosen": -233.37749565972223, + "logps/rejected": -528.7618582589286, + "loss": 0.1866, + "rewards/chosen": 1.089145342508952, + "rewards/margins": 9.096690813700357, + "rewards/rejected": -8.007545471191406, + "step": 1518 + }, + { + "epoch": 0.5607493885838217, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 4.159464337478068e-06, + "logits/chosen": 1320502814.1176472, + "logits/rejected": 1959949243.7333333, + "logps/chosen": -326.94324448529414, + "logps/rejected": -532.4450520833333, + "loss": 0.1771, + "rewards/chosen": 1.6013469696044922, + "rewards/margins": 9.365592575073242, + "rewards/rejected": -7.76424560546875, + "step": 1519 + }, + { + "epoch": 0.5611185455216648, + "grad_norm": 11.375, + "kl": 1.2585926055908203, + "learning_rate": 4.15366334034223e-06, + "logits/chosen": 1633639484.235294, + "logits/rejected": 1384327850.6666667, + "logps/chosen": -238.60213694852942, + "logps/rejected": -460.4052408854167, + "loss": 0.1474, + "rewards/chosen": 1.5871229732737822, + "rewards/margins": 9.38014004277248, + "rewards/rejected": -7.793017069498698, + "step": 1520 + }, + { + "epoch": 0.5614877024595081, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 4.147863515795083e-06, + "logits/chosen": 1692486587.7333333, + "logits/rejected": 2239067557.647059, + "logps/chosen": -319.2630859375, + "logps/rejected": -331.0222598805147, + "loss": 0.1586, + "rewards/chosen": 1.2151695251464845, + "rewards/margins": 7.0623195423799405, + "rewards/rejected": -5.847150017233456, + "step": 1521 + }, + { + "epoch": 0.5618568593973513, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 4.142064871872208e-06, + "logits/chosen": 1563283937.8823528, + "logits/rejected": 1958591965.8666666, + "logps/chosen": -254.73316865808823, + "logps/rejected": -403.21119791666666, + "loss": 0.111, + "rewards/chosen": 2.3167957979090072, + "rewards/margins": 9.697942546769685, + "rewards/rejected": -7.381146748860677, + "step": 1522 + }, + { + "epoch": 0.5622260163351945, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.136267416607552e-06, + "logits/chosen": 1953705425.4545455, + "logits/rejected": 2055680819.2, + "logps/chosen": -286.17924360795456, + "logps/rejected": -612.01015625, + "loss": 0.1715, + "rewards/chosen": 1.6286875984885476, + "rewards/margins": 11.59086868979714, + "rewards/rejected": -9.962181091308594, + "step": 1523 + }, + { + "epoch": 0.5625951732730377, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 4.130471158033418e-06, + "logits/chosen": 1641609856.0, + "logits/rejected": 1624830720.0, + "logps/chosen": -300.0520324707031, + "logps/rejected": -425.9640197753906, + "loss": 0.184, + "rewards/chosen": 0.7831243276596069, + "rewards/margins": 8.727392554283142, + "rewards/rejected": -7.944268226623535, + "step": 1524 + }, + { + "epoch": 0.5629643302108809, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 4.124676104180447e-06, + "logits/chosen": 1881228160.0, + "logits/rejected": 1526547200.0, + "logps/chosen": -311.18035888671875, + "logps/rejected": -456.3099670410156, + "loss": 0.1017, + "rewards/chosen": 2.2032275199890137, + "rewards/margins": 10.888800144195557, + "rewards/rejected": -8.685572624206543, + "step": 1525 + }, + { + "epoch": 0.5633334871487241, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 4.118882263077613e-06, + "logits/chosen": 2354030182.4, + "logits/rejected": 1936589917.090909, + "logps/chosen": -261.5396240234375, + "logps/rejected": -438.56276633522725, + "loss": 0.0905, + "rewards/chosen": 1.961859893798828, + "rewards/margins": 9.679449532248757, + "rewards/rejected": -7.717589638449929, + "step": 1526 + }, + { + "epoch": 0.5637026440865673, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.113089642752208e-06, + "logits/chosen": 1495322214.4, + "logits/rejected": 1862457344.0, + "logps/chosen": -236.2589599609375, + "logps/rejected": -423.2444661458333, + "loss": 0.1676, + "rewards/chosen": 1.6972524642944335, + "rewards/margins": 9.598349444071452, + "rewards/rejected": -7.9010969797770185, + "step": 1527 + }, + { + "epoch": 0.5640718010244105, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 4.107298251229837e-06, + "logits/chosen": 1640966505.4117646, + "logits/rejected": 1833704652.8, + "logps/chosen": -241.60678998161765, + "logps/rejected": -602.0502604166667, + "loss": 0.1282, + "rewards/chosen": 2.223661983714384, + "rewards/margins": 11.33277961880553, + "rewards/rejected": -9.109117635091145, + "step": 1528 + }, + { + "epoch": 0.5644409579622537, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 4.101508096534394e-06, + "logits/chosen": 1381328896.0, + "logits/rejected": 1609009948.4444444, + "logps/chosen": -201.49030412946428, + "logps/rejected": -436.8237575954861, + "loss": 0.1026, + "rewards/chosen": 2.2058869770595004, + "rewards/margins": 10.703109877450125, + "rewards/rejected": -8.497222900390625, + "step": 1529 + }, + { + "epoch": 0.5648101149000969, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 4.095719186688071e-06, + "logits/chosen": 1575283003.0769231, + "logits/rejected": 1873326080.0, + "logps/chosen": -289.91977163461536, + "logps/rejected": -426.9222347861842, + "loss": 0.0954, + "rewards/chosen": 2.250943110539363, + "rewards/margins": 10.175228196117077, + "rewards/rejected": -7.924285085577714, + "step": 1530 + }, + { + "epoch": 0.5651792718379401, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 4.0899315297113255e-06, + "logits/chosen": 1359450248.5333333, + "logits/rejected": 1556692389.6470587, + "logps/chosen": -219.409375, + "logps/rejected": -471.14651309742646, + "loss": 0.1251, + "rewards/chosen": 1.6125633239746093, + "rewards/margins": 9.503883047664866, + "rewards/rejected": -7.891319723690257, + "step": 1531 + }, + { + "epoch": 0.5655484287757833, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 4.084145133622883e-06, + "logits/chosen": 2080294784.0, + "logits/rejected": 1509567616.0, + "logps/chosen": -264.4786376953125, + "logps/rejected": -468.321533203125, + "loss": 0.1003, + "rewards/chosen": 1.8365241289138794, + "rewards/margins": 9.873106122016907, + "rewards/rejected": -8.036581993103027, + "step": 1532 + }, + { + "epoch": 0.5659175857136265, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.0783600064397225e-06, + "logits/chosen": 2188647330.909091, + "logits/rejected": 1663339373.7142856, + "logps/chosen": -286.93545809659093, + "logps/rejected": -438.51488095238096, + "loss": 0.1296, + "rewards/chosen": 0.7982943274758079, + "rewards/margins": 7.720420734190838, + "rewards/rejected": -6.92212640671503, + "step": 1533 + }, + { + "epoch": 0.5662867426514697, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 4.072576156177062e-06, + "logits/chosen": 1791020763.4285715, + "logits/rejected": 1638936462.2222223, + "logps/chosen": -294.62935965401783, + "logps/rejected": -445.60199652777777, + "loss": 0.1303, + "rewards/chosen": 1.2612410954066686, + "rewards/margins": 9.0645688677591, + "rewards/rejected": -7.80332777235243, + "step": 1534 + }, + { + "epoch": 0.566655899589313, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 4.066793590848356e-06, + "logits/chosen": 2737138266.352941, + "logits/rejected": 1627665749.3333333, + "logps/chosen": -253.1959731158088, + "logps/rejected": -507.67975260416665, + "loss": 0.1513, + "rewards/chosen": 1.7087195901309742, + "rewards/margins": 9.411093349082797, + "rewards/rejected": -7.702373758951823, + "step": 1535 + }, + { + "epoch": 0.5670250565271561, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.061012318465272e-06, + "logits/chosen": 1958206557.090909, + "logits/rejected": 2336714947.047619, + "logps/chosen": -206.4761629971591, + "logps/rejected": -454.4915829613095, + "loss": 0.1469, + "rewards/chosen": 0.7115097912875089, + "rewards/margins": 7.4637739049407825, + "rewards/rejected": -6.752264113653274, + "step": 1536 + }, + { + "epoch": 0.5673942134649993, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.0552323470376916e-06, + "logits/chosen": 2396301914.352941, + "logits/rejected": 1558887219.2, + "logps/chosen": -310.0115751378676, + "logps/rejected": -523.8825520833333, + "loss": 0.1323, + "rewards/chosen": 1.5916834438548368, + "rewards/margins": 12.304236340990254, + "rewards/rejected": -10.712552897135417, + "step": 1537 + }, + { + "epoch": 0.5677633704028425, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 4.049453684573693e-06, + "logits/chosen": 1788498883.764706, + "logits/rejected": 1591486054.4, + "logps/chosen": -347.23095703125, + "logps/rejected": -371.15849609375, + "loss": 0.1947, + "rewards/chosen": 0.8887087317074046, + "rewards/margins": 6.611879202898811, + "rewards/rejected": -5.723170471191406, + "step": 1538 + }, + { + "epoch": 0.5681325273406858, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 4.043676339079536e-06, + "logits/chosen": 1943262601.8461537, + "logits/rejected": 2072485888.0, + "logps/chosen": -264.23910757211536, + "logps/rejected": -405.5697985197368, + "loss": 0.1507, + "rewards/chosen": 1.0452357805692232, + "rewards/margins": 8.734357227680654, + "rewards/rejected": -7.689121447111431, + "step": 1539 + }, + { + "epoch": 0.5685016842785289, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 4.037900318559661e-06, + "logits/chosen": 1415656825.2631578, + "logits/rejected": 2613896900.923077, + "logps/chosen": -289.8593236019737, + "logps/rejected": -664.6171875, + "loss": 0.1132, + "rewards/chosen": 2.3663962514776933, + "rewards/margins": 14.219164195813631, + "rewards/rejected": -11.852767944335938, + "step": 1540 + }, + { + "epoch": 0.5688708412163721, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.032125631016672e-06, + "logits/chosen": 1505131588.2666667, + "logits/rejected": 1471788815.0588236, + "logps/chosen": -230.00699869791666, + "logps/rejected": -495.8971737132353, + "loss": 0.153, + "rewards/chosen": 1.089273198445638, + "rewards/margins": 8.667426023296281, + "rewards/rejected": -7.578152824850643, + "step": 1541 + }, + { + "epoch": 0.5692399981542153, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 4.026352284451326e-06, + "logits/chosen": 1715256064.0, + "logits/rejected": 2201430016.0, + "logps/chosen": -300.9371032714844, + "logps/rejected": -537.8230590820312, + "loss": 0.1503, + "rewards/chosen": 1.535867691040039, + "rewards/margins": 9.987276077270508, + "rewards/rejected": -8.451408386230469, + "step": 1542 + }, + { + "epoch": 0.5696091550920586, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 4.020580286862517e-06, + "logits/chosen": 1801768618.6666667, + "logits/rejected": 1846642395.4285715, + "logps/chosen": -314.84160698784723, + "logps/rejected": -492.2827845982143, + "loss": 0.1689, + "rewards/chosen": 1.209504657321506, + "rewards/margins": 9.297473756093828, + "rewards/rejected": -8.087969098772321, + "step": 1543 + }, + { + "epoch": 0.5699783120299017, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 4.014809646247278e-06, + "logits/chosen": 2343066282.6666665, + "logits/rejected": 2576883712.0, + "logps/chosen": -373.53411458333335, + "logps/rejected": -467.5193876378676, + "loss": 0.1151, + "rewards/chosen": 1.4984887440999348, + "rewards/margins": 9.165083305508483, + "rewards/rejected": -7.666594561408548, + "step": 1544 + }, + { + "epoch": 0.5703474689677449, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 4.009040370600759e-06, + "logits/chosen": 1277676612.2666667, + "logits/rejected": 1454903175.5294118, + "logps/chosen": -242.98821614583332, + "logps/rejected": -417.9856962316176, + "loss": 0.1342, + "rewards/chosen": 1.6356238047281901, + "rewards/margins": 10.882323418411554, + "rewards/rejected": -9.246699613683363, + "step": 1545 + }, + { + "epoch": 0.5707166259055881, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 4.003272467916214e-06, + "logits/chosen": 1435167857.7777777, + "logits/rejected": 1678916754.2857144, + "logps/chosen": -291.91889105902777, + "logps/rejected": -499.11886160714283, + "loss": 0.1414, + "rewards/chosen": 1.6072818968031142, + "rewards/margins": 10.361424658033583, + "rewards/rejected": -8.754142761230469, + "step": 1546 + }, + { + "epoch": 0.5710857828434314, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 3.9975059461850035e-06, + "logits/chosen": 1790909696.0, + "logits/rejected": 1760119296.0, + "logps/chosen": -252.72518920898438, + "logps/rejected": -430.09063720703125, + "loss": 0.1905, + "rewards/chosen": 0.8293647170066833, + "rewards/margins": 8.052061975002289, + "rewards/rejected": -7.2226972579956055, + "step": 1547 + }, + { + "epoch": 0.5714549397812745, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 3.991740813396568e-06, + "logits/chosen": 1547518088.5333333, + "logits/rejected": 1914836028.235294, + "logps/chosen": -260.4543782552083, + "logps/rejected": -586.4466337316177, + "loss": 0.1132, + "rewards/chosen": 1.4883832295735677, + "rewards/margins": 12.047690552356197, + "rewards/rejected": -10.55930732278263, + "step": 1548 + }, + { + "epoch": 0.5718240967191177, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 3.985977077538426e-06, + "logits/chosen": 1830117171.2, + "logits/rejected": 1598192546.909091, + "logps/chosen": -334.0249267578125, + "logps/rejected": -508.8903142755682, + "loss": 0.1191, + "rewards/chosen": 0.9449188232421875, + "rewards/margins": 11.552726329456675, + "rewards/rejected": -10.607807506214488, + "step": 1549 + }, + { + "epoch": 0.5721932536569609, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 3.980214746596159e-06, + "logits/chosen": 1517594331.4285715, + "logits/rejected": 1532668586.6666667, + "logps/chosen": -274.52029854910717, + "logps/rejected": -373.99305555555554, + "loss": 0.0998, + "rewards/chosen": 2.4573797498430525, + "rewards/margins": 9.659817256624736, + "rewards/rejected": -7.202437506781684, + "step": 1550 + }, + { + "epoch": 0.5725624105948041, + "grad_norm": 12.875, + "kl": 0.29137325286865234, + "learning_rate": 3.974453828553404e-06, + "logits/chosen": 1713197465.6, + "logits/rejected": 1949724672.0, + "logps/chosen": -305.6041259765625, + "logps/rejected": -642.437744140625, + "loss": 0.1743, + "rewards/chosen": 1.3201104164123536, + "rewards/margins": 10.507301425933838, + "rewards/rejected": -9.187191009521484, + "step": 1551 + }, + { + "epoch": 0.5729315675326473, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 3.9686943313918405e-06, + "logits/chosen": 1865638400.0, + "logits/rejected": 1700111360.0, + "logps/chosen": -197.0867919921875, + "logps/rejected": -430.3459167480469, + "loss": 0.1167, + "rewards/chosen": 1.9035696983337402, + "rewards/margins": 8.86074447631836, + "rewards/rejected": -6.957174777984619, + "step": 1552 + }, + { + "epoch": 0.5733007244704905, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.962936263091179e-06, + "logits/chosen": 1894816768.0, + "logits/rejected": 1478858624.0, + "logps/chosen": -277.934326171875, + "logps/rejected": -548.9337768554688, + "loss": 0.1542, + "rewards/chosen": 1.0972683429718018, + "rewards/margins": 10.93404507637024, + "rewards/rejected": -9.836776733398438, + "step": 1553 + }, + { + "epoch": 0.5736698814083337, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 3.957179631629148e-06, + "logits/chosen": 2017436160.0, + "logits/rejected": 1935501568.0, + "logps/chosen": -285.2598571777344, + "logps/rejected": -459.12176513671875, + "loss": 0.118, + "rewards/chosen": 1.4929039478302002, + "rewards/margins": 9.293976068496704, + "rewards/rejected": -7.801072120666504, + "step": 1554 + }, + { + "epoch": 0.5740390383461769, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 3.9514244449814886e-06, + "logits/chosen": 1768695808.0, + "logits/rejected": 1571050402.909091, + "logps/chosen": -327.0034423828125, + "logps/rejected": -398.3634144176136, + "loss": 0.1318, + "rewards/chosen": 0.9805108070373535, + "rewards/margins": 7.15852122740312, + "rewards/rejected": -6.178010420365767, + "step": 1555 + }, + { + "epoch": 0.5744081952840201, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 3.945670711121939e-06, + "logits/chosen": 1516246076.235294, + "logits/rejected": 1621373064.5333333, + "logps/chosen": -317.91871553308823, + "logps/rejected": -544.641796875, + "loss": 0.1208, + "rewards/chosen": 1.678267422844382, + "rewards/margins": 11.20662639094334, + "rewards/rejected": -9.528358968098958, + "step": 1556 + }, + { + "epoch": 0.5747773522218633, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 3.939918438022224e-06, + "logits/chosen": 2176443278.2222223, + "logits/rejected": 2094270902.857143, + "logps/chosen": -310.28521050347223, + "logps/rejected": -528.6587960379464, + "loss": 0.2154, + "rewards/chosen": 0.9104723400539823, + "rewards/margins": 9.24463437095521, + "rewards/rejected": -8.334162030901227, + "step": 1557 + }, + { + "epoch": 0.5751465091597066, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.934167633652045e-06, + "logits/chosen": 1528403072.0, + "logits/rejected": 1750707456.0, + "logps/chosen": -288.1640319824219, + "logps/rejected": -455.60516357421875, + "loss": 0.1817, + "rewards/chosen": 1.2104073762893677, + "rewards/margins": 8.753005146980286, + "rewards/rejected": -7.542597770690918, + "step": 1558 + }, + { + "epoch": 0.5755156660975497, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 3.928418305979069e-06, + "logits/chosen": 1260770725.6470587, + "logits/rejected": 1175106218.6666667, + "logps/chosen": -264.6963752297794, + "logps/rejected": -327.77545572916665, + "loss": 0.1846, + "rewards/chosen": 0.9099947985480813, + "rewards/margins": 8.38090747758454, + "rewards/rejected": -7.470912679036458, + "step": 1559 + }, + { + "epoch": 0.5758848230353929, + "grad_norm": 10.875, + "kl": 0.7470197677612305, + "learning_rate": 3.922670462968914e-06, + "logits/chosen": 1567581934.9333334, + "logits/rejected": 1968781191.5294118, + "logps/chosen": -288.87766927083334, + "logps/rejected": -428.13381778492646, + "loss": 0.1227, + "rewards/chosen": 1.731033198038737, + "rewards/margins": 9.660867900474399, + "rewards/rejected": -7.929834702435662, + "step": 1560 + }, + { + "epoch": 0.5762539799732361, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 3.916924112585146e-06, + "logits/chosen": 1436531234.1333334, + "logits/rejected": 1989329739.2941177, + "logps/chosen": -313.55384114583336, + "logps/rejected": -521.0681295955883, + "loss": 0.1495, + "rewards/chosen": 1.613967514038086, + "rewards/margins": 9.617485113704905, + "rewards/rejected": -8.00351759966682, + "step": 1561 + }, + { + "epoch": 0.5766231369110794, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.9111792627892605e-06, + "logits/chosen": 1339692416.0, + "logits/rejected": 1596774784.0, + "logps/chosen": -186.4196319580078, + "logps/rejected": -363.9350891113281, + "loss": 0.1668, + "rewards/chosen": 1.2205349206924438, + "rewards/margins": 7.6356121301651, + "rewards/rejected": -6.415077209472656, + "step": 1562 + }, + { + "epoch": 0.5769922938489225, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 3.905435921540672e-06, + "logits/chosen": 1519513014.857143, + "logits/rejected": 3356310869.3333335, + "logps/chosen": -254.40403529575892, + "logps/rejected": -482.50396050347223, + "loss": 0.1023, + "rewards/chosen": 1.7023793629237585, + "rewards/margins": 10.642422494434175, + "rewards/rejected": -8.940043131510416, + "step": 1563 + }, + { + "epoch": 0.5773614507867657, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.899694096796709e-06, + "logits/chosen": 1304114984.4210527, + "logits/rejected": 1401685228.3076923, + "logps/chosen": -199.73049444901315, + "logps/rejected": -508.8992262620192, + "loss": 0.1293, + "rewards/chosen": 1.733799984580592, + "rewards/margins": 13.432070342152707, + "rewards/rejected": -11.698270357572115, + "step": 1564 + }, + { + "epoch": 0.5777306077246089, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 3.893953796512596e-06, + "logits/chosen": 2157182329.263158, + "logits/rejected": 2069086523.0769231, + "logps/chosen": -311.5898694490132, + "logps/rejected": -471.2617938701923, + "loss": 0.1493, + "rewards/chosen": 1.803490488152755, + "rewards/margins": 9.773970600081842, + "rewards/rejected": -7.970480111929087, + "step": 1565 + }, + { + "epoch": 0.5780997646624522, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 3.8882150286414455e-06, + "logits/chosen": 1589385801.142857, + "logits/rejected": 1557333219.5555556, + "logps/chosen": -179.34359305245536, + "logps/rejected": -405.90245225694446, + "loss": 0.1368, + "rewards/chosen": 1.304563249860491, + "rewards/margins": 8.853727068219866, + "rewards/rejected": -7.549163818359375, + "step": 1566 + }, + { + "epoch": 0.5784689216002953, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.882477801134247e-06, + "logits/chosen": 1416460672.0, + "logits/rejected": 2006638464.0, + "logps/chosen": -215.00173950195312, + "logps/rejected": -567.704833984375, + "loss": 0.1558, + "rewards/chosen": 1.4531410932540894, + "rewards/margins": 12.638487696647644, + "rewards/rejected": -11.185346603393555, + "step": 1567 + }, + { + "epoch": 0.5788380785381385, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 3.876742121939857e-06, + "logits/chosen": 1802971243.7894738, + "logits/rejected": 1339846025.8461537, + "logps/chosen": -286.4138826069079, + "logps/rejected": -465.7531550480769, + "loss": 0.234, + "rewards/chosen": 1.4533979516280324, + "rewards/margins": 7.950328000643958, + "rewards/rejected": -6.496930049015925, + "step": 1568 + }, + { + "epoch": 0.5792072354759817, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 3.871007999004986e-06, + "logits/chosen": 1692967302.0952382, + "logits/rejected": 1987839255.2727273, + "logps/chosen": -314.02562313988096, + "logps/rejected": -498.1751154119318, + "loss": 0.1877, + "rewards/chosen": 1.4399127051943825, + "rewards/margins": 8.967443722151058, + "rewards/rejected": -7.527531016956676, + "step": 1569 + }, + { + "epoch": 0.579576392413825, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 3.8652754402741896e-06, + "logits/chosen": 1949875712.0, + "logits/rejected": 2495449088.0, + "logps/chosen": -222.5999298095703, + "logps/rejected": -416.4027099609375, + "loss": 0.1468, + "rewards/chosen": 1.332458734512329, + "rewards/margins": 7.95473837852478, + "rewards/rejected": -6.622279644012451, + "step": 1570 + }, + { + "epoch": 0.5799455493516681, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 3.859544453689853e-06, + "logits/chosen": 1866519250.8235295, + "logits/rejected": 1775329826.1333334, + "logps/chosen": -269.8478573069853, + "logps/rejected": -514.1328125, + "loss": 0.1406, + "rewards/chosen": 1.4125329185934627, + "rewards/margins": 9.948137776991901, + "rewards/rejected": -8.535604858398438, + "step": 1571 + }, + { + "epoch": 0.5803147062895113, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 3.853815047192188e-06, + "logits/chosen": 1932852224.0, + "logits/rejected": 2119603072.0, + "logps/chosen": -271.93328857421875, + "logps/rejected": -430.01654052734375, + "loss": 0.1535, + "rewards/chosen": 1.2949538230895996, + "rewards/margins": 9.710268497467041, + "rewards/rejected": -8.415314674377441, + "step": 1572 + }, + { + "epoch": 0.5806838632273545, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 3.848087228719212e-06, + "logits/chosen": 2374606028.8, + "logits/rejected": 1484712899.764706, + "logps/chosen": -239.20037434895832, + "logps/rejected": -547.8754595588235, + "loss": 0.1508, + "rewards/chosen": 0.9857496897379557, + "rewards/margins": 9.940555849262312, + "rewards/rejected": -8.954806159524356, + "step": 1573 + }, + { + "epoch": 0.5810530201651978, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 3.842361006206745e-06, + "logits/chosen": 1910764001.8823528, + "logits/rejected": 2423645798.4, + "logps/chosen": -267.0335477941176, + "logps/rejected": -598.5856770833333, + "loss": 0.114, + "rewards/chosen": 1.7750677220961626, + "rewards/margins": 12.003065565520641, + "rewards/rejected": -10.227997843424479, + "step": 1574 + }, + { + "epoch": 0.5814221771030409, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.836636387588397e-06, + "logits/chosen": 2274053198.769231, + "logits/rejected": 1890557736.4210527, + "logps/chosen": -339.2889873798077, + "logps/rejected": -494.69849917763156, + "loss": 0.1488, + "rewards/chosen": 0.9386095633873572, + "rewards/margins": 10.422701874242621, + "rewards/rejected": -9.484092310855264, + "step": 1575 + }, + { + "epoch": 0.5817913340408841, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 3.830913380795554e-06, + "logits/chosen": 1178883754.6666667, + "logits/rejected": 2070981427.2, + "logps/chosen": -328.34130859375, + "logps/rejected": -535.059814453125, + "loss": 0.108, + "rewards/chosen": 1.5476398468017578, + "rewards/margins": 9.928046798706054, + "rewards/rejected": -8.380406951904297, + "step": 1576 + }, + { + "epoch": 0.5821604909787274, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.825191993757368e-06, + "logits/chosen": 1588269641.142857, + "logits/rejected": 2296671118.2222223, + "logps/chosen": -272.9706333705357, + "logps/rejected": -521.7167426215278, + "loss": 0.1177, + "rewards/chosen": 2.016730717250279, + "rewards/margins": 9.468982575431703, + "rewards/rejected": -7.452251858181423, + "step": 1577 + }, + { + "epoch": 0.5825296479165706, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 3.819472234400749e-06, + "logits/chosen": 1487237120.0, + "logits/rejected": 1567238866.8235295, + "logps/chosen": -252.5236328125, + "logps/rejected": -411.70071231617646, + "loss": 0.1425, + "rewards/chosen": 1.734966786702474, + "rewards/margins": 9.760530643837125, + "rewards/rejected": -8.025563857134651, + "step": 1578 + }, + { + "epoch": 0.5828988048544137, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 3.813754110650352e-06, + "logits/chosen": 2113338982.4, + "logits/rejected": 1766874282.6666667, + "logps/chosen": -244.8400390625, + "logps/rejected": -497.9972737630208, + "loss": 0.1681, + "rewards/chosen": 1.2866942405700683, + "rewards/margins": 9.027226797739665, + "rewards/rejected": -7.740532557169597, + "step": 1579 + }, + { + "epoch": 0.5832679617922569, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 3.8080376304285615e-06, + "logits/chosen": 2229099264.0, + "logits/rejected": 1642319744.0, + "logps/chosen": -199.20953369140625, + "logps/rejected": -375.13690185546875, + "loss": 0.1379, + "rewards/chosen": 1.548970341682434, + "rewards/margins": 9.207805275917053, + "rewards/rejected": -7.658834934234619, + "step": 1580 + }, + { + "epoch": 0.5836371187301002, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 3.8023228016554913e-06, + "logits/chosen": 2167902208.0, + "logits/rejected": 1973773516.8, + "logps/chosen": -425.531982421875, + "logps/rejected": -518.330810546875, + "loss": 0.083, + "rewards/chosen": 1.8060329755147297, + "rewards/margins": 10.585824998219808, + "rewards/rejected": -8.779792022705077, + "step": 1581 + }, + { + "epoch": 0.5840062756679434, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 3.7966096322489637e-06, + "logits/chosen": 2387021101.1764708, + "logits/rejected": 1867009774.9333334, + "logps/chosen": -290.9366096047794, + "logps/rejected": -571.2069661458333, + "loss": 0.1599, + "rewards/chosen": 1.7168055141673368, + "rewards/margins": 10.246792595059263, + "rewards/rejected": -8.529987080891926, + "step": 1582 + }, + { + "epoch": 0.5843754326057865, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 3.790898130124503e-06, + "logits/chosen": 1570446389.8947368, + "logits/rejected": 2301984452.923077, + "logps/chosen": -308.90262643914474, + "logps/rejected": -718.6134314903846, + "loss": 0.1554, + "rewards/chosen": 1.4086757459138568, + "rewards/margins": 12.920874560892823, + "rewards/rejected": -11.512198814978966, + "step": 1583 + }, + { + "epoch": 0.5847445895436297, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 3.7851883031953197e-06, + "logits/chosen": 2328689777.7777777, + "logits/rejected": 2394233417.142857, + "logps/chosen": -340.23280164930554, + "logps/rejected": -472.97471400669644, + "loss": 0.1963, + "rewards/chosen": 1.1096720165676541, + "rewards/margins": 9.040275043911404, + "rewards/rejected": -7.93060302734375, + "step": 1584 + }, + { + "epoch": 0.585113746481473, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 3.7794801593723075e-06, + "logits/chosen": 1732090538.6666667, + "logits/rejected": 1372093138.8235295, + "logps/chosen": -339.98330078125, + "logps/rejected": -480.92190372242646, + "loss": 0.1551, + "rewards/chosen": 1.0140143076578776, + "rewards/margins": 8.951101422777363, + "rewards/rejected": -7.9370871151194855, + "step": 1585 + }, + { + "epoch": 0.5854829034193162, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 3.773773706564029e-06, + "logits/chosen": 1485082192.8421052, + "logits/rejected": 1799992556.3076923, + "logps/chosen": -331.82509251644734, + "logps/rejected": -463.9473407451923, + "loss": 0.219, + "rewards/chosen": 0.7320104900159334, + "rewards/margins": 7.550414467630116, + "rewards/rejected": -6.8184039776141825, + "step": 1586 + }, + { + "epoch": 0.5858520603571593, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 3.768068952676701e-06, + "logits/chosen": 1802293679.1578948, + "logits/rejected": 2037993787.0769231, + "logps/chosen": -277.15406558388156, + "logps/rejected": -624.2053034855769, + "loss": 0.1561, + "rewards/chosen": 1.6663026307758533, + "rewards/margins": 10.73938812224971, + "rewards/rejected": -9.073085491473858, + "step": 1587 + }, + { + "epoch": 0.5862212172950025, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 3.762365905614187e-06, + "logits/chosen": 2290237440.0, + "logits/rejected": 1869202152.7272727, + "logps/chosen": -384.3370361328125, + "logps/rejected": -406.44997336647725, + "loss": 0.0833, + "rewards/chosen": 1.752349853515625, + "rewards/margins": 8.75925653631037, + "rewards/rejected": -7.006906682794744, + "step": 1588 + }, + { + "epoch": 0.5865903742328458, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 3.756664573277987e-06, + "logits/chosen": 1092833988.9230769, + "logits/rejected": 1958877722.9473684, + "logps/chosen": -306.7130784254808, + "logps/rejected": -407.5224095394737, + "loss": 0.1122, + "rewards/chosen": 1.7762223757230318, + "rewards/margins": 8.051023128061642, + "rewards/rejected": -6.27480075233861, + "step": 1589 + }, + { + "epoch": 0.5869595311706889, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 3.750964963567225e-06, + "logits/chosen": 1485822072.4705882, + "logits/rejected": 2058248055.4666667, + "logps/chosen": -259.1611328125, + "logps/rejected": -471.97776692708334, + "loss": 0.1071, + "rewards/chosen": 1.9823496201459099, + "rewards/margins": 9.721223928414139, + "rewards/rejected": -7.738874308268229, + "step": 1590 + }, + { + "epoch": 0.5873286881085321, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 3.745267084378636e-06, + "logits/chosen": 1985070788.9230769, + "logits/rejected": 1676046336.0, + "logps/chosen": -280.8149226262019, + "logps/rejected": -466.1371813322368, + "loss": 0.1089, + "rewards/chosen": 1.7290521768423228, + "rewards/margins": 8.870203203517898, + "rewards/rejected": -7.141151026675575, + "step": 1591 + }, + { + "epoch": 0.5876978450463753, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 3.7395709436065615e-06, + "logits/chosen": 1525194752.0, + "logits/rejected": 1524907836.952381, + "logps/chosen": -231.2474032315341, + "logps/rejected": -534.2500930059524, + "loss": 0.1236, + "rewards/chosen": 0.9222636656327681, + "rewards/margins": 10.456930907773765, + "rewards/rejected": -9.534667242140998, + "step": 1592 + }, + { + "epoch": 0.5880670019842186, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.7338765491429308e-06, + "logits/chosen": 2360051370.6666665, + "logits/rejected": 1852054118.4, + "logps/chosen": -314.39990234375, + "logps/rejected": -462.196728515625, + "loss": 0.1665, + "rewards/chosen": 0.6613660653432211, + "rewards/margins": 8.469656165440878, + "rewards/rejected": -7.808290100097656, + "step": 1593 + }, + { + "epoch": 0.5884361589220617, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 3.728183908877254e-06, + "logits/chosen": 1866152618.6666667, + "logits/rejected": 1489966622.1176472, + "logps/chosen": -334.5380859375, + "logps/rejected": -398.08473115808823, + "loss": 0.1551, + "rewards/chosen": 1.1152141571044922, + "rewards/margins": 10.08610756817986, + "rewards/rejected": -8.970893411075368, + "step": 1594 + }, + { + "epoch": 0.5888053158599049, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 3.7224930306966146e-06, + "logits/chosen": 1770790180.5714285, + "logits/rejected": 1538012160.0, + "logps/chosen": -256.11265345982144, + "logps/rejected": -434.9138454861111, + "loss": 0.141, + "rewards/chosen": 1.1807877676827567, + "rewards/margins": 8.615763285803416, + "rewards/rejected": -7.43497551812066, + "step": 1595 + }, + { + "epoch": 0.5891744727977481, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.7168039224856508e-06, + "logits/chosen": 1823620573.8666666, + "logits/rejected": 1503658947.764706, + "logps/chosen": -189.88141276041668, + "logps/rejected": -456.0886661305147, + "loss": 0.1599, + "rewards/chosen": 1.1929868062337239, + "rewards/margins": 8.466356988046684, + "rewards/rejected": -7.273370181812959, + "step": 1596 + }, + { + "epoch": 0.5895436297355914, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 3.711116592126548e-06, + "logits/chosen": 2275223324.4444447, + "logits/rejected": 1901636315.4285715, + "logps/chosen": -288.420166015625, + "logps/rejected": -561.6064453125, + "loss": 0.1635, + "rewards/chosen": 1.2432937622070312, + "rewards/margins": 9.059691837855748, + "rewards/rejected": -7.816398075648716, + "step": 1597 + }, + { + "epoch": 0.5899127866734345, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.705431047499033e-06, + "logits/chosen": 1918779684.5714285, + "logits/rejected": 1669675121.7777777, + "logps/chosen": -282.221435546875, + "logps/rejected": -403.7073025173611, + "loss": 0.1271, + "rewards/chosen": 1.1876893724714006, + "rewards/margins": 7.251343106466626, + "rewards/rejected": -6.063653733995226, + "step": 1598 + }, + { + "epoch": 0.5902819436112777, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 3.6997472964803545e-06, + "logits/chosen": 2334017536.0, + "logits/rejected": 2015881216.0, + "logps/chosen": -314.538818359375, + "logps/rejected": -592.265625, + "loss": 0.177, + "rewards/chosen": 1.0733004675971136, + "rewards/margins": 9.553823168315585, + "rewards/rejected": -8.48052270071847, + "step": 1599 + }, + { + "epoch": 0.590651100549121, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 3.694065346945278e-06, + "logits/chosen": 1527256268.8, + "logits/rejected": 2078671811.764706, + "logps/chosen": -199.35799153645834, + "logps/rejected": -392.4142635569853, + "loss": 0.1542, + "rewards/chosen": 1.2822962443033854, + "rewards/margins": 7.990234494676777, + "rewards/rejected": -6.707938250373392, + "step": 1600 + }, + { + "epoch": 0.5910202574869642, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 3.6883852067660698e-06, + "logits/chosen": 1595975826.2857144, + "logits/rejected": 1609651882.6666667, + "logps/chosen": -225.39522879464286, + "logps/rejected": -444.8332248263889, + "loss": 0.0907, + "rewards/chosen": 1.9457073211669922, + "rewards/margins": 8.23379241095649, + "rewards/rejected": -6.288085089789496, + "step": 1601 + }, + { + "epoch": 0.5913894144248073, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.682706883812494e-06, + "logits/chosen": 1729417069.7142856, + "logits/rejected": 1793449984.0, + "logps/chosen": -293.70535714285717, + "logps/rejected": -479.0359157986111, + "loss": 0.1479, + "rewards/chosen": 1.781531742640904, + "rewards/margins": 9.554587772914342, + "rewards/rejected": -7.7730560302734375, + "step": 1602 + }, + { + "epoch": 0.5917585713626505, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 3.6770303859517954e-06, + "logits/chosen": 1587471257.6, + "logits/rejected": 2751907157.3333335, + "logps/chosen": -297.7789794921875, + "logps/rejected": -504.7864990234375, + "loss": 0.2125, + "rewards/chosen": 1.0587594032287597, + "rewards/margins": 11.238230737050374, + "rewards/rejected": -10.179471333821615, + "step": 1603 + }, + { + "epoch": 0.5921277283004938, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 3.6713557210486874e-06, + "logits/chosen": 1635579335.1111112, + "logits/rejected": 1723680182.857143, + "logps/chosen": -241.55951605902777, + "logps/rejected": -551.9168178013393, + "loss": 0.1691, + "rewards/chosen": 1.2023677825927734, + "rewards/margins": 10.396503176007952, + "rewards/rejected": -9.194135393415179, + "step": 1604 + }, + { + "epoch": 0.592496885238337, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 3.665682896965349e-06, + "logits/chosen": 1411127055.0588236, + "logits/rejected": 1756322201.6, + "logps/chosen": -304.1891659007353, + "logps/rejected": -536.2367513020833, + "loss": 0.1381, + "rewards/chosen": 1.8095051260555493, + "rewards/margins": 10.466623859779508, + "rewards/rejected": -8.657118733723959, + "step": 1605 + }, + { + "epoch": 0.5928660421761801, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 3.660011921561405e-06, + "logits/chosen": 1720404614.7368422, + "logits/rejected": 1104758705.2307692, + "logps/chosen": -246.00557668585526, + "logps/rejected": -343.7450420673077, + "loss": 0.1699, + "rewards/chosen": 1.6902168675472862, + "rewards/margins": 7.646190566089954, + "rewards/rejected": -5.955973698542668, + "step": 1606 + }, + { + "epoch": 0.5932351991140233, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 3.654342802693918e-06, + "logits/chosen": 1761354820.2666667, + "logits/rejected": 1757254475.2941177, + "logps/chosen": -239.44178059895833, + "logps/rejected": -501.4778262867647, + "loss": 0.1505, + "rewards/chosen": 1.058096694946289, + "rewards/margins": 10.750867260203643, + "rewards/rejected": -9.692770565257353, + "step": 1607 + }, + { + "epoch": 0.5936043560518666, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.6486755482173814e-06, + "logits/chosen": 1878461917.8666666, + "logits/rejected": 1267757899.2941177, + "logps/chosen": -308.8157552083333, + "logps/rejected": -446.3533720128676, + "loss": 0.1547, + "rewards/chosen": 0.912695566813151, + "rewards/margins": 9.5043290830126, + "rewards/rejected": -8.59163351619945, + "step": 1608 + }, + { + "epoch": 0.5939735129897098, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 3.643010165983705e-06, + "logits/chosen": 2033138944.0, + "logits/rejected": 2117401728.0, + "logps/chosen": -348.7762451171875, + "logps/rejected": -419.6701354980469, + "loss": 0.1327, + "rewards/chosen": 1.2867287397384644, + "rewards/margins": 8.818317770957947, + "rewards/rejected": -7.531589031219482, + "step": 1609 + }, + { + "epoch": 0.5943426699275529, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 3.637346663842204e-06, + "logits/chosen": 1390960753.7777777, + "logits/rejected": 1312677010.2857144, + "logps/chosen": -171.384765625, + "logps/rejected": -379.21864536830356, + "loss": 0.1826, + "rewards/chosen": 1.1793013678656683, + "rewards/margins": 8.397883521185982, + "rewards/rejected": -7.2185821533203125, + "step": 1610 + }, + { + "epoch": 0.5947118268653961, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 3.6316850496395863e-06, + "logits/chosen": 1384547629.1764705, + "logits/rejected": 2732548915.2, + "logps/chosen": -247.89341107536765, + "logps/rejected": -530.8527018229166, + "loss": 0.1875, + "rewards/chosen": 0.9287131253410789, + "rewards/margins": 9.062336375666598, + "rewards/rejected": -8.13362325032552, + "step": 1611 + }, + { + "epoch": 0.5950809838032394, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 3.626025331219949e-06, + "logits/chosen": 1755324416.0, + "logits/rejected": 1332481536.0, + "logps/chosen": -230.27447509765625, + "logps/rejected": -428.752197265625, + "loss": 0.1269, + "rewards/chosen": 1.489720106124878, + "rewards/margins": 10.130172491073608, + "rewards/rejected": -8.64045238494873, + "step": 1612 + }, + { + "epoch": 0.5954501407410826, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 3.6203675164247586e-06, + "logits/chosen": 1960239476.3636363, + "logits/rejected": 1685069824.0, + "logps/chosen": -404.1041370738636, + "logps/rejected": -399.2914806547619, + "loss": 0.1432, + "rewards/chosen": 0.5414847894148394, + "rewards/margins": 8.016132674691997, + "rewards/rejected": -7.474647885277157, + "step": 1613 + }, + { + "epoch": 0.5958192976789257, + "grad_norm": 12.875, + "kl": 0.1794290542602539, + "learning_rate": 3.6147116130928462e-06, + "logits/chosen": 1906943707.4285715, + "logits/rejected": 2131812352.0, + "logps/chosen": -275.10525948660717, + "logps/rejected": -719.5894097222222, + "loss": 0.1612, + "rewards/chosen": 0.9627069745744977, + "rewards/margins": 27.13511264134967, + "rewards/rejected": -26.172405666775173, + "step": 1614 + }, + { + "epoch": 0.596188454616769, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 3.609057629060394e-06, + "logits/chosen": 1659486208.0, + "logits/rejected": 2458395170.133333, + "logps/chosen": -237.61684283088235, + "logps/rejected": -577.6611979166667, + "loss": 0.1997, + "rewards/chosen": 0.7374576680800494, + "rewards/margins": 10.268232611113904, + "rewards/rejected": -9.530774943033855, + "step": 1615 + }, + { + "epoch": 0.5965576115546122, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 3.6034055721609256e-06, + "logits/chosen": 1704127307.2941177, + "logits/rejected": 1458442103.4666667, + "logps/chosen": -328.10595703125, + "logps/rejected": -401.8921875, + "loss": 0.1592, + "rewards/chosen": 1.0334874321432674, + "rewards/margins": 8.623036687514361, + "rewards/rejected": -7.589549255371094, + "step": 1616 + }, + { + "epoch": 0.5969267684924554, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 3.5977554502252943e-06, + "logits/chosen": 1646280704.0, + "logits/rejected": 1644158361.6, + "logps/chosen": -214.23799641927084, + "logps/rejected": -452.42373046875, + "loss": 0.1186, + "rewards/chosen": 1.3704083760579426, + "rewards/margins": 10.119994099934896, + "rewards/rejected": -8.749585723876953, + "step": 1617 + }, + { + "epoch": 0.5972959254302985, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 3.5921072710816697e-06, + "logits/chosen": 1247320994.909091, + "logits/rejected": 1428758625.5238094, + "logps/chosen": -265.7921031605114, + "logps/rejected": -417.1030970982143, + "loss": 0.1221, + "rewards/chosen": 1.0696086016568271, + "rewards/margins": 8.47395270624202, + "rewards/rejected": -7.404344104585194, + "step": 1618 + }, + { + "epoch": 0.5976650823681418, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 3.586461042555535e-06, + "logits/chosen": 1294921386.6666667, + "logits/rejected": 1430150582.857143, + "logps/chosen": -220.45357259114584, + "logps/rejected": -410.2202845982143, + "loss": 0.2163, + "rewards/chosen": 0.8426436848110623, + "rewards/margins": 7.124429922255258, + "rewards/rejected": -6.281786237444196, + "step": 1619 + }, + { + "epoch": 0.598034239305985, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 3.5808167724696657e-06, + "logits/chosen": 2034614272.0, + "logits/rejected": 1825465514.6666667, + "logps/chosen": -263.5788818359375, + "logps/rejected": -478.2602945963542, + "loss": 0.1503, + "rewards/chosen": 2.3373882293701174, + "rewards/margins": 10.497080357869468, + "rewards/rejected": -8.15969212849935, + "step": 1620 + }, + { + "epoch": 0.5984033962438282, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 3.5751744686441277e-06, + "logits/chosen": 1451997696.0, + "logits/rejected": 2005204096.0, + "logps/chosen": -301.2395935058594, + "logps/rejected": -493.1173400878906, + "loss": 0.1805, + "rewards/chosen": 0.8699792623519897, + "rewards/margins": 9.395761847496033, + "rewards/rejected": -8.525782585144043, + "step": 1621 + }, + { + "epoch": 0.5987725531816713, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 3.569534138896262e-06, + "logits/chosen": 1949652389.6470587, + "logits/rejected": 3401601843.2, + "logps/chosen": -275.47877412683823, + "logps/rejected": -473.9805013020833, + "loss": 0.1895, + "rewards/chosen": 0.8708892710068646, + "rewards/margins": 8.675771574880562, + "rewards/rejected": -7.804882303873698, + "step": 1622 + }, + { + "epoch": 0.5991417101195146, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 3.5638957910406724e-06, + "logits/chosen": 1627490304.0, + "logits/rejected": 1417314432.0, + "logps/chosen": -264.177001953125, + "logps/rejected": -398.177734375, + "loss": 0.1217, + "rewards/chosen": 2.104872226715088, + "rewards/margins": 7.983559608459473, + "rewards/rejected": -5.878687381744385, + "step": 1623 + }, + { + "epoch": 0.5995108670573578, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 3.5582594328892183e-06, + "logits/chosen": 1858858188.8, + "logits/rejected": 2134355004.235294, + "logps/chosen": -317.9890950520833, + "logps/rejected": -460.6012752757353, + "loss": 0.1499, + "rewards/chosen": 1.0105501810709636, + "rewards/margins": 8.179043564141965, + "rewards/rejected": -7.168493383071002, + "step": 1624 + }, + { + "epoch": 0.5998800239952009, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 3.5526250722510042e-06, + "logits/chosen": 1800915968.0, + "logits/rejected": 1861827490.909091, + "logps/chosen": -286.93388671875, + "logps/rejected": -443.21182528409093, + "loss": 0.0894, + "rewards/chosen": 1.4882620811462401, + "rewards/margins": 9.29027790589766, + "rewards/rejected": -7.802015824751421, + "step": 1625 + }, + { + "epoch": 0.6002491809330441, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 3.546992716932364e-06, + "logits/chosen": 1343113898.6666667, + "logits/rejected": 1297014784.0, + "logps/chosen": -214.76592339409723, + "logps/rejected": -396.94496372767856, + "loss": 0.1694, + "rewards/chosen": 1.4739190207587347, + "rewards/margins": 9.414589579143222, + "rewards/rejected": -7.940670558384487, + "step": 1626 + }, + { + "epoch": 0.6006183378708874, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 3.541362374736852e-06, + "logits/chosen": 2147910314.6666665, + "logits/rejected": 1386161590.857143, + "logps/chosen": -287.7908528645833, + "logps/rejected": -461.8316127232143, + "loss": 0.1492, + "rewards/chosen": 1.7444096671210394, + "rewards/margins": 8.811640981643919, + "rewards/rejected": -7.067231314522879, + "step": 1627 + }, + { + "epoch": 0.6009874948087306, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 3.5357340534652397e-06, + "logits/chosen": 1763874061.4736843, + "logits/rejected": 2878865092.923077, + "logps/chosen": -258.5747327302632, + "logps/rejected": -545.0636268028846, + "loss": 0.1358, + "rewards/chosen": 2.241316745155736, + "rewards/margins": 11.70997753992737, + "rewards/rejected": -9.468660794771635, + "step": 1628 + }, + { + "epoch": 0.6013566517465737, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 3.530107760915493e-06, + "logits/chosen": 2378956800.0, + "logits/rejected": 1614122535.3846154, + "logps/chosen": -254.7918122944079, + "logps/rejected": -381.50232872596155, + "loss": 0.174, + "rewards/chosen": 1.3934964631733142, + "rewards/margins": 7.8971233599581705, + "rewards/rejected": -6.503626896784856, + "step": 1629 + }, + { + "epoch": 0.6017258086844169, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 3.5244835048827686e-06, + "logits/chosen": 1631689889.6842105, + "logits/rejected": 2221806513.230769, + "logps/chosen": -346.6318616365132, + "logps/rejected": -544.2051908052885, + "loss": 0.1875, + "rewards/chosen": 1.1542564191316302, + "rewards/margins": 8.68240595056943, + "rewards/rejected": -7.5281495314378, + "step": 1630 + }, + { + "epoch": 0.6020949656222602, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 3.5188612931594014e-06, + "logits/chosen": 2082609408.0, + "logits/rejected": 2092284672.0, + "logps/chosen": -350.1873474121094, + "logps/rejected": -524.6607666015625, + "loss": 0.1649, + "rewards/chosen": 1.359944462776184, + "rewards/margins": 8.298981308937073, + "rewards/rejected": -6.939036846160889, + "step": 1631 + }, + { + "epoch": 0.6024641225601034, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 3.5132411335348946e-06, + "logits/chosen": 1836533436.631579, + "logits/rejected": 1979641540.9230769, + "logps/chosen": -266.5839586759868, + "logps/rejected": -507.2454176682692, + "loss": 0.2065, + "rewards/chosen": 1.002906598542866, + "rewards/margins": 9.646923744726761, + "rewards/rejected": -8.644017146183895, + "step": 1632 + }, + { + "epoch": 0.6028332794979465, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 3.5076230337959095e-06, + "logits/chosen": 1921999104.0, + "logits/rejected": 2073769472.0, + "logps/chosen": -298.8757019042969, + "logps/rejected": -514.8553466796875, + "loss": 0.1789, + "rewards/chosen": 0.9934766292572021, + "rewards/margins": 10.51366400718689, + "rewards/rejected": -9.520187377929688, + "step": 1633 + }, + { + "epoch": 0.6032024364357897, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.5020070017262515e-06, + "logits/chosen": 1624123801.6, + "logits/rejected": 1496517692.235294, + "logps/chosen": -292.53736979166666, + "logps/rejected": -510.63694852941177, + "loss": 0.1539, + "rewards/chosen": 0.9884806315104167, + "rewards/margins": 9.178919294768688, + "rewards/rejected": -8.190438663258272, + "step": 1634 + }, + { + "epoch": 0.603571593373633, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 3.4963930451068585e-06, + "logits/chosen": 2591898828.8, + "logits/rejected": 1598295722.6666667, + "logps/chosen": -355.4533203125, + "logps/rejected": -439.8095296223958, + "loss": 0.1852, + "rewards/chosen": 1.1918656349182128, + "rewards/margins": 8.744467767079671, + "rewards/rejected": -7.552602132161458, + "step": 1635 + }, + { + "epoch": 0.6039407503114762, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 3.4907811717157993e-06, + "logits/chosen": 2172658627.7647057, + "logits/rejected": 1925869294.9333334, + "logps/chosen": -347.44025735294116, + "logps/rejected": -438.0999348958333, + "loss": 0.1654, + "rewards/chosen": 1.2642085131476908, + "rewards/margins": 9.765701839970609, + "rewards/rejected": -8.501493326822917, + "step": 1636 + }, + { + "epoch": 0.6043099072493193, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 3.4851713893282523e-06, + "logits/chosen": 1232921600.0, + "logits/rejected": 1693002658.909091, + "logps/chosen": -244.3478759765625, + "logps/rejected": -514.0584161931819, + "loss": 0.1162, + "rewards/chosen": 0.9193907737731933, + "rewards/margins": 9.599955654144287, + "rewards/rejected": -8.680564880371094, + "step": 1637 + }, + { + "epoch": 0.6046790641871626, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.479563705716499e-06, + "logits/chosen": 2320270677.3333335, + "logits/rejected": 1127342923.2941177, + "logps/chosen": -292.6070963541667, + "logps/rejected": -474.93103745404414, + "loss": 0.0728, + "rewards/chosen": 3.074571228027344, + "rewards/margins": 13.213877689137178, + "rewards/rejected": -10.139306461109834, + "step": 1638 + }, + { + "epoch": 0.6050482211250058, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.4739581286499147e-06, + "logits/chosen": 2006009976.4705882, + "logits/rejected": 1519261832.5333333, + "logps/chosen": -284.2265625, + "logps/rejected": -405.062109375, + "loss": 0.0995, + "rewards/chosen": 2.3184132295496322, + "rewards/margins": 8.41774387733609, + "rewards/rejected": -6.099330647786458, + "step": 1639 + }, + { + "epoch": 0.605417378062849, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.468354665894955e-06, + "logits/chosen": 2138486422.5882354, + "logits/rejected": 2246447923.2, + "logps/chosen": -277.7363855698529, + "logps/rejected": -421.2583984375, + "loss": 0.1524, + "rewards/chosen": 1.692600923426011, + "rewards/margins": 8.856699027267156, + "rewards/rejected": -7.1640981038411455, + "step": 1640 + }, + { + "epoch": 0.6057865350006921, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 3.4627533252151465e-06, + "logits/chosen": 2658548555.2941175, + "logits/rejected": 2881318638.9333334, + "logps/chosen": -294.2243221507353, + "logps/rejected": -424.55914713541665, + "loss": 0.1667, + "rewards/chosen": 1.2250293282901539, + "rewards/margins": 8.39314354541255, + "rewards/rejected": -7.168114217122396, + "step": 1641 + }, + { + "epoch": 0.6061556919385354, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 3.4571541143710757e-06, + "logits/chosen": 1463200828.235294, + "logits/rejected": 1308167918.9333334, + "logps/chosen": -302.4575769761029, + "logps/rejected": -473.1975911458333, + "loss": 0.1653, + "rewards/chosen": 1.001958173864028, + "rewards/margins": 9.27481278812184, + "rewards/rejected": -8.272854614257813, + "step": 1642 + }, + { + "epoch": 0.6065248488763786, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 3.4515570411203782e-06, + "logits/chosen": 1630996480.0, + "logits/rejected": 1854279907.5555556, + "logps/chosen": -240.64622279575892, + "logps/rejected": -566.2054036458334, + "loss": 0.0924, + "rewards/chosen": 2.261340822492327, + "rewards/margins": 10.89906580485995, + "rewards/rejected": -8.637724982367622, + "step": 1643 + }, + { + "epoch": 0.6068940058142218, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 3.445962113217726e-06, + "logits/chosen": 1701690289.2307692, + "logits/rejected": 1645259614.3157895, + "logps/chosen": -267.75443209134613, + "logps/rejected": -415.60384971217104, + "loss": 0.1131, + "rewards/chosen": 1.8407862736628606, + "rewards/margins": 10.227249639719604, + "rewards/rejected": -8.386463366056743, + "step": 1644 + }, + { + "epoch": 0.6072631627520649, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.440369338414823e-06, + "logits/chosen": 1247563264.0, + "logits/rejected": 1583388330.6666667, + "logps/chosen": -220.256787109375, + "logps/rejected": -463.1741943359375, + "loss": 0.1901, + "rewards/chosen": 1.3647086143493652, + "rewards/margins": 7.72011760075887, + "rewards/rejected": -6.355408986409505, + "step": 1645 + }, + { + "epoch": 0.6076323196899082, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 3.434778724460387e-06, + "logits/chosen": 1939517244.952381, + "logits/rejected": 2035502545.4545455, + "logps/chosen": -285.6201869419643, + "logps/rejected": -424.9036310369318, + "loss": 0.1797, + "rewards/chosen": 1.672899155389695, + "rewards/margins": 9.348340550542394, + "rewards/rejected": -7.675441395152699, + "step": 1646 + }, + { + "epoch": 0.6080014766277514, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 3.4291902791001406e-06, + "logits/chosen": 1618230272.0, + "logits/rejected": 1495346176.0, + "logps/chosen": -300.44677734375, + "logps/rejected": -526.0714721679688, + "loss": 0.1249, + "rewards/chosen": 1.397760033607483, + "rewards/margins": 9.302549958229065, + "rewards/rejected": -7.904789924621582, + "step": 1647 + }, + { + "epoch": 0.6083706335655946, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 3.4236040100768077e-06, + "logits/chosen": 2241304576.0, + "logits/rejected": 2088613205.3333333, + "logps/chosen": -225.5244873046875, + "logps/rejected": -458.0351969401042, + "loss": 0.1229, + "rewards/chosen": 2.1307147979736327, + "rewards/margins": 9.27016487121582, + "rewards/rejected": -7.1394500732421875, + "step": 1648 + }, + { + "epoch": 0.6087397905034377, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 3.4180199251300898e-06, + "logits/chosen": 1719709525.3333333, + "logits/rejected": 1987867852.8, + "logps/chosen": -239.36163330078125, + "logps/rejected": -489.067138671875, + "loss": 0.1186, + "rewards/chosen": 1.1994252999623616, + "rewards/margins": 8.593945868810017, + "rewards/rejected": -7.394520568847656, + "step": 1649 + }, + { + "epoch": 0.609108947441281, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 3.4124380319966665e-06, + "logits/chosen": 2150348068.571429, + "logits/rejected": 1452960654.2222223, + "logps/chosen": -312.0421665736607, + "logps/rejected": -453.3916829427083, + "loss": 0.1599, + "rewards/chosen": 0.8073318345206124, + "rewards/margins": 7.562949188171871, + "rewards/rejected": -6.7556173536512585, + "step": 1650 + }, + { + "epoch": 0.6094781043791242, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 3.406858338410181e-06, + "logits/chosen": 1396103529.4117646, + "logits/rejected": 1509150310.4, + "logps/chosen": -308.63795381433823, + "logps/rejected": -497.2597981770833, + "loss": 0.1669, + "rewards/chosen": 1.4825885997099035, + "rewards/margins": 10.674512631285424, + "rewards/rejected": -9.191924031575521, + "step": 1651 + }, + { + "epoch": 0.6098472613169674, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 3.401280852101228e-06, + "logits/chosen": 1654067931.4285715, + "logits/rejected": 1570852181.3333333, + "logps/chosen": -241.77889578683036, + "logps/rejected": -363.21327039930554, + "loss": 0.1366, + "rewards/chosen": 1.4482773372105189, + "rewards/margins": 7.785082468910823, + "rewards/rejected": -6.336805131700304, + "step": 1652 + }, + { + "epoch": 0.6102164182548105, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 3.3957055807973416e-06, + "logits/chosen": 2258465280.0, + "logits/rejected": 2666235136.0, + "logps/chosen": -357.1292724609375, + "logps/rejected": -620.12353515625, + "loss": 0.1741, + "rewards/chosen": 1.1401983499526978, + "rewards/margins": 9.453250527381897, + "rewards/rejected": -8.3130521774292, + "step": 1653 + }, + { + "epoch": 0.6105855751926538, + "grad_norm": 11.3125, + "kl": 0.09126853942871094, + "learning_rate": 3.390132532222991e-06, + "logits/chosen": 1960998765.7142856, + "logits/rejected": 1711303338.6666667, + "logps/chosen": -340.1356724330357, + "logps/rejected": -456.57514105902777, + "loss": 0.1285, + "rewards/chosen": 1.6650814328874861, + "rewards/margins": 8.429196660480802, + "rewards/rejected": -6.764115227593316, + "step": 1654 + }, + { + "epoch": 0.610954732130497, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.3845617140995628e-06, + "logits/chosen": 2019018752.0, + "logits/rejected": 1380787584.0, + "logps/chosen": -225.12893676757812, + "logps/rejected": -434.8202819824219, + "loss": 0.1538, + "rewards/chosen": 1.7167807817459106, + "rewards/margins": 9.005027413368225, + "rewards/rejected": -7.2882466316223145, + "step": 1655 + }, + { + "epoch": 0.6113238890683402, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 3.3789931341453564e-06, + "logits/chosen": 1448777620.2105262, + "logits/rejected": 1633716381.5384614, + "logps/chosen": -307.2329872532895, + "logps/rejected": -416.1271409254808, + "loss": 0.1779, + "rewards/chosen": 1.6335481342516447, + "rewards/margins": 9.564255625612823, + "rewards/rejected": -7.9307074913611775, + "step": 1656 + }, + { + "epoch": 0.6116930460061833, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 3.373426800075569e-06, + "logits/chosen": 1740718694.4, + "logits/rejected": 1235025322.6666667, + "logps/chosen": -287.291650390625, + "logps/rejected": -446.6099039713542, + "loss": 0.2265, + "rewards/chosen": 0.8233596801757812, + "rewards/margins": 8.703475189208984, + "rewards/rejected": -7.880115509033203, + "step": 1657 + }, + { + "epoch": 0.6120622029440266, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 3.3678627196022827e-06, + "logits/chosen": 1721524906.6666667, + "logits/rejected": 2055355977.142857, + "logps/chosen": -222.07284884982639, + "logps/rejected": -490.21829659598217, + "loss": 0.1625, + "rewards/chosen": 1.2529555426703558, + "rewards/margins": 8.997759380037822, + "rewards/rejected": -7.744803837367466, + "step": 1658 + }, + { + "epoch": 0.6124313598818698, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 3.362300900434463e-06, + "logits/chosen": 1880364694.5882354, + "logits/rejected": 2040767965.8666666, + "logps/chosen": -290.33766084558823, + "logps/rejected": -591.4699869791667, + "loss": 0.147, + "rewards/chosen": 1.3073763005873735, + "rewards/margins": 12.93174051770977, + "rewards/rejected": -11.624364217122396, + "step": 1659 + }, + { + "epoch": 0.6128005168197129, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 3.35674135027794e-06, + "logits/chosen": 1492000452.9230769, + "logits/rejected": 1292173527.5789473, + "logps/chosen": -228.28032977764423, + "logps/rejected": -430.1819490131579, + "loss": 0.1323, + "rewards/chosen": 1.3155011397141676, + "rewards/margins": 8.086292444453065, + "rewards/rejected": -6.770791304738898, + "step": 1660 + }, + { + "epoch": 0.6131696737575562, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 3.3511840768353977e-06, + "logits/chosen": 1485744742.4, + "logits/rejected": 1840780629.3333333, + "logps/chosen": -240.580712890625, + "logps/rejected": -404.1775716145833, + "loss": 0.2058, + "rewards/chosen": 1.192998504638672, + "rewards/margins": 8.198807017008464, + "rewards/rejected": -7.005808512369792, + "step": 1661 + }, + { + "epoch": 0.6135388306953994, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 3.345629087806369e-06, + "logits/chosen": 2114884900.5714285, + "logits/rejected": 1701836913.7777777, + "logps/chosen": -279.2403564453125, + "logps/rejected": -632.4192708333334, + "loss": 0.1463, + "rewards/chosen": 1.160733904157366, + "rewards/margins": 11.181558760385665, + "rewards/rejected": -10.020824856228298, + "step": 1662 + }, + { + "epoch": 0.6139079876332426, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 3.3400763908872214e-06, + "logits/chosen": 2066354176.0, + "logits/rejected": 2520795028.2105265, + "logps/chosen": -267.02894005408655, + "logps/rejected": -414.5072985197368, + "loss": 0.1451, + "rewards/chosen": 1.066404709449181, + "rewards/margins": 7.925558858554856, + "rewards/rejected": -6.859154149105675, + "step": 1663 + }, + { + "epoch": 0.6142771445710857, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 3.3345259937711436e-06, + "logits/chosen": 1869146473.4117646, + "logits/rejected": 1772192017.0666666, + "logps/chosen": -278.11213235294116, + "logps/rejected": -488.10423177083334, + "loss": 0.1916, + "rewards/chosen": 0.978169833912569, + "rewards/margins": 9.892382887297986, + "rewards/rejected": -8.914213053385417, + "step": 1664 + }, + { + "epoch": 0.614646301508929, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 3.328977904148143e-06, + "logits/chosen": 2155303152.9411764, + "logits/rejected": 2330366225.0666666, + "logps/chosen": -280.27180032169116, + "logps/rejected": -442.84632161458336, + "loss": 0.1684, + "rewards/chosen": 1.1025946000043083, + "rewards/margins": 8.072788590075923, + "rewards/rejected": -6.970193990071615, + "step": 1665 + }, + { + "epoch": 0.6150154584467722, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 3.3234321297050264e-06, + "logits/chosen": 1362881457.2307692, + "logits/rejected": 2127862730.1052632, + "logps/chosen": -279.43241060697113, + "logps/rejected": -535.1716694078947, + "loss": 0.1472, + "rewards/chosen": 1.0266855680025542, + "rewards/margins": 9.101022372844248, + "rewards/rejected": -8.074336804841694, + "step": 1666 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.317888678125392e-06, + "logits/chosen": 1715399773.090909, + "logits/rejected": 2110989068.1904762, + "logps/chosen": -273.5418590198864, + "logps/rejected": -394.6907552083333, + "loss": 0.1084, + "rewards/chosen": 1.543008804321289, + "rewards/margins": 7.854798907325382, + "rewards/rejected": -6.311790103004093, + "step": 1667 + }, + { + "epoch": 0.6157537723224585, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 3.3123475570896246e-06, + "logits/chosen": 2192445952.0, + "logits/rejected": 1778131840.0, + "logps/chosen": -254.2170867919922, + "logps/rejected": -637.5602416992188, + "loss": 0.1761, + "rewards/chosen": 0.9179825186729431, + "rewards/margins": 11.66454166173935, + "rewards/rejected": -10.746559143066406, + "step": 1668 + }, + { + "epoch": 0.6161229292603018, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 3.3068087742748763e-06, + "logits/chosen": 2408355732.2105265, + "logits/rejected": 1454882185.8461537, + "logps/chosen": -276.99609375, + "logps/rejected": -526.1326622596154, + "loss": 0.1754, + "rewards/chosen": 1.1643919693796259, + "rewards/margins": 9.024329197068928, + "rewards/rejected": -7.8599372276893025, + "step": 1669 + }, + { + "epoch": 0.616492086198145, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 3.301272337355058e-06, + "logits/chosen": 1979430034.2857144, + "logits/rejected": 2045904440.8888888, + "logps/chosen": -295.3558872767857, + "logps/rejected": -474.37109375, + "loss": 0.1244, + "rewards/chosen": 1.447951180594308, + "rewards/margins": 11.041991945296997, + "rewards/rejected": -9.59404076470269, + "step": 1670 + }, + { + "epoch": 0.6168612431359882, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 3.295738254000835e-06, + "logits/chosen": 1805041078.857143, + "logits/rejected": 1825772450.909091, + "logps/chosen": -364.81640625, + "logps/rejected": -551.9992897727273, + "loss": 0.2478, + "rewards/chosen": 0.6506555648077101, + "rewards/margins": 10.366134635297767, + "rewards/rejected": -9.715479070490057, + "step": 1671 + }, + { + "epoch": 0.6172304000738313, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 3.2902065318796072e-06, + "logits/chosen": 1152821020.4444444, + "logits/rejected": 1570909915.4285715, + "logps/chosen": -213.51112196180554, + "logps/rejected": -369.6878138950893, + "loss": 0.167, + "rewards/chosen": 1.3174479802449544, + "rewards/margins": 8.449300266447521, + "rewards/rejected": -7.131852286202567, + "step": 1672 + }, + { + "epoch": 0.6175995570116746, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 3.2846771786555075e-06, + "logits/chosen": 2466765312.0, + "logits/rejected": 2394496000.0, + "logps/chosen": -234.5821533203125, + "logps/rejected": -473.2803649902344, + "loss": 0.1, + "rewards/chosen": 2.143615245819092, + "rewards/margins": 10.130607604980469, + "rewards/rejected": -7.986992359161377, + "step": 1673 + }, + { + "epoch": 0.6179687139495178, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 3.279150201989384e-06, + "logits/chosen": 1632577776.9411764, + "logits/rejected": 1923666466.1333334, + "logps/chosen": -271.71691176470586, + "logps/rejected": -463.56946614583336, + "loss": 0.1801, + "rewards/chosen": 1.055081423591165, + "rewards/margins": 8.769726809333353, + "rewards/rejected": -7.7146453857421875, + "step": 1674 + }, + { + "epoch": 0.618337870887361, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.2736256095387912e-06, + "logits/chosen": 1602015232.0, + "logits/rejected": 1785485994.6666667, + "logps/chosen": -259.06279296875, + "logps/rejected": -507.8134358723958, + "loss": 0.1754, + "rewards/chosen": 1.4135662078857423, + "rewards/margins": 9.649550374348959, + "rewards/rejected": -8.235984166463217, + "step": 1675 + }, + { + "epoch": 0.6187070278252041, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 3.2681034089579843e-06, + "logits/chosen": 1517040909.4736843, + "logits/rejected": 1721436947.6923077, + "logps/chosen": -303.10508326480266, + "logps/rejected": -594.3753756009615, + "loss": 0.1754, + "rewards/chosen": 1.301822260806435, + "rewards/margins": 9.423890225800426, + "rewards/rejected": -8.12206796499399, + "step": 1676 + }, + { + "epoch": 0.6190761847630474, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.2625836078979013e-06, + "logits/chosen": 1614337792.0, + "logits/rejected": 2033136512.0, + "logps/chosen": -250.39779663085938, + "logps/rejected": -511.2657165527344, + "loss": 0.1705, + "rewards/chosen": 1.4732367992401123, + "rewards/margins": 8.933802843093872, + "rewards/rejected": -7.46056604385376, + "step": 1677 + }, + { + "epoch": 0.6194453417008906, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 3.2570662140061543e-06, + "logits/chosen": 1564671784.4210527, + "logits/rejected": 1479717021.5384614, + "logps/chosen": -217.8938116776316, + "logps/rejected": -373.8444636418269, + "loss": 0.1569, + "rewards/chosen": 1.6969614530864514, + "rewards/margins": 9.428016492712352, + "rewards/rejected": -7.731055039625901, + "step": 1678 + }, + { + "epoch": 0.6198144986387338, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 3.251551234927025e-06, + "logits/chosen": 2061252931.368421, + "logits/rejected": 1972583660.3076923, + "logps/chosen": -340.6321957236842, + "logps/rejected": -522.5441706730769, + "loss": 0.2037, + "rewards/chosen": 0.8200723246524209, + "rewards/margins": 9.488633819919848, + "rewards/rejected": -8.668561495267427, + "step": 1679 + }, + { + "epoch": 0.620183655576577, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 3.2460386783014466e-06, + "logits/chosen": 1395049411.764706, + "logits/rejected": 1312176947.2, + "logps/chosen": -256.86609604779414, + "logps/rejected": -443.91455078125, + "loss": 0.1578, + "rewards/chosen": 1.5018219667322494, + "rewards/margins": 10.727863633398917, + "rewards/rejected": -9.226041666666667, + "step": 1680 + }, + { + "epoch": 0.6205528125144202, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 3.240528551766994e-06, + "logits/chosen": 1857399988.7058823, + "logits/rejected": 1813999889.0666666, + "logps/chosen": -298.77975643382354, + "logps/rejected": -385.6814778645833, + "loss": 0.1821, + "rewards/chosen": 1.34210766063017, + "rewards/margins": 8.893485020656211, + "rewards/rejected": -7.551377360026041, + "step": 1681 + }, + { + "epoch": 0.6209219694522634, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 3.2350208629578795e-06, + "logits/chosen": 2247623601.230769, + "logits/rejected": 1806591784.4210527, + "logps/chosen": -246.66229717548077, + "logps/rejected": -528.8346011513158, + "loss": 0.0893, + "rewards/chosen": 1.8404043637789214, + "rewards/margins": 10.799881035499727, + "rewards/rejected": -8.959476671720806, + "step": 1682 + }, + { + "epoch": 0.6212911263901066, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 3.229515619504936e-06, + "logits/chosen": 1820332662.1538463, + "logits/rejected": 1714031562.1052632, + "logps/chosen": -271.50704251802887, + "logps/rejected": -426.86012027138156, + "loss": 0.1416, + "rewards/chosen": 1.1421913733849158, + "rewards/margins": 8.337911845218798, + "rewards/rejected": -7.195720471833882, + "step": 1683 + }, + { + "epoch": 0.6216602833279498, + "grad_norm": 9.875, + "kl": 1.1440200805664062, + "learning_rate": 3.224012829035607e-06, + "logits/chosen": 2045953462.857143, + "logits/rejected": 2483917710.2222223, + "logps/chosen": -254.52403041294642, + "logps/rejected": -504.8468967013889, + "loss": 0.1252, + "rewards/chosen": 1.6369176592145647, + "rewards/margins": 11.385050152975415, + "rewards/rejected": -9.748132493760851, + "step": 1684 + }, + { + "epoch": 0.622029440265793, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 3.2185124991739406e-06, + "logits/chosen": 1901988750.2222223, + "logits/rejected": 1447574820.5714285, + "logps/chosen": -260.805908203125, + "logps/rejected": -440.1688755580357, + "loss": 0.1623, + "rewards/chosen": 1.1999361250135634, + "rewards/margins": 8.90965183197506, + "rewards/rejected": -7.709715706961496, + "step": 1685 + }, + { + "epoch": 0.6223985972036362, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 3.2130146375405748e-06, + "logits/chosen": 1365164909.7142856, + "logits/rejected": 1508327765.3333333, + "logps/chosen": -317.40415736607144, + "logps/rejected": -453.70513237847223, + "loss": 0.1308, + "rewards/chosen": 1.134988580431257, + "rewards/margins": 9.019631287408254, + "rewards/rejected": -7.884642706976996, + "step": 1686 + }, + { + "epoch": 0.6227677541414794, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 3.2075192517527233e-06, + "logits/chosen": 2086348416.0, + "logits/rejected": 1762430848.0, + "logps/chosen": -290.7536926269531, + "logps/rejected": -432.8097229003906, + "loss": 0.1378, + "rewards/chosen": 1.693253755569458, + "rewards/margins": 9.063836336135864, + "rewards/rejected": -7.370582580566406, + "step": 1687 + }, + { + "epoch": 0.6231369110793226, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.2020263494241757e-06, + "logits/chosen": 1656788736.0, + "logits/rejected": 1649222400.0, + "logps/chosen": -336.7756652832031, + "logps/rejected": -512.8466796875, + "loss": 0.1371, + "rewards/chosen": 1.5657343864440918, + "rewards/margins": 11.179041385650635, + "rewards/rejected": -9.613306999206543, + "step": 1688 + }, + { + "epoch": 0.6235060680171658, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.196535938165277e-06, + "logits/chosen": 1934576640.0, + "logits/rejected": 1820247936.0, + "logps/chosen": -230.32496643066406, + "logps/rejected": -457.3692626953125, + "loss": 0.1479, + "rewards/chosen": 1.679469347000122, + "rewards/margins": 9.68521237373352, + "rewards/rejected": -8.005743026733398, + "step": 1689 + }, + { + "epoch": 0.623875224955009, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 3.1910480255829235e-06, + "logits/chosen": 2056885632.0, + "logits/rejected": 1642409472.0, + "logps/chosen": -289.0788269042969, + "logps/rejected": -543.446533203125, + "loss": 0.1508, + "rewards/chosen": 1.2769770622253418, + "rewards/margins": 10.936142444610596, + "rewards/rejected": -9.659165382385254, + "step": 1690 + }, + { + "epoch": 0.6242443818928523, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 3.185562619280549e-06, + "logits/chosen": 1115299498.6666667, + "logits/rejected": 1543022387.2, + "logps/chosen": -194.36759440104166, + "logps/rejected": -430.013916015625, + "loss": 0.1252, + "rewards/chosen": 1.6609063148498535, + "rewards/margins": 8.837511539459229, + "rewards/rejected": -7.176605224609375, + "step": 1691 + }, + { + "epoch": 0.6246135388306954, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 3.1800797268581115e-06, + "logits/chosen": 1975555072.0, + "logits/rejected": 1877435392.0, + "logps/chosen": -265.1686197916667, + "logps/rejected": -487.069775390625, + "loss": 0.1247, + "rewards/chosen": 0.8555692036946615, + "rewards/margins": 8.785660298665364, + "rewards/rejected": -7.930091094970703, + "step": 1692 + }, + { + "epoch": 0.6249826957685386, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.174599355912092e-06, + "logits/chosen": 2115906560.0, + "logits/rejected": 3331036160.0, + "logps/chosen": -259.2759033203125, + "logps/rejected": -622.5422770182291, + "loss": 0.1701, + "rewards/chosen": 1.7016826629638673, + "rewards/margins": 10.799715932210287, + "rewards/rejected": -9.09803326924642, + "step": 1693 + }, + { + "epoch": 0.6253518527063818, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 3.169121514035473e-06, + "logits/chosen": 1138357760.0, + "logits/rejected": 1722837401.6, + "logps/chosen": -249.2686971028646, + "logps/rejected": -451.378955078125, + "loss": 0.0805, + "rewards/chosen": 2.008759339650472, + "rewards/margins": 10.794098695119223, + "rewards/rejected": -8.78533935546875, + "step": 1694 + }, + { + "epoch": 0.625721009644225, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 3.1636462088177345e-06, + "logits/chosen": 1883316539.0769231, + "logits/rejected": 1760025114.9473684, + "logps/chosen": -276.87558218149036, + "logps/rejected": -396.3931178042763, + "loss": 0.1237, + "rewards/chosen": 1.4049161764291616, + "rewards/margins": 9.46523818506403, + "rewards/rejected": -8.060322008634868, + "step": 1695 + }, + { + "epoch": 0.625721009644225, + "eval_kl": 0.0, + "eval_logits/chosen": 3492037783.8851676, + "eval_logits/rejected": 3515905117.090909, + "eval_logps/chosen": -291.6400082236842, + "eval_logps/rejected": -480.33512581168833, + "eval_loss": 0.1299210786819458, + "eval_rewards/chosen": 1.5374773381429425, + "eval_rewards/margins": 9.797945792214911, + "eval_rewards/rejected": -8.260468454071969, + "eval_runtime": 109.8037, + "eval_samples_per_second": 7.978, + "eval_steps_per_second": 0.501, + "step": 1695 + }, + { + "epoch": 0.6260901665820682, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 3.1581734478448447e-06, + "logits/chosen": 1790090791.3846154, + "logits/rejected": 1810853241.2631578, + "logps/chosen": -306.7212665264423, + "logps/rejected": -465.09159128289474, + "loss": 0.1083, + "rewards/chosen": 1.6189005925105169, + "rewards/margins": 9.744036315423758, + "rewards/rejected": -8.12513572291324, + "step": 1696 + }, + { + "epoch": 0.6264593235199114, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 3.152703238699242e-06, + "logits/chosen": 1535022464.0, + "logits/rejected": 1646643584.0, + "logps/chosen": -312.5274658203125, + "logps/rejected": -479.5491943359375, + "loss": 0.1413, + "rewards/chosen": 1.3774850368499756, + "rewards/margins": 9.659639120101929, + "rewards/rejected": -8.282154083251953, + "step": 1697 + }, + { + "epoch": 0.6268284804577546, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 3.147235588959831e-06, + "logits/chosen": 2542973201.0666666, + "logits/rejected": 1595657999.0588236, + "logps/chosen": -322.2865885416667, + "logps/rejected": -721.1806066176471, + "loss": 0.1433, + "rewards/chosen": 1.4778574625651042, + "rewards/margins": 11.68929491230086, + "rewards/rejected": -10.211437449735755, + "step": 1698 + }, + { + "epoch": 0.6271976373955978, + "grad_norm": 12.1875, + "kl": 0.9220037460327148, + "learning_rate": 3.1417705062019742e-06, + "logits/chosen": 1942862794.1052632, + "logits/rejected": 1695545028.9230769, + "logps/chosen": -301.3944541529605, + "logps/rejected": -443.4700270432692, + "loss": 0.155, + "rewards/chosen": 1.8510730141087581, + "rewards/margins": 8.896369903193794, + "rewards/rejected": -7.045296889085036, + "step": 1699 + }, + { + "epoch": 0.627566794333441, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 3.136307997997472e-06, + "logits/chosen": 2189451008.0, + "logits/rejected": 2053007616.0, + "logps/chosen": -331.4158630371094, + "logps/rejected": -454.6242370605469, + "loss": 0.1964, + "rewards/chosen": 0.53312087059021, + "rewards/margins": 10.105805158615112, + "rewards/rejected": -9.572684288024902, + "step": 1700 + }, + { + "epoch": 0.6279359512712842, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 3.1308480719145594e-06, + "logits/chosen": 1826587306.6666667, + "logits/rejected": 1461836068.5714285, + "logps/chosen": -304.9659830729167, + "logps/rejected": -413.19981166294644, + "loss": 0.1276, + "rewards/chosen": 2.100526385837131, + "rewards/margins": 8.926547852773515, + "rewards/rejected": -6.826021466936384, + "step": 1701 + }, + { + "epoch": 0.6283051082091274, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 3.125390735517898e-06, + "logits/chosen": 1306806954.6666667, + "logits/rejected": 2164630949.647059, + "logps/chosen": -230.56248372395834, + "logps/rejected": -586.6368336397059, + "loss": 0.103, + "rewards/chosen": 1.9168128967285156, + "rewards/margins": 11.36042449053596, + "rewards/rejected": -9.443611593807445, + "step": 1702 + }, + { + "epoch": 0.6286742651469706, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 3.119935996368556e-06, + "logits/chosen": 1728719450.3529413, + "logits/rejected": 1985413802.6666667, + "logps/chosen": -334.2109375, + "logps/rejected": -469.477734375, + "loss": 0.1405, + "rewards/chosen": 1.5352252511417164, + "rewards/margins": 8.756512279136508, + "rewards/rejected": -7.221287027994792, + "step": 1703 + }, + { + "epoch": 0.6290434220848138, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.1144838620240038e-06, + "logits/chosen": 1809598008.8888888, + "logits/rejected": 1754192310.857143, + "logps/chosen": -262.4754231770833, + "logps/rejected": -388.09877232142856, + "loss": 0.1523, + "rewards/chosen": 1.7008012135823567, + "rewards/margins": 9.727014723278227, + "rewards/rejected": -8.02621350969587, + "step": 1704 + }, + { + "epoch": 0.629412579022657, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 3.109034340038106e-06, + "logits/chosen": 1729962211.5555556, + "logits/rejected": 1730806345.142857, + "logps/chosen": -254.44390190972223, + "logps/rejected": -517.7466169084821, + "loss": 0.1313, + "rewards/chosen": 1.9145052168104384, + "rewards/margins": 11.397748886592804, + "rewards/rejected": -9.483243669782366, + "step": 1705 + }, + { + "epoch": 0.6297817359605002, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 3.103587437961104e-06, + "logits/chosen": 1857832686.9333334, + "logits/rejected": 2263682831.0588236, + "logps/chosen": -259.82643229166666, + "logps/rejected": -817.5048828125, + "loss": 0.1441, + "rewards/chosen": 1.3412460327148437, + "rewards/margins": 13.514742952234606, + "rewards/rejected": -12.173496919519762, + "step": 1706 + }, + { + "epoch": 0.6301508928983434, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 3.0981431633396153e-06, + "logits/chosen": 1915028918.857143, + "logits/rejected": 1636607544.8888888, + "logps/chosen": -174.4471435546875, + "logps/rejected": -508.09814453125, + "loss": 0.1374, + "rewards/chosen": 1.3117670331682478, + "rewards/margins": 10.04846645536877, + "rewards/rejected": -8.736699422200521, + "step": 1707 + }, + { + "epoch": 0.6305200498361866, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 3.0927015237166104e-06, + "logits/chosen": 1498749400.6153846, + "logits/rejected": 3043003984.8421054, + "logps/chosen": -271.8415339543269, + "logps/rejected": -536.6656044407895, + "loss": 0.1298, + "rewards/chosen": 1.2310983217679536, + "rewards/margins": 8.979904120750273, + "rewards/rejected": -7.748805798982319, + "step": 1708 + }, + { + "epoch": 0.6308892067740298, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 3.0872625266314104e-06, + "logits/chosen": 1518927149.1764705, + "logits/rejected": 1475433676.8, + "logps/chosen": -319.3023897058824, + "logps/rejected": -590.1461588541666, + "loss": 0.1554, + "rewards/chosen": 1.5672034095315373, + "rewards/margins": 10.297811523138307, + "rewards/rejected": -8.73060811360677, + "step": 1709 + }, + { + "epoch": 0.631258363711873, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.081826179619681e-06, + "logits/chosen": 1593665851.0769231, + "logits/rejected": 1475822645.8947368, + "logps/chosen": -323.49207481971155, + "logps/rejected": -451.6514185855263, + "loss": 0.1444, + "rewards/chosen": 0.8700519708486704, + "rewards/margins": 9.09000179352548, + "rewards/rejected": -8.21994982267681, + "step": 1710 + }, + { + "epoch": 0.6316275206497162, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 3.076392490213409e-06, + "logits/chosen": 1529107046.4, + "logits/rejected": 1448901073.4545455, + "logps/chosen": -183.67763671875, + "logps/rejected": -507.54354580965907, + "loss": 0.1099, + "rewards/chosen": 1.062927532196045, + "rewards/margins": 10.644964330846614, + "rewards/rejected": -9.582036798650568, + "step": 1711 + }, + { + "epoch": 0.6319966775875594, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 3.0709614659409013e-06, + "logits/chosen": 1662172321.6842105, + "logits/rejected": 2042915603.6923077, + "logps/chosen": -204.6530119243421, + "logps/rejected": -502.40640024038464, + "loss": 0.1304, + "rewards/chosen": 1.866694400185033, + "rewards/margins": 11.246755623141762, + "rewards/rejected": -9.38006122295673, + "step": 1712 + }, + { + "epoch": 0.6323658345254026, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 3.0655331143267758e-06, + "logits/chosen": 2089483264.0, + "logits/rejected": 1788932352.0, + "logps/chosen": -241.79959106445312, + "logps/rejected": -526.4071044921875, + "loss": 0.1497, + "rewards/chosen": 1.3191936016082764, + "rewards/margins": 10.339909315109253, + "rewards/rejected": -9.020715713500977, + "step": 1713 + }, + { + "epoch": 0.6327349914632459, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 3.060107442891943e-06, + "logits/chosen": 1188669326.2222223, + "logits/rejected": 1478683940.5714285, + "logps/chosen": -186.76203070746527, + "logps/rejected": -566.9585309709821, + "loss": 0.1289, + "rewards/chosen": 2.146583769056532, + "rewards/margins": 10.502802924504355, + "rewards/rejected": -8.356219155447823, + "step": 1714 + }, + { + "epoch": 0.633104148401089, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 3.054684459153601e-06, + "logits/chosen": 1577397043.2, + "logits/rejected": 1619033389.1764705, + "logps/chosen": -228.72063802083332, + "logps/rejected": -482.39573759191177, + "loss": 0.1191, + "rewards/chosen": 2.0886881510416666, + "rewards/margins": 9.200836660347733, + "rewards/rejected": -7.112148509306066, + "step": 1715 + }, + { + "epoch": 0.6334733053389322, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 3.049264170625227e-06, + "logits/chosen": 1849639321.6, + "logits/rejected": 2335815509.3333335, + "logps/chosen": -296.333251953125, + "logps/rejected": -500.8224690755208, + "loss": 0.2006, + "rewards/chosen": 1.1095999717712401, + "rewards/margins": 10.591076437632243, + "rewards/rejected": -9.481476465861002, + "step": 1716 + }, + { + "epoch": 0.6338424622767754, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 3.043846584816561e-06, + "logits/chosen": 1842557952.0, + "logits/rejected": 1966346695.1111112, + "logps/chosen": -297.67103794642856, + "logps/rejected": -579.2967664930555, + "loss": 0.1531, + "rewards/chosen": 1.0386837550571986, + "rewards/margins": 9.73711413428897, + "rewards/rejected": -8.698430379231771, + "step": 1717 + }, + { + "epoch": 0.6342116192146187, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 3.038431709233598e-06, + "logits/chosen": 1621928779.2941177, + "logits/rejected": 2075183786.6666667, + "logps/chosen": -266.59616268382354, + "logps/rejected": -518.5646809895833, + "loss": 0.189, + "rewards/chosen": 1.1755582024069393, + "rewards/margins": 8.111580642999387, + "rewards/rejected": -6.936022440592448, + "step": 1718 + }, + { + "epoch": 0.6345807761524618, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 3.033019551378581e-06, + "logits/chosen": 1943437539.5555556, + "logits/rejected": 1520295789.7142856, + "logps/chosen": -225.80391438802084, + "logps/rejected": -432.38832310267856, + "loss": 0.1715, + "rewards/chosen": 1.412443584865994, + "rewards/margins": 7.87001107231019, + "rewards/rejected": -6.457567487444196, + "step": 1719 + }, + { + "epoch": 0.634949933090305, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 3.0276101187499864e-06, + "logits/chosen": 1549416960.0, + "logits/rejected": 1677546752.0, + "logps/chosen": -294.03875732421875, + "logps/rejected": -466.393310546875, + "loss": 0.1737, + "rewards/chosen": 1.0645500421524048, + "rewards/margins": 9.73201048374176, + "rewards/rejected": -8.667460441589355, + "step": 1720 + }, + { + "epoch": 0.6353190900281482, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 3.022203418842512e-06, + "logits/chosen": 1359144329.8461537, + "logits/rejected": 1078347883.7894738, + "logps/chosen": -214.98775540865384, + "logps/rejected": -461.64334909539474, + "loss": 0.1486, + "rewards/chosen": 1.2513307424692006, + "rewards/margins": 8.642611870398888, + "rewards/rejected": -7.3912811279296875, + "step": 1721 + }, + { + "epoch": 0.6356882469659915, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 3.016799459147074e-06, + "logits/chosen": 2887582720.0, + "logits/rejected": 2016151961.6, + "logps/chosen": -197.77655029296875, + "logps/rejected": -561.439892578125, + "loss": 0.1098, + "rewards/chosen": 1.527806282043457, + "rewards/margins": 10.421476936340332, + "rewards/rejected": -8.893670654296875, + "step": 1722 + }, + { + "epoch": 0.6360574039038346, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 3.0113982471507873e-06, + "logits/chosen": 1505564792.4705882, + "logits/rejected": 1864562824.5333333, + "logps/chosen": -261.7475011488971, + "logps/rejected": -556.6546875, + "loss": 0.1811, + "rewards/chosen": 0.9186405855066636, + "rewards/margins": 9.376309652889477, + "rewards/rejected": -8.457669067382813, + "step": 1723 + }, + { + "epoch": 0.6364265608416778, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 3.0059997903369658e-06, + "logits/chosen": 2105681317.6470587, + "logits/rejected": 1457846545.0666666, + "logps/chosen": -284.77783203125, + "logps/rejected": -432.8744140625, + "loss": 0.1881, + "rewards/chosen": 1.101176093606388, + "rewards/margins": 7.455071662454045, + "rewards/rejected": -6.353895568847657, + "step": 1724 + }, + { + "epoch": 0.636795717779521, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.0006040961851014e-06, + "logits/chosen": 1686932608.0, + "logits/rejected": 2457115392.0, + "logps/chosen": -336.6414794921875, + "logps/rejected": -454.3401794433594, + "loss": 0.1647, + "rewards/chosen": 1.2766751050949097, + "rewards/margins": 9.276304125785828, + "rewards/rejected": -7.999629020690918, + "step": 1725 + }, + { + "epoch": 0.6371648747173643, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.9952111721708576e-06, + "logits/chosen": 1724031590.4, + "logits/rejected": 1941282986.6666667, + "logps/chosen": -174.42364501953125, + "logps/rejected": -487.356689453125, + "loss": 0.1944, + "rewards/chosen": 1.0539214134216308, + "rewards/margins": 9.046721808115642, + "rewards/rejected": -7.992800394694011, + "step": 1726 + }, + { + "epoch": 0.6375340316552074, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.9898210257660664e-06, + "logits/chosen": 1453423820.8, + "logits/rejected": 1498183559.5294118, + "logps/chosen": -278.6811197916667, + "logps/rejected": -531.4298023897059, + "loss": 0.1172, + "rewards/chosen": 1.8685840606689452, + "rewards/margins": 10.430130835140453, + "rewards/rejected": -8.561546774471507, + "step": 1727 + }, + { + "epoch": 0.6379031885930506, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 2.984433664438704e-06, + "logits/chosen": 1609055982.9333334, + "logits/rejected": 1810584274.8235295, + "logps/chosen": -358.8484700520833, + "logps/rejected": -447.06565946691177, + "loss": 0.1586, + "rewards/chosen": 1.4974717458089193, + "rewards/margins": 8.38639539082845, + "rewards/rejected": -6.888923645019531, + "step": 1728 + }, + { + "epoch": 0.6382723455308938, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 2.979049095652892e-06, + "logits/chosen": 1849931873.5238094, + "logits/rejected": 1998822306.909091, + "logps/chosen": -259.44805617559524, + "logps/rejected": -444.55495383522725, + "loss": 0.1681, + "rewards/chosen": 1.6155709766206288, + "rewards/margins": 8.270301339946268, + "rewards/rejected": -6.65473036332564, + "step": 1729 + }, + { + "epoch": 0.6386415024687371, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.973667326868882e-06, + "logits/chosen": 1669544448.0, + "logits/rejected": 1706386304.0, + "logps/chosen": -226.70619201660156, + "logps/rejected": -612.3822021484375, + "loss": 0.1866, + "rewards/chosen": 0.6698537468910217, + "rewards/margins": 10.657772600650787, + "rewards/rejected": -9.987918853759766, + "step": 1730 + }, + { + "epoch": 0.6390106594065802, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 2.968288365543047e-06, + "logits/chosen": 2062155483.4285715, + "logits/rejected": 2071139214.2222223, + "logps/chosen": -282.76048060825894, + "logps/rejected": -515.2914496527778, + "loss": 0.1123, + "rewards/chosen": 1.570793969290597, + "rewards/margins": 10.144247236705962, + "rewards/rejected": -8.573453267415365, + "step": 1731 + }, + { + "epoch": 0.6393798163444234, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.9629122191278677e-06, + "logits/chosen": 2122650419.2, + "logits/rejected": 1804599898.3529413, + "logps/chosen": -326.0494140625, + "logps/rejected": -595.8511029411765, + "loss": 0.1057, + "rewards/chosen": 1.7886093139648438, + "rewards/margins": 10.544834002326517, + "rewards/rejected": -8.756224688361673, + "step": 1732 + }, + { + "epoch": 0.6397489732822667, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 2.9575388950719286e-06, + "logits/chosen": 1493281996.8, + "logits/rejected": 1390504056.4705882, + "logps/chosen": -313.9703776041667, + "logps/rejected": -470.6578584558824, + "loss": 0.1505, + "rewards/chosen": 1.0185026168823241, + "rewards/margins": 8.703419393651625, + "rewards/rejected": -7.684916776769302, + "step": 1733 + }, + { + "epoch": 0.6401181302201098, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 2.9521684008199012e-06, + "logits/chosen": 1817771349.3333333, + "logits/rejected": 2193218620.2352943, + "logps/chosen": -291.75361328125, + "logps/rejected": -385.39137178308823, + "loss": 0.1381, + "rewards/chosen": 1.4159891764322916, + "rewards/margins": 8.620478491689646, + "rewards/rejected": -7.204489315257353, + "step": 1734 + }, + { + "epoch": 0.640487287157953, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 2.946800743812537e-06, + "logits/chosen": 1400013430.1538463, + "logits/rejected": 1372715546.9473684, + "logps/chosen": -272.0937312199519, + "logps/rejected": -424.3016293174342, + "loss": 0.1469, + "rewards/chosen": 0.8261612378633939, + "rewards/margins": 8.196255938726882, + "rewards/rejected": -7.370094700863487, + "step": 1735 + }, + { + "epoch": 0.6408564440957962, + "grad_norm": 14.4375, + "kl": 0.026165008544921875, + "learning_rate": 2.941435931486656e-06, + "logits/chosen": 1241421619.2, + "logits/rejected": 1248436736.0, + "logps/chosen": -268.7498046875, + "logps/rejected": -337.2467854817708, + "loss": 0.2112, + "rewards/chosen": 1.1999575614929199, + "rewards/margins": 8.022062333424886, + "rewards/rejected": -6.822104771931966, + "step": 1736 + }, + { + "epoch": 0.6412256010336395, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 2.9360739712751394e-06, + "logits/chosen": 1703388160.0, + "logits/rejected": 1621348249.6, + "logps/chosen": -228.9038289388021, + "logps/rejected": -437.64453125, + "loss": 0.1058, + "rewards/chosen": 1.549069881439209, + "rewards/margins": 10.511331844329835, + "rewards/rejected": -8.962261962890626, + "step": 1737 + }, + { + "epoch": 0.6415947579714826, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.9307148706069145e-06, + "logits/chosen": 1518766682.3529413, + "logits/rejected": 1679217732.2666667, + "logps/chosen": -234.90064912683823, + "logps/rejected": -526.574609375, + "loss": 0.1461, + "rewards/chosen": 1.8097980723661535, + "rewards/margins": 9.064565935321882, + "rewards/rejected": -7.254767862955729, + "step": 1738 + }, + { + "epoch": 0.6419639149093258, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 2.9253586369069447e-06, + "logits/chosen": 1437202204.4444444, + "logits/rejected": 1861387849.142857, + "logps/chosen": -232.45366753472223, + "logps/rejected": -430.4017857142857, + "loss": 0.1524, + "rewards/chosen": 1.6615893046061199, + "rewards/margins": 7.624254862467448, + "rewards/rejected": -5.962665557861328, + "step": 1739 + }, + { + "epoch": 0.642333071847169, + "grad_norm": 12.1875, + "kl": 1.0651836395263672, + "learning_rate": 2.920005277596225e-06, + "logits/chosen": 1410561560.3809524, + "logits/rejected": 1518172625.4545455, + "logps/chosen": -235.15783110119048, + "logps/rejected": -387.80166903409093, + "loss": 0.195, + "rewards/chosen": 2.1221451532273066, + "rewards/margins": 8.652567710711326, + "rewards/rejected": -6.53042255748402, + "step": 1740 + }, + { + "epoch": 0.6427022287850123, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 2.914654800091768e-06, + "logits/chosen": 1439663854.9333334, + "logits/rejected": 1393218138.3529413, + "logps/chosen": -326.56529947916664, + "logps/rejected": -457.75390625, + "loss": 0.1361, + "rewards/chosen": 1.4508090972900392, + "rewards/margins": 9.367051808974322, + "rewards/rejected": -7.9162427116842835, + "step": 1741 + }, + { + "epoch": 0.6430713857228554, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 2.9093072118065903e-06, + "logits/chosen": 2171099648.0, + "logits/rejected": 1984760960.0, + "logps/chosen": -307.12799072265625, + "logps/rejected": -437.9398498535156, + "loss": 0.1493, + "rewards/chosen": 1.3458000421524048, + "rewards/margins": 8.126190304756165, + "rewards/rejected": -6.78039026260376, + "step": 1742 + }, + { + "epoch": 0.6434405426606986, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 2.9039625201497106e-06, + "logits/chosen": 1682469432.8888888, + "logits/rejected": 2116641645.7142856, + "logps/chosen": -238.043701171875, + "logps/rejected": -414.85117885044644, + "loss": 0.1731, + "rewards/chosen": 1.0674418343438044, + "rewards/margins": 8.218233547513448, + "rewards/rejected": -7.150791713169643, + "step": 1743 + }, + { + "epoch": 0.6438096995985418, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.8986207325261272e-06, + "logits/chosen": 1588726362.3529413, + "logits/rejected": 1637277832.5333333, + "logps/chosen": -254.1457088694853, + "logps/rejected": -514.6052083333333, + "loss": 0.1867, + "rewards/chosen": 1.245274824254653, + "rewards/margins": 7.324071719599705, + "rewards/rejected": -6.078796895345052, + "step": 1744 + }, + { + "epoch": 0.6441788565363851, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 2.893281856336822e-06, + "logits/chosen": 1421459298.4615386, + "logits/rejected": 2175275008.0, + "logps/chosen": -218.2535682091346, + "logps/rejected": -428.2290852864583, + "loss": 0.2532, + "rewards/chosen": 1.2354866908146784, + "rewards/margins": 7.251517320290589, + "rewards/rejected": -6.016030629475911, + "step": 1745 + }, + { + "epoch": 0.6445480134742282, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 2.887945898978741e-06, + "logits/chosen": 1450779940.5714285, + "logits/rejected": 1396881749.3333333, + "logps/chosen": -218.02373395647322, + "logps/rejected": -491.47650824652777, + "loss": 0.1145, + "rewards/chosen": 1.6349485942295618, + "rewards/margins": 10.537491556197876, + "rewards/rejected": -8.902542961968315, + "step": 1746 + }, + { + "epoch": 0.6449171704120714, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 2.8826128678447806e-06, + "logits/chosen": 1626131176.7272727, + "logits/rejected": 1910597632.0, + "logps/chosen": -243.8557794744318, + "logps/rejected": -487.12255859375, + "loss": 0.2014, + "rewards/chosen": 1.2226325815374202, + "rewards/margins": 8.054589063471013, + "rewards/rejected": -6.831956481933593, + "step": 1747 + }, + { + "epoch": 0.6452863273499146, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 2.8772827703237914e-06, + "logits/chosen": 1484805760.0, + "logits/rejected": 1679886592.0, + "logps/chosen": -278.3089599609375, + "logps/rejected": -437.4908447265625, + "loss": 0.1448, + "rewards/chosen": 1.5581783056259155, + "rewards/margins": 8.487257838249207, + "rewards/rejected": -6.929079532623291, + "step": 1748 + }, + { + "epoch": 0.6456554842877579, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.871955613800557e-06, + "logits/chosen": 1600315392.0, + "logits/rejected": 1680333004.8, + "logps/chosen": -252.0072576349432, + "logps/rejected": -519.589404296875, + "loss": 0.1465, + "rewards/chosen": 1.9806114543568005, + "rewards/margins": 8.743281589854847, + "rewards/rejected": -6.7626701354980465, + "step": 1749 + }, + { + "epoch": 0.646024641225601, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.8666314056557815e-06, + "logits/chosen": 1639986744.8888888, + "logits/rejected": 1401422848.0, + "logps/chosen": -290.07237413194446, + "logps/rejected": -557.7448381696429, + "loss": 0.143, + "rewards/chosen": 1.6530928081936307, + "rewards/margins": 10.05908885834709, + "rewards/rejected": -8.40599605015346, + "step": 1750 + }, + { + "epoch": 0.6463937981634442, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 2.8613101532660894e-06, + "logits/chosen": 1957819792.6956522, + "logits/rejected": 2025136583.1111112, + "logps/chosen": -323.31216032608694, + "logps/rejected": -488.0973849826389, + "loss": 0.2222, + "rewards/chosen": 1.295059287029764, + "rewards/margins": 8.96638920917603, + "rewards/rejected": -7.671329922146267, + "step": 1751 + }, + { + "epoch": 0.6467629551012875, + "grad_norm": 10.25, + "kl": 0.1228790283203125, + "learning_rate": 2.85599186400401e-06, + "logits/chosen": 1503412736.0, + "logits/rejected": 2080323328.0, + "logps/chosen": -238.78848266601562, + "logps/rejected": -579.555419921875, + "loss": 0.135, + "rewards/chosen": 2.1247544288635254, + "rewards/margins": 9.694404125213623, + "rewards/rejected": -7.569649696350098, + "step": 1752 + }, + { + "epoch": 0.6471321120391307, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 2.8506765452379604e-06, + "logits/chosen": 1743120707.368421, + "logits/rejected": 1839980071.3846154, + "logps/chosen": -363.9820106907895, + "logps/rejected": -422.5124323918269, + "loss": 0.1886, + "rewards/chosen": 1.1062774658203125, + "rewards/margins": 8.77898700420673, + "rewards/rejected": -7.672709538386418, + "step": 1753 + }, + { + "epoch": 0.6475012689769738, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.8453642043322517e-06, + "logits/chosen": 1610771797.3333333, + "logits/rejected": 1926180864.0, + "logps/chosen": -249.49176025390625, + "logps/rejected": -469.95048828125, + "loss": 0.1099, + "rewards/chosen": 1.3058868249257405, + "rewards/margins": 10.19219511349996, + "rewards/rejected": -8.886308288574218, + "step": 1754 + }, + { + "epoch": 0.647870425914817, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 2.8400548486470657e-06, + "logits/chosen": 2026788756.2105262, + "logits/rejected": 2121689718.1538463, + "logps/chosen": -231.2956414473684, + "logps/rejected": -584.2376051682693, + "loss": 0.1816, + "rewards/chosen": 1.1849013880679482, + "rewards/margins": 9.702947253640364, + "rewards/rejected": -8.518045865572416, + "step": 1755 + }, + { + "epoch": 0.6482395828526603, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.834748485538444e-06, + "logits/chosen": 1966098904.6153846, + "logits/rejected": 1581342181.0526316, + "logps/chosen": -229.25392503004807, + "logps/rejected": -433.3120888157895, + "loss": 0.119, + "rewards/chosen": 2.34921382023738, + "rewards/margins": 9.399209644147742, + "rewards/rejected": -7.049995823910362, + "step": 1756 + }, + { + "epoch": 0.6486087397905035, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.829445122358285e-06, + "logits/chosen": 2322706841.6, + "logits/rejected": 2196023978.6666665, + "logps/chosen": -248.18466796875, + "logps/rejected": -450.963134765625, + "loss": 0.1784, + "rewards/chosen": 1.3760186195373536, + "rewards/margins": 8.374096012115478, + "rewards/rejected": -6.998077392578125, + "step": 1757 + }, + { + "epoch": 0.6489778967283466, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.824144766454333e-06, + "logits/chosen": 1939325952.0, + "logits/rejected": 1755870822.4, + "logps/chosen": -297.0069986979167, + "logps/rejected": -485.7572265625, + "loss": 0.0778, + "rewards/chosen": 2.3295761744181314, + "rewards/margins": 9.803959719340007, + "rewards/rejected": -7.474383544921875, + "step": 1758 + }, + { + "epoch": 0.6493470536661898, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 2.8188474251701647e-06, + "logits/chosen": 1646267090.8235295, + "logits/rejected": 1304236578.1333334, + "logps/chosen": -319.97443704044116, + "logps/rejected": -458.5048502604167, + "loss": 0.1456, + "rewards/chosen": 1.5487974952248966, + "rewards/margins": 9.317224517523075, + "rewards/rejected": -7.7684270222981775, + "step": 1759 + }, + { + "epoch": 0.6497162106040331, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.8135531058451746e-06, + "logits/chosen": 1849215658.6666667, + "logits/rejected": 1755812659.2, + "logps/chosen": -308.88242594401044, + "logps/rejected": -443.1197265625, + "loss": 0.1232, + "rewards/chosen": 1.032127857208252, + "rewards/margins": 9.011654376983643, + "rewards/rejected": -7.979526519775391, + "step": 1760 + }, + { + "epoch": 0.6500853675418763, + "grad_norm": 14.125, + "kl": 0.19447994232177734, + "learning_rate": 2.8082618158145792e-06, + "logits/chosen": 1778281472.0, + "logits/rejected": 1562170026.6666667, + "logps/chosen": -282.419921875, + "logps/rejected": -410.0982259114583, + "loss": 0.1719, + "rewards/chosen": 1.660873031616211, + "rewards/margins": 10.095853424072265, + "rewards/rejected": -8.434980392456055, + "step": 1761 + }, + { + "epoch": 0.6504545244797194, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.8029735624093936e-06, + "logits/chosen": 2212414720.0, + "logits/rejected": 2238866688.0, + "logps/chosen": -271.9280090332031, + "logps/rejected": -445.75616455078125, + "loss": 0.1317, + "rewards/chosen": 1.4081871509552002, + "rewards/margins": 9.355467081069946, + "rewards/rejected": -7.947279930114746, + "step": 1762 + }, + { + "epoch": 0.6508236814175626, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 2.7976883529564226e-06, + "logits/chosen": 2048523093.3333333, + "logits/rejected": 2126595686.4, + "logps/chosen": -217.1915283203125, + "logps/rejected": -476.541650390625, + "loss": 0.0883, + "rewards/chosen": 2.0120860735575357, + "rewards/margins": 9.227694288889566, + "rewards/rejected": -7.215608215332031, + "step": 1763 + }, + { + "epoch": 0.6511928383554059, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 2.7924061947782576e-06, + "logits/chosen": 1548863146.6666667, + "logits/rejected": 2312828518.4, + "logps/chosen": -322.1246744791667, + "logps/rejected": -491.530322265625, + "loss": 0.0964, + "rewards/chosen": 1.5321962038675945, + "rewards/margins": 10.890319220225015, + "rewards/rejected": -9.358123016357421, + "step": 1764 + }, + { + "epoch": 0.6515619952932491, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 2.7871270951932655e-06, + "logits/chosen": 1460262297.6, + "logits/rejected": 1499130880.0, + "logps/chosen": -227.08955078125, + "logps/rejected": -453.4925130208333, + "loss": 0.1755, + "rewards/chosen": 1.2326203346252442, + "rewards/margins": 10.064694817860921, + "rewards/rejected": -8.832074483235678, + "step": 1765 + }, + { + "epoch": 0.6519311522310922, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 2.7818510615155667e-06, + "logits/chosen": 2071357212.4444444, + "logits/rejected": 1495302144.0, + "logps/chosen": -309.7990993923611, + "logps/rejected": -426.55552455357144, + "loss": 0.1615, + "rewards/chosen": 1.7406115002102323, + "rewards/margins": 8.646820628453815, + "rewards/rejected": -6.9062091282435825, + "step": 1766 + }, + { + "epoch": 0.6523003091689354, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 2.776578101055041e-06, + "logits/chosen": 2932167680.0, + "logits/rejected": 1887281280.0, + "logps/chosen": -289.66436767578125, + "logps/rejected": -437.18310546875, + "loss": 0.1328, + "rewards/chosen": 1.6232454776763916, + "rewards/margins": 8.70189881324768, + "rewards/rejected": -7.078653335571289, + "step": 1767 + }, + { + "epoch": 0.6526694661067787, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.771308221117309e-06, + "logits/chosen": 1698562993.2307692, + "logits/rejected": 1912679801.2631578, + "logps/chosen": -257.4043532151442, + "logps/rejected": -425.8418739720395, + "loss": 0.1156, + "rewards/chosen": 1.303918985220102, + "rewards/margins": 8.452072992981204, + "rewards/rejected": -7.148154007761102, + "step": 1768 + }, + { + "epoch": 0.6530386230446218, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 2.7660414290037203e-06, + "logits/chosen": 1576948480.0, + "logits/rejected": 1925420544.0, + "logps/chosen": -259.84942626953125, + "logps/rejected": -525.4083251953125, + "loss": 0.1591, + "rewards/chosen": 1.127535343170166, + "rewards/margins": 9.890512943267822, + "rewards/rejected": -8.762977600097656, + "step": 1769 + }, + { + "epoch": 0.653407779982465, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 2.7607777320113494e-06, + "logits/chosen": 1439648870.4, + "logits/rejected": 1736763050.6666667, + "logps/chosen": -238.5429443359375, + "logps/rejected": -354.5942789713542, + "loss": 0.182, + "rewards/chosen": 1.7514907836914062, + "rewards/margins": 6.631125005086263, + "rewards/rejected": -4.8796342213948565, + "step": 1770 + }, + { + "epoch": 0.6537769369203082, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 2.7555171374329837e-06, + "logits/chosen": 1932329369.6, + "logits/rejected": 2263987029.3333335, + "logps/chosen": -295.6961181640625, + "logps/rejected": -480.8057047526042, + "loss": 0.1883, + "rewards/chosen": 1.2162141799926758, + "rewards/margins": 9.434388160705566, + "rewards/rejected": -8.21817398071289, + "step": 1771 + }, + { + "epoch": 0.6541460938581515, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.750259652557108e-06, + "logits/chosen": 1669025792.0, + "logits/rejected": 1351208487.3846154, + "logps/chosen": -278.5795127467105, + "logps/rejected": -452.48324819711536, + "loss": 0.1533, + "rewards/chosen": 1.666694440339741, + "rewards/margins": 10.010740812973456, + "rewards/rejected": -8.344046372633715, + "step": 1772 + }, + { + "epoch": 0.6545152507959946, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.7450052846678987e-06, + "logits/chosen": 2418753536.0, + "logits/rejected": 2141584213.3333333, + "logps/chosen": -249.714892578125, + "logps/rejected": -467.9006754557292, + "loss": 0.1901, + "rewards/chosen": 1.1067865371704102, + "rewards/margins": 10.415618069966635, + "rewards/rejected": -9.308831532796225, + "step": 1773 + }, + { + "epoch": 0.6548844077338378, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 2.7397540410452206e-06, + "logits/chosen": 1712186221.7142856, + "logits/rejected": 1981263416.8888888, + "logps/chosen": -211.16264997209822, + "logps/rejected": -525.6368815104166, + "loss": 0.1059, + "rewards/chosen": 2.2943335941859653, + "rewards/margins": 9.344783389379106, + "rewards/rejected": -7.050449795193142, + "step": 1774 + }, + { + "epoch": 0.655253564671681, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 2.734505928964601e-06, + "logits/chosen": 1447523879.3846154, + "logits/rejected": 1704261955.368421, + "logps/chosen": -320.89539513221155, + "logps/rejected": -504.8295127467105, + "loss": 0.0615, + "rewards/chosen": 2.358127007117638, + "rewards/margins": 9.693497522639841, + "rewards/rejected": -7.335370515522204, + "step": 1775 + }, + { + "epoch": 0.6556227216095243, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.7292609556972333e-06, + "logits/chosen": 1250325211.4285715, + "logits/rejected": 1641172536.8888888, + "logps/chosen": -243.51906040736608, + "logps/rejected": -462.79584418402777, + "loss": 0.1699, + "rewards/chosen": 0.6329573903764997, + "rewards/margins": 7.918899468013218, + "rewards/rejected": -7.285942077636719, + "step": 1776 + }, + { + "epoch": 0.6559918785473674, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 2.724019128509964e-06, + "logits/chosen": 2513929830.4, + "logits/rejected": 1626523089.4545455, + "logps/chosen": -276.0488525390625, + "logps/rejected": -465.8650568181818, + "loss": 0.1023, + "rewards/chosen": 1.319387435913086, + "rewards/margins": 8.150345022028143, + "rewards/rejected": -6.830957586115057, + "step": 1777 + }, + { + "epoch": 0.6563610354852106, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 2.7187804546652742e-06, + "logits/chosen": 1567583883.6363637, + "logits/rejected": 2179145318.4, + "logps/chosen": -253.1637517755682, + "logps/rejected": -485.55888671875, + "loss": 0.1772, + "rewards/chosen": 1.572661659934304, + "rewards/margins": 8.904539559104226, + "rewards/rejected": -7.331877899169922, + "step": 1778 + }, + { + "epoch": 0.6567301924230539, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 2.7135449414212822e-06, + "logits/chosen": 1805073203.2, + "logits/rejected": 1795851685.6470587, + "logps/chosen": -304.08942057291665, + "logps/rejected": -427.2484777113971, + "loss": 0.1711, + "rewards/chosen": 0.7443264643351237, + "rewards/margins": 8.739339533039168, + "rewards/rejected": -7.995013068704044, + "step": 1779 + }, + { + "epoch": 0.6570993493608971, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 2.708312596031727e-06, + "logits/chosen": 1590097799.5294118, + "logits/rejected": 1544811861.3333333, + "logps/chosen": -348.0030158547794, + "logps/rejected": -387.2202473958333, + "loss": 0.1917, + "rewards/chosen": 0.9926937327665442, + "rewards/margins": 7.450203091490502, + "rewards/rejected": -6.457509358723958, + "step": 1780 + }, + { + "epoch": 0.6574685062987402, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.7030834257459513e-06, + "logits/chosen": 1866278068.7058823, + "logits/rejected": 1677997670.4, + "logps/chosen": -323.55980009191177, + "logps/rejected": -547.9143880208334, + "loss": 0.1521, + "rewards/chosen": 1.3404015933766085, + "rewards/margins": 9.195265586703432, + "rewards/rejected": -7.8548639933268225, + "step": 1781 + }, + { + "epoch": 0.6578376632365834, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.6978574378089085e-06, + "logits/chosen": 2094499254.857143, + "logits/rejected": 2740532792.888889, + "logps/chosen": -312.24703543526783, + "logps/rejected": -432.92724609375, + "loss": 0.116, + "rewards/chosen": 1.5192752565656389, + "rewards/margins": 7.947457646566724, + "rewards/rejected": -6.428182390001085, + "step": 1782 + }, + { + "epoch": 0.6582068201744267, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.692634639461138e-06, + "logits/chosen": 1924056746.6666667, + "logits/rejected": 2465758003.2, + "logps/chosen": -275.78798421223956, + "logps/rejected": -481.33671875, + "loss": 0.1041, + "rewards/chosen": 1.3930780092875164, + "rewards/margins": 8.414834753672281, + "rewards/rejected": -7.021756744384765, + "step": 1783 + }, + { + "epoch": 0.6585759771122699, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 2.6874150379387583e-06, + "logits/chosen": 1852525514.1052632, + "logits/rejected": 1400043204.9230769, + "logps/chosen": -314.0132349917763, + "logps/rejected": -568.2874098557693, + "loss": 0.1451, + "rewards/chosen": 1.6873496206183183, + "rewards/margins": 8.383081150441035, + "rewards/rejected": -6.695731529822717, + "step": 1784 + }, + { + "epoch": 0.658945134050113, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 2.6821986404734623e-06, + "logits/chosen": 2103991575.2727273, + "logits/rejected": 1735128064.0, + "logps/chosen": -320.56551846590907, + "logps/rejected": -419.467919921875, + "loss": 0.1881, + "rewards/chosen": 1.5679621262983843, + "rewards/margins": 8.780196623368697, + "rewards/rejected": -7.2122344970703125, + "step": 1785 + }, + { + "epoch": 0.6593142909879562, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 2.6769854542925045e-06, + "logits/chosen": 1588650586.3529413, + "logits/rejected": 1565297595.7333333, + "logps/chosen": -391.79308363970586, + "logps/rejected": -441.53359375, + "loss": 0.1507, + "rewards/chosen": 1.3654640422147863, + "rewards/margins": 8.949434175678327, + "rewards/rejected": -7.583970133463541, + "step": 1786 + }, + { + "epoch": 0.6596834479257995, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 2.6717754866186845e-06, + "logits/chosen": 1782974756.5714285, + "logits/rejected": 1556200448.0, + "logps/chosen": -247.95439801897322, + "logps/rejected": -458.67811414930554, + "loss": 0.1553, + "rewards/chosen": 0.9813518524169922, + "rewards/margins": 8.161427603827583, + "rewards/rejected": -7.18007575141059, + "step": 1787 + }, + { + "epoch": 0.6600526048636427, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 2.666568744670348e-06, + "logits/chosen": 1456236288.0, + "logits/rejected": 2672261632.0, + "logps/chosen": -219.67930603027344, + "logps/rejected": -523.4579467773438, + "loss": 0.1177, + "rewards/chosen": 1.7884666919708252, + "rewards/margins": 10.376601457595825, + "rewards/rejected": -8.588134765625, + "step": 1788 + }, + { + "epoch": 0.6604217618014858, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 2.6613652356613716e-06, + "logits/chosen": 1501416106.6666667, + "logits/rejected": 2062051886.5454545, + "logps/chosen": -264.8597470238095, + "logps/rejected": -556.0903764204545, + "loss": 0.1764, + "rewards/chosen": 1.6580685206821986, + "rewards/margins": 10.46091317511224, + "rewards/rejected": -8.802844654430043, + "step": 1789 + }, + { + "epoch": 0.660790918739329, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.656164966801149e-06, + "logits/chosen": 1686411673.6, + "logits/rejected": 2078693014.5882354, + "logps/chosen": -273.3792317708333, + "logps/rejected": -495.5188993566176, + "loss": 0.1521, + "rewards/chosen": 1.5920888264973958, + "rewards/margins": 9.744242290422028, + "rewards/rejected": -8.152153463924632, + "step": 1790 + }, + { + "epoch": 0.6611600756771723, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 2.6509679452945847e-06, + "logits/chosen": 1464935290.4347825, + "logits/rejected": 1434922325.3333333, + "logps/chosen": -213.1452105978261, + "logps/rejected": -532.7172309027778, + "loss": 0.1647, + "rewards/chosen": 1.959974537725034, + "rewards/margins": 9.786448197664269, + "rewards/rejected": -7.826473659939236, + "step": 1791 + }, + { + "epoch": 0.6615292326150155, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 2.6457741783420885e-06, + "logits/chosen": 1753793604.2666667, + "logits/rejected": 1626140792.4705882, + "logps/chosen": -292.4064453125, + "logps/rejected": -486.0618106617647, + "loss": 0.1371, + "rewards/chosen": 1.2500040690104166, + "rewards/margins": 10.326674098594516, + "rewards/rejected": -9.076670029584099, + "step": 1792 + }, + { + "epoch": 0.6618983895528586, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.6405836731395594e-06, + "logits/chosen": 1779415168.0, + "logits/rejected": 1361864576.0, + "logps/chosen": -369.597900390625, + "logps/rejected": -876.531982421875, + "loss": 0.1455, + "rewards/chosen": 1.595271110534668, + "rewards/margins": 31.889519691467285, + "rewards/rejected": -30.294248580932617, + "step": 1793 + }, + { + "epoch": 0.6622675464907019, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 2.635396436878374e-06, + "logits/chosen": 1362979532.8, + "logits/rejected": 1437518848.0, + "logps/chosen": -266.05390625, + "logps/rejected": -389.6498209635417, + "loss": 0.1717, + "rewards/chosen": 1.6777544021606445, + "rewards/margins": 7.491937319437663, + "rewards/rejected": -5.8141829172770185, + "step": 1794 + }, + { + "epoch": 0.6626367034285451, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 2.630212476745383e-06, + "logits/chosen": 1871706931.2, + "logits/rejected": 1808788058.3529413, + "logps/chosen": -256.8220703125, + "logps/rejected": -424.83228975183823, + "loss": 0.1053, + "rewards/chosen": 1.801486078898112, + "rewards/margins": 10.079095511342965, + "rewards/rejected": -8.277609432444853, + "step": 1795 + }, + { + "epoch": 0.6630058603663883, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 2.6250317999228993e-06, + "logits/chosen": 2002431109.5652175, + "logits/rejected": 2309064476.4444447, + "logps/chosen": -290.6122409986413, + "logps/rejected": -428.26063368055554, + "loss": 0.2252, + "rewards/chosen": 0.9820673569389011, + "rewards/margins": 9.850985034076487, + "rewards/rejected": -8.868917677137587, + "step": 1796 + }, + { + "epoch": 0.6633750173042314, + "grad_norm": 14.625, + "kl": 0.5518121719360352, + "learning_rate": 2.6198544135886818e-06, + "logits/chosen": 1749354172.631579, + "logits/rejected": 2079570707.6923077, + "logps/chosen": -252.75208162006578, + "logps/rejected": -509.5930363581731, + "loss": 0.2312, + "rewards/chosen": 0.6273404171592311, + "rewards/margins": 8.777539720419448, + "rewards/rejected": -8.150199303260216, + "step": 1797 + }, + { + "epoch": 0.6637441742420747, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 2.6146803249159335e-06, + "logits/chosen": 1201631368.5333333, + "logits/rejected": 1398584500.7058823, + "logps/chosen": -219.70162760416667, + "logps/rejected": -465.7134363511029, + "loss": 0.1121, + "rewards/chosen": 1.4890632629394531, + "rewards/margins": 8.821295569924747, + "rewards/rejected": -7.332232306985294, + "step": 1798 + }, + { + "epoch": 0.6641133311799179, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 2.609509541073293e-06, + "logits/chosen": 1377925632.0, + "logits/rejected": 2201597952.0, + "logps/chosen": -246.51422119140625, + "logps/rejected": -529.2877197265625, + "loss": 0.1067, + "rewards/chosen": 2.3279287815093994, + "rewards/margins": 10.458044290542603, + "rewards/rejected": -8.130115509033203, + "step": 1799 + }, + { + "epoch": 0.6644824881177611, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.60434206922481e-06, + "logits/chosen": 2230923605.3333335, + "logits/rejected": 2296409907.2, + "logps/chosen": -328.73207600911456, + "logps/rejected": -429.267919921875, + "loss": 0.1489, + "rewards/chosen": 0.6577372550964355, + "rewards/margins": 8.869567394256592, + "rewards/rejected": -8.211830139160156, + "step": 1800 + }, + { + "epoch": 0.6648516450556042, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 2.599177916529954e-06, + "logits/chosen": 1276581376.0, + "logits/rejected": 1358893312.0, + "logps/chosen": -225.39784240722656, + "logps/rejected": -587.2905883789062, + "loss": 0.1591, + "rewards/chosen": 0.9404831528663635, + "rewards/margins": 9.967606604099274, + "rewards/rejected": -9.02712345123291, + "step": 1801 + }, + { + "epoch": 0.6652208019934475, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 2.5940170901435945e-06, + "logits/chosen": 2114540483.764706, + "logits/rejected": 1802040934.4, + "logps/chosen": -300.80026424632354, + "logps/rejected": -541.8185221354166, + "loss": 0.1122, + "rewards/chosen": 2.095517775591682, + "rewards/margins": 9.999207350786994, + "rewards/rejected": -7.903689575195313, + "step": 1802 + }, + { + "epoch": 0.6655899589312907, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.5888595972159864e-06, + "logits/chosen": 1587220753.0666666, + "logits/rejected": 1886542908.235294, + "logps/chosen": -266.887548828125, + "logps/rejected": -499.2874540441176, + "loss": 0.155, + "rewards/chosen": 1.3916994730631511, + "rewards/margins": 9.068980572270412, + "rewards/rejected": -7.677281099207261, + "step": 1803 + }, + { + "epoch": 0.6659591158691338, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.5837054448927733e-06, + "logits/chosen": 1066392576.0, + "logits/rejected": 1877362688.0, + "logps/chosen": -252.239501953125, + "logps/rejected": -555.009521484375, + "loss": 0.1306, + "rewards/chosen": 1.5011729001998901, + "rewards/margins": 10.464088320732117, + "rewards/rejected": -8.962915420532227, + "step": 1804 + }, + { + "epoch": 0.666328272806977, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.5785546403149696e-06, + "logits/chosen": 1695060329.4117646, + "logits/rejected": 1453831372.8, + "logps/chosen": -194.9714786305147, + "logps/rejected": -378.96715494791664, + "loss": 0.1423, + "rewards/chosen": 1.4430308622472428, + "rewards/margins": 7.6814870796951595, + "rewards/rejected": -6.238456217447917, + "step": 1805 + }, + { + "epoch": 0.6666974297448203, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 2.573407190618948e-06, + "logits/chosen": 2027876205.7142856, + "logits/rejected": 1553025433.6, + "logps/chosen": -257.9387904575893, + "logps/rejected": -516.3011328125, + "loss": 0.109, + "rewards/chosen": 0.3104919024876186, + "rewards/margins": 9.315559041159492, + "rewards/rejected": -9.005067138671874, + "step": 1806 + }, + { + "epoch": 0.6670665866826635, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 2.568263102936431e-06, + "logits/chosen": 1662700416.0, + "logits/rejected": 2655929856.0, + "logps/chosen": -230.6611785888672, + "logps/rejected": -540.3072509765625, + "loss": 0.129, + "rewards/chosen": 1.9568324089050293, + "rewards/margins": 12.016018390655518, + "rewards/rejected": -10.059185981750488, + "step": 1807 + }, + { + "epoch": 0.6674357436205066, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.5631223843944937e-06, + "logits/chosen": 1893139524.2666667, + "logits/rejected": 2002654870.5882354, + "logps/chosen": -243.64425455729167, + "logps/rejected": -400.11827895220586, + "loss": 0.1203, + "rewards/chosen": 1.8218401590983073, + "rewards/margins": 8.608680261350145, + "rewards/rejected": -6.786840102251838, + "step": 1808 + }, + { + "epoch": 0.6678049005583498, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.5579850421155294e-06, + "logits/chosen": 1557470549.3333333, + "logits/rejected": 1371534482.2857144, + "logps/chosen": -239.25724283854166, + "logps/rejected": -590.69921875, + "loss": 0.1132, + "rewards/chosen": 1.9917689429389105, + "rewards/margins": 11.000986886402917, + "rewards/rejected": -9.009217943464007, + "step": 1809 + }, + { + "epoch": 0.6681740574961931, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 2.5528510832172646e-06, + "logits/chosen": 1348889258.6666667, + "logits/rejected": 1264755060.3636363, + "logps/chosen": -235.9487769717262, + "logps/rejected": -485.80721768465907, + "loss": 0.1391, + "rewards/chosen": 2.284846532912481, + "rewards/margins": 10.624069527630166, + "rewards/rejected": -8.339222994717685, + "step": 1810 + }, + { + "epoch": 0.6685432144340363, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.5477205148127347e-06, + "logits/chosen": 1224534784.0, + "logits/rejected": 1837919232.0, + "logps/chosen": -285.3564453125, + "logps/rejected": -559.5455932617188, + "loss": 0.1328, + "rewards/chosen": 1.9252358675003052, + "rewards/margins": 10.526804089546204, + "rewards/rejected": -8.601568222045898, + "step": 1811 + }, + { + "epoch": 0.6689123713718794, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.5425933440102737e-06, + "logits/chosen": 2082836007.3846154, + "logits/rejected": 1845945182.3157895, + "logps/chosen": -299.2668644831731, + "logps/rejected": -492.24229029605266, + "loss": 0.1324, + "rewards/chosen": 1.3280663123497596, + "rewards/margins": 7.850541705544661, + "rewards/rejected": -6.5224753931949015, + "step": 1812 + }, + { + "epoch": 0.6692815283097227, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 2.537469577913514e-06, + "logits/chosen": 1657822859.6363637, + "logits/rejected": 1332249941.3333333, + "logps/chosen": -256.6019398082386, + "logps/rejected": -377.5781017485119, + "loss": 0.1362, + "rewards/chosen": 1.012354157187722, + "rewards/margins": 7.594868602174701, + "rewards/rejected": -6.5825144449869795, + "step": 1813 + }, + { + "epoch": 0.6696506852475659, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 2.532349223621369e-06, + "logits/chosen": 2157450293.894737, + "logits/rejected": 1659152699.0769231, + "logps/chosen": -312.1851356907895, + "logps/rejected": -388.30855618990387, + "loss": 0.1802, + "rewards/chosen": 1.1523074099892063, + "rewards/margins": 9.62126105613554, + "rewards/rejected": -8.468953646146334, + "step": 1814 + }, + { + "epoch": 0.6700198421854091, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.527232288228022e-06, + "logits/chosen": 2174915242.6666665, + "logits/rejected": 1484675794.8235295, + "logps/chosen": -294.6117838541667, + "logps/rejected": -472.72107651654414, + "loss": 0.1354, + "rewards/chosen": 1.3754538218180339, + "rewards/margins": 9.995120426252777, + "rewards/rejected": -8.619666604434743, + "step": 1815 + }, + { + "epoch": 0.6703889991232522, + "grad_norm": 12.0625, + "kl": 0.11527442932128906, + "learning_rate": 2.522118778822924e-06, + "logits/chosen": 1767250944.0, + "logits/rejected": 2073569792.0, + "logps/chosen": -296.074951171875, + "logps/rejected": -550.4539794921875, + "loss": 0.1547, + "rewards/chosen": 1.7766334533691406, + "rewards/margins": 9.777936808268228, + "rewards/rejected": -8.001303354899088, + "step": 1816 + }, + { + "epoch": 0.6707581560610955, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 2.517008702490778e-06, + "logits/chosen": 1634676167.1111112, + "logits/rejected": 1793366893.7142856, + "logps/chosen": -234.28738064236111, + "logps/rejected": -426.3314732142857, + "loss": 0.1304, + "rewards/chosen": 2.016174740261502, + "rewards/margins": 8.711341736808656, + "rewards/rejected": -6.695166996547154, + "step": 1817 + }, + { + "epoch": 0.6711273129989387, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 2.511902066311527e-06, + "logits/chosen": 1050155212.8, + "logits/rejected": 1315798954.6666667, + "logps/chosen": -206.6834228515625, + "logps/rejected": -500.2472330729167, + "loss": 0.1212, + "rewards/chosen": 2.235467529296875, + "rewards/margins": 10.279139455159505, + "rewards/rejected": -8.04367192586263, + "step": 1818 + }, + { + "epoch": 0.6714964699367819, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 2.5067988773603523e-06, + "logits/chosen": 2893668352.0, + "logits/rejected": 3052377702.4, + "logps/chosen": -278.1406824448529, + "logps/rejected": -715.4054036458333, + "loss": 0.1743, + "rewards/chosen": 1.00731580397662, + "rewards/margins": 10.713310473572975, + "rewards/rejected": -9.705994669596354, + "step": 1819 + }, + { + "epoch": 0.671865626874625, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 2.5016991427076585e-06, + "logits/chosen": 1815605248.0, + "logits/rejected": 1442384179.2, + "logps/chosen": -299.70401278409093, + "logps/rejected": -437.274267578125, + "loss": 0.226, + "rewards/chosen": 1.1419852863658557, + "rewards/margins": 7.598835719715465, + "rewards/rejected": -6.45685043334961, + "step": 1820 + }, + { + "epoch": 0.6722347838124683, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.4966028694190607e-06, + "logits/chosen": 1719133184.0, + "logits/rejected": 1725091986.2857144, + "logps/chosen": -294.2526584201389, + "logps/rejected": -424.90262276785717, + "loss": 0.172, + "rewards/chosen": 1.1304768456353083, + "rewards/margins": 9.704093963380846, + "rewards/rejected": -8.573617117745536, + "step": 1821 + }, + { + "epoch": 0.6726039407503115, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 2.491510064555381e-06, + "logits/chosen": 3066610944.0, + "logits/rejected": 2005792000.0, + "logps/chosen": -212.1416015625, + "logps/rejected": -473.2079772949219, + "loss": 0.0882, + "rewards/chosen": 2.241234064102173, + "rewards/margins": 9.862127542495728, + "rewards/rejected": -7.620893478393555, + "step": 1822 + }, + { + "epoch": 0.6729730976881547, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 2.48642073517264e-06, + "logits/chosen": 2007819059.2, + "logits/rejected": 1447310506.6666667, + "logps/chosen": -264.1034423828125, + "logps/rejected": -471.8236897786458, + "loss": 0.1276, + "rewards/chosen": 2.324996566772461, + "rewards/margins": 10.888628260294595, + "rewards/rejected": -8.563631693522135, + "step": 1823 + }, + { + "epoch": 0.6733422546259978, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.481334888322035e-06, + "logits/chosen": 1266717635.764706, + "logits/rejected": 2068051694.9333334, + "logps/chosen": -291.97506893382354, + "logps/rejected": -523.9169270833333, + "loss": 0.1726, + "rewards/chosen": 1.095582176657284, + "rewards/margins": 10.42832753798541, + "rewards/rejected": -9.332745361328126, + "step": 1824 + }, + { + "epoch": 0.6737114115638411, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 2.4762525310499413e-06, + "logits/chosen": 1531175680.0, + "logits/rejected": 2002698880.0, + "logps/chosen": -263.8525695800781, + "logps/rejected": -449.5025634765625, + "loss": 0.1685, + "rewards/chosen": 1.0050832033157349, + "rewards/margins": 8.547943711280823, + "rewards/rejected": -7.542860507965088, + "step": 1825 + }, + { + "epoch": 0.6740805685016843, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.4711736703979015e-06, + "logits/chosen": 2065974193.2307692, + "logits/rejected": 2230527029.894737, + "logps/chosen": -272.4829852764423, + "logps/rejected": -470.12577097039474, + "loss": 0.1437, + "rewards/chosen": 0.9971044246967022, + "rewards/margins": 8.440077511405173, + "rewards/rejected": -7.44297308670847, + "step": 1826 + }, + { + "epoch": 0.6744497254395275, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 2.4660983134026156e-06, + "logits/chosen": 2596429824.0, + "logits/rejected": 1548359141.0526316, + "logps/chosen": -230.12659630408655, + "logps/rejected": -554.060701069079, + "loss": 0.1222, + "rewards/chosen": 1.427984090951773, + "rewards/margins": 10.860303330517974, + "rewards/rejected": -9.432319239566201, + "step": 1827 + }, + { + "epoch": 0.6748188823773706, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 2.461026467095921e-06, + "logits/chosen": 1696020293.8181818, + "logits/rejected": 1847711539.2, + "logps/chosen": -258.0262562144886, + "logps/rejected": -364.776171875, + "loss": 0.1816, + "rewards/chosen": 1.817153583873402, + "rewards/margins": 8.226541553844106, + "rewards/rejected": -6.4093879699707035, + "step": 1828 + }, + { + "epoch": 0.6751880393152139, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 2.4559581385047993e-06, + "logits/chosen": 1577357204.2105262, + "logits/rejected": 1749869804.3076923, + "logps/chosen": -277.63905736019734, + "logps/rejected": -469.46326622596155, + "loss": 0.1519, + "rewards/chosen": 1.7740956356650905, + "rewards/margins": 9.538242293755536, + "rewards/rejected": -7.764146658090445, + "step": 1829 + }, + { + "epoch": 0.6755571962530571, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 2.4508933346513563e-06, + "logits/chosen": 1437677714.2857144, + "logits/rejected": 1581115164.4444444, + "logps/chosen": -304.3115931919643, + "logps/rejected": -440.0614420572917, + "loss": 0.1527, + "rewards/chosen": 0.8881688117980957, + "rewards/margins": 8.064026355743408, + "rewards/rejected": -7.1758575439453125, + "step": 1830 + }, + { + "epoch": 0.6759263531909003, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 2.445832062552811e-06, + "logits/chosen": 2010082048.0, + "logits/rejected": 2248150016.0, + "logps/chosen": -263.6219787597656, + "logps/rejected": -516.5211791992188, + "loss": 0.1692, + "rewards/chosen": 0.989793598651886, + "rewards/margins": 9.436528980731964, + "rewards/rejected": -8.446735382080078, + "step": 1831 + }, + { + "epoch": 0.6762955101287434, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 2.440774329221492e-06, + "logits/chosen": 2348346026.6666665, + "logits/rejected": 1381610057.142857, + "logps/chosen": -184.9735107421875, + "logps/rejected": -487.77040318080356, + "loss": 0.1377, + "rewards/chosen": 1.6971351835462782, + "rewards/margins": 9.0003142432561, + "rewards/rejected": -7.303179059709821, + "step": 1832 + }, + { + "epoch": 0.6766646670665867, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.435720141664827e-06, + "logits/chosen": 1616903918.9333334, + "logits/rejected": 1634448203.2941177, + "logps/chosen": -283.48828125, + "logps/rejected": -385.28610409007354, + "loss": 0.1005, + "rewards/chosen": 2.2429850260416666, + "rewards/margins": 9.774015119964002, + "rewards/rejected": -7.531030093922334, + "step": 1833 + }, + { + "epoch": 0.6770338240044299, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 2.430669506885326e-06, + "logits/chosen": 2366322064.695652, + "logits/rejected": 2592576853.3333335, + "logps/chosen": -353.7388756793478, + "logps/rejected": -417.4991861979167, + "loss": 0.1808, + "rewards/chosen": 1.5348212200662363, + "rewards/margins": 8.88172639856016, + "rewards/rejected": -7.346905178493923, + "step": 1834 + }, + { + "epoch": 0.6774029809422731, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.425622431880579e-06, + "logits/chosen": 1707603968.0, + "logits/rejected": 1606638114.1333334, + "logps/chosen": -258.8014131433824, + "logps/rejected": -449.73352864583336, + "loss": 0.1643, + "rewards/chosen": 1.2624996409696692, + "rewards/margins": 10.162690884459252, + "rewards/rejected": -8.900191243489584, + "step": 1835 + }, + { + "epoch": 0.6777721378801163, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.420578923643248e-06, + "logits/chosen": 1684716604.235294, + "logits/rejected": 1211133678.9333334, + "logps/chosen": -263.18175551470586, + "logps/rejected": -569.7994791666666, + "loss": 0.1179, + "rewards/chosen": 2.0082756491268383, + "rewards/margins": 14.481849460975797, + "rewards/rejected": -12.473573811848958, + "step": 1836 + }, + { + "epoch": 0.6781412948179595, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.4155389891610454e-06, + "logits/chosen": 2158068224.0, + "logits/rejected": 1304492160.0, + "logps/chosen": -288.1869201660156, + "logps/rejected": -473.0697326660156, + "loss": 0.1589, + "rewards/chosen": 1.2390435934066772, + "rewards/margins": 8.955077528953552, + "rewards/rejected": -7.716033935546875, + "step": 1837 + }, + { + "epoch": 0.6785104517558027, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 2.4105026354167376e-06, + "logits/chosen": 1422285092.5714285, + "logits/rejected": 1917650033.7777777, + "logps/chosen": -357.55360630580356, + "logps/rejected": -507.98871527777777, + "loss": 0.1629, + "rewards/chosen": 0.7540498461042132, + "rewards/margins": 9.566576972840323, + "rewards/rejected": -8.81252712673611, + "step": 1838 + }, + { + "epoch": 0.6788796086936458, + "grad_norm": 10.5, + "kl": 1.847276210784912, + "learning_rate": 2.405469869388131e-06, + "logits/chosen": 1842025865.8461537, + "logits/rejected": 1610903013.0526316, + "logps/chosen": -258.28555063100964, + "logps/rejected": -451.9459292763158, + "loss": 0.1289, + "rewards/chosen": 1.5402676508976862, + "rewards/margins": 9.692441144935515, + "rewards/rejected": -8.152173494037829, + "step": 1839 + }, + { + "epoch": 0.6792487656314891, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 2.400440698048056e-06, + "logits/chosen": 1347317760.0, + "logits/rejected": 1674520371.2, + "logps/chosen": -288.7831217447917, + "logps/rejected": -424.494287109375, + "loss": 0.0965, + "rewards/chosen": 1.6326177914937336, + "rewards/margins": 9.55650389989217, + "rewards/rejected": -7.923886108398437, + "step": 1840 + }, + { + "epoch": 0.6796179225693323, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 2.395415128364368e-06, + "logits/chosen": 1447200674.909091, + "logits/rejected": 1750732019.8095238, + "logps/chosen": -231.3095703125, + "logps/rejected": -546.2236793154761, + "loss": 0.0887, + "rewards/chosen": 1.4354571429165928, + "rewards/margins": 10.757238119711609, + "rewards/rejected": -9.321780976795015, + "step": 1841 + }, + { + "epoch": 0.6799870795071755, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 2.390393167299929e-06, + "logits/chosen": 1373770342.4, + "logits/rejected": 1834337745.4545455, + "logps/chosen": -291.35107421875, + "logps/rejected": -446.21244673295456, + "loss": 0.0598, + "rewards/chosen": 2.116550254821777, + "rewards/margins": 10.414523194052958, + "rewards/rejected": -8.29797293923118, + "step": 1842 + }, + { + "epoch": 0.6803562364450186, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 2.3853748218126e-06, + "logits/chosen": 2044567142.4, + "logits/rejected": 1645286339.764706, + "logps/chosen": -216.75791015625, + "logps/rejected": -424.40837545955884, + "loss": 0.1935, + "rewards/chosen": 0.7933956146240234, + "rewards/margins": 7.809014780381146, + "rewards/rejected": -7.015619165757123, + "step": 1843 + }, + { + "epoch": 0.6807253933828619, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.3803600988552373e-06, + "logits/chosen": 2239281425.0666666, + "logits/rejected": 1709942904.4705882, + "logps/chosen": -290.30120442708335, + "logps/rejected": -477.1741153492647, + "loss": 0.1794, + "rewards/chosen": 0.8928747812906901, + "rewards/margins": 9.864970943974514, + "rewards/rejected": -8.972096162683824, + "step": 1844 + }, + { + "epoch": 0.6810945503207051, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 2.3753490053756766e-06, + "logits/chosen": 2340735561.142857, + "logits/rejected": 1781532785.7777777, + "logps/chosen": -329.75209263392856, + "logps/rejected": -460.8782552083333, + "loss": 0.1276, + "rewards/chosen": 1.5030321393694197, + "rewards/margins": 8.313785855732267, + "rewards/rejected": -6.810753716362847, + "step": 1845 + }, + { + "epoch": 0.6814637072585483, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 2.370341548316722e-06, + "logits/chosen": 1669777250.4615386, + "logits/rejected": 1907553118.3157895, + "logps/chosen": -295.5551945612981, + "logps/rejected": -462.0615748355263, + "loss": 0.1568, + "rewards/chosen": 1.6138220566969652, + "rewards/margins": 7.586136188584301, + "rewards/rejected": -5.972314131887336, + "step": 1846 + }, + { + "epoch": 0.6818328641963914, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 2.3653377346161423e-06, + "logits/chosen": 2146256440.8888888, + "logits/rejected": 2179321856.0, + "logps/chosen": -329.32481553819446, + "logps/rejected": -473.94461495535717, + "loss": 0.1266, + "rewards/chosen": 1.5894126892089844, + "rewards/margins": 9.121059962681361, + "rewards/rejected": -7.531647273472378, + "step": 1847 + }, + { + "epoch": 0.6822020211342347, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 2.36033757120666e-06, + "logits/chosen": 1506084659.2, + "logits/rejected": 1893861034.6666667, + "logps/chosen": -296.00234375, + "logps/rejected": -457.6116536458333, + "loss": 0.184, + "rewards/chosen": 1.6127111434936523, + "rewards/margins": 9.999127260843911, + "rewards/rejected": -8.38641611735026, + "step": 1848 + }, + { + "epoch": 0.6825711780720779, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 2.3553410650159347e-06, + "logits/chosen": 2107004672.0, + "logits/rejected": 1820703872.0, + "logps/chosen": -255.36077880859375, + "logps/rejected": -447.7759704589844, + "loss": 0.1824, + "rewards/chosen": 0.808235764503479, + "rewards/margins": 8.178991913795471, + "rewards/rejected": -7.370756149291992, + "step": 1849 + }, + { + "epoch": 0.6829403350099211, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.3503482229665637e-06, + "logits/chosen": 1190100129.6842105, + "logits/rejected": 2011496132.9230769, + "logps/chosen": -248.64738384046052, + "logps/rejected": -590.8464918870193, + "loss": 0.1513, + "rewards/chosen": 1.6577070135819285, + "rewards/margins": 11.443496503328022, + "rewards/rejected": -9.785789489746094, + "step": 1850 + }, + { + "epoch": 0.6833094919477642, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 2.3453590519760676e-06, + "logits/chosen": 2181107165.866667, + "logits/rejected": 2016423936.0, + "logps/chosen": -209.05, + "logps/rejected": -440.6190831801471, + "loss": 0.1203, + "rewards/chosen": 1.796624755859375, + "rewards/margins": 8.856276298971737, + "rewards/rejected": -7.059651543112362, + "step": 1851 + }, + { + "epoch": 0.6836786488856075, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 2.340373558956877e-06, + "logits/chosen": 1289178043.7333333, + "logits/rejected": 1951695570.8235295, + "logps/chosen": -254.44558919270833, + "logps/rejected": -420.13947610294116, + "loss": 0.1527, + "rewards/chosen": 1.1091058095296225, + "rewards/margins": 9.303872583426681, + "rewards/rejected": -8.194766773897058, + "step": 1852 + }, + { + "epoch": 0.6840478058234507, + "grad_norm": 16.25, + "kl": 0.14300203323364258, + "learning_rate": 2.3353917508163297e-06, + "logits/chosen": 1694208682.6666667, + "logits/rejected": 2352705776.9411764, + "logps/chosen": -279.57281901041665, + "logps/rejected": -483.86891084558823, + "loss": 0.1443, + "rewards/chosen": 1.3617869059244792, + "rewards/margins": 9.846018293792126, + "rewards/rejected": -8.484231387867647, + "step": 1853 + }, + { + "epoch": 0.6844169627612939, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 2.3304136344566603e-06, + "logits/chosen": 2026874606.9333334, + "logits/rejected": 2259009295.0588236, + "logps/chosen": -328.0416015625, + "logps/rejected": -635.4447380514706, + "loss": 0.1727, + "rewards/chosen": 0.7837352752685547, + "rewards/margins": 11.257474001716165, + "rewards/rejected": -10.47373872644761, + "step": 1854 + }, + { + "epoch": 0.684786119699137, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 2.325439216774982e-06, + "logits/chosen": 1907851264.0, + "logits/rejected": 1161671258.3529413, + "logps/chosen": -195.85983072916667, + "logps/rejected": -559.1506204044117, + "loss": 0.1124, + "rewards/chosen": 1.5873835245768229, + "rewards/margins": 9.785525183584177, + "rewards/rejected": -8.198141659007353, + "step": 1855 + }, + { + "epoch": 0.6851552766369803, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 2.3204685046632884e-06, + "logits/chosen": 2039686290.2857144, + "logits/rejected": 2223153379.5555553, + "logps/chosen": -324.53271484375, + "logps/rejected": -521.7889539930555, + "loss": 0.1273, + "rewards/chosen": 1.6513823100498743, + "rewards/margins": 9.792065802074614, + "rewards/rejected": -8.14068349202474, + "step": 1856 + }, + { + "epoch": 0.6855244335748235, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 2.3155015050084406e-06, + "logits/chosen": 1602740224.0, + "logits/rejected": 1352817112.6153846, + "logps/chosen": -312.42457339638156, + "logps/rejected": -418.7940204326923, + "loss": 0.2057, + "rewards/chosen": 0.9935300726639597, + "rewards/margins": 7.8836491851188875, + "rewards/rejected": -6.8901191124549275, + "step": 1857 + }, + { + "epoch": 0.6858935905126667, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 2.3105382246921516e-06, + "logits/chosen": 1850208737.8823528, + "logits/rejected": 1667200341.3333333, + "logps/chosen": -333.01901424632354, + "logps/rejected": -460.8021484375, + "loss": 0.1397, + "rewards/chosen": 1.599672878489775, + "rewards/margins": 9.994944688385608, + "rewards/rejected": -8.395271809895833, + "step": 1858 + }, + { + "epoch": 0.6862627474505099, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.3055786705909803e-06, + "logits/chosen": 1744784822.857143, + "logits/rejected": 1645122446.2222223, + "logps/chosen": -249.96090262276786, + "logps/rejected": -367.3926595052083, + "loss": 0.1245, + "rewards/chosen": 1.4633640561785017, + "rewards/margins": 8.083724536592998, + "rewards/rejected": -6.620360480414496, + "step": 1859 + }, + { + "epoch": 0.6866319043883531, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 2.3006228495763295e-06, + "logits/chosen": 1734515840.0, + "logits/rejected": 1629295744.0, + "logps/chosen": -298.3246154785156, + "logps/rejected": -460.05450439453125, + "loss": 0.174, + "rewards/chosen": 1.0186371803283691, + "rewards/margins": 9.3659987449646, + "rewards/rejected": -8.34736156463623, + "step": 1860 + }, + { + "epoch": 0.6870010613261963, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 2.295670768514427e-06, + "logits/chosen": 1367317267.6923077, + "logits/rejected": 1442224882.5263157, + "logps/chosen": -156.10695237379807, + "logps/rejected": -439.7779091282895, + "loss": 0.0856, + "rewards/chosen": 2.1029814206636868, + "rewards/margins": 9.850369758451516, + "rewards/rejected": -7.747388337787829, + "step": 1861 + }, + { + "epoch": 0.6873702182640395, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.290722434266315e-06, + "logits/chosen": 1863443516.235294, + "logits/rejected": 2505469678.9333334, + "logps/chosen": -264.03693704044116, + "logps/rejected": -380.4642578125, + "loss": 0.1623, + "rewards/chosen": 1.3238257239846623, + "rewards/margins": 8.131736388860965, + "rewards/rejected": -6.807910664876302, + "step": 1862 + }, + { + "epoch": 0.6877393752018827, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 2.285777853687849e-06, + "logits/chosen": 1616906870.1538463, + "logits/rejected": 1815105967.1578948, + "logps/chosen": -320.0695988581731, + "logps/rejected": -470.73745888157896, + "loss": 0.1155, + "rewards/chosen": 1.5938695760873647, + "rewards/margins": 11.01686087697141, + "rewards/rejected": -9.422991300884046, + "step": 1863 + }, + { + "epoch": 0.6881085321397259, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 2.280837033629684e-06, + "logits/chosen": 1706536839.5294118, + "logits/rejected": 1157500928.0, + "logps/chosen": -272.28762637867646, + "logps/rejected": -448.58092447916664, + "loss": 0.1396, + "rewards/chosen": 1.6136057236615349, + "rewards/margins": 8.591038991890702, + "rewards/rejected": -6.977433268229166, + "step": 1864 + }, + { + "epoch": 0.6884776890775691, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 2.275899980937262e-06, + "logits/chosen": 2339191466.6666665, + "logits/rejected": 2795441590.857143, + "logps/chosen": -297.89453125, + "logps/rejected": -587.38134765625, + "loss": 0.1655, + "rewards/chosen": 1.4880304336547852, + "rewards/margins": 10.45331505366734, + "rewards/rejected": -8.965284620012556, + "step": 1865 + }, + { + "epoch": 0.6888468460154124, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 2.270966702450806e-06, + "logits/chosen": 1963413048.8888888, + "logits/rejected": 1440222061.7142856, + "logps/chosen": -249.27408854166666, + "logps/rejected": -451.9585658482143, + "loss": 0.1615, + "rewards/chosen": 1.3866745630900066, + "rewards/margins": 8.89476880573091, + "rewards/rejected": -7.508094242640904, + "step": 1866 + }, + { + "epoch": 0.6892160029532555, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 2.2660372050053136e-06, + "logits/chosen": 2219330628.266667, + "logits/rejected": 1696203956.7058823, + "logps/chosen": -364.0068033854167, + "logps/rejected": -476.48879825367646, + "loss": 0.1155, + "rewards/chosen": 1.5993817647298176, + "rewards/margins": 10.498470904780369, + "rewards/rejected": -8.89908914005055, + "step": 1867 + }, + { + "epoch": 0.6895851598910987, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 2.2611114954305372e-06, + "logits/chosen": 2013377142.1538463, + "logits/rejected": 1814668557.4736843, + "logps/chosen": -247.83531775841345, + "logps/rejected": -428.3452919407895, + "loss": 0.121, + "rewards/chosen": 1.9658473088191106, + "rewards/margins": 9.108252768574456, + "rewards/rejected": -7.142405459755345, + "step": 1868 + }, + { + "epoch": 0.6899543168289419, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 2.256189580550987e-06, + "logits/chosen": 1891612672.0, + "logits/rejected": 1822078634.6666667, + "logps/chosen": -323.7779296875, + "logps/rejected": -461.7660725911458, + "loss": 0.2268, + "rewards/chosen": 0.953312873840332, + "rewards/margins": 8.214058876037598, + "rewards/rejected": -7.260746002197266, + "step": 1869 + }, + { + "epoch": 0.6903234737667852, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 2.2512714671859147e-06, + "logits/chosen": 1603711457.8823528, + "logits/rejected": 1154236142.9333334, + "logps/chosen": -406.5901309742647, + "logps/rejected": -512.5411458333333, + "loss": 0.1312, + "rewards/chosen": 1.5213451385498047, + "rewards/margins": 10.866423670450846, + "rewards/rejected": -9.345078531901041, + "step": 1870 + }, + { + "epoch": 0.6906926307046283, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.2463571621493006e-06, + "logits/chosen": 1429026923.7894738, + "logits/rejected": 1641615044.9230769, + "logps/chosen": -301.4481907894737, + "logps/rejected": -607.7018855168269, + "loss": 0.134, + "rewards/chosen": 1.653300636693051, + "rewards/margins": 9.862562017402187, + "rewards/rejected": -8.209261380709135, + "step": 1871 + }, + { + "epoch": 0.6910617876424715, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 2.241446672249854e-06, + "logits/chosen": 1883220329.4117646, + "logits/rejected": 1461217006.9333334, + "logps/chosen": -378.24181410845586, + "logps/rejected": -480.53440755208334, + "loss": 0.1774, + "rewards/chosen": 1.1035297618192785, + "rewards/margins": 8.459967444924747, + "rewards/rejected": -7.356437683105469, + "step": 1872 + }, + { + "epoch": 0.6914309445803147, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 2.2365400042909973e-06, + "logits/chosen": 1602360661.3333333, + "logits/rejected": 1336622569.7391305, + "logps/chosen": -303.81011284722223, + "logps/rejected": -503.7156504755435, + "loss": 0.1133, + "rewards/chosen": 1.2572858598497179, + "rewards/margins": 12.177017331699243, + "rewards/rejected": -10.919731471849525, + "step": 1873 + }, + { + "epoch": 0.691800101518158, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.2316371650708534e-06, + "logits/chosen": 1758147470.2222223, + "logits/rejected": 2110564059.4285715, + "logps/chosen": -253.47333441840277, + "logps/rejected": -564.9872349330357, + "loss": 0.0936, + "rewards/chosen": 2.623703214857313, + "rewards/margins": 11.19162716941228, + "rewards/rejected": -8.567923954554967, + "step": 1874 + }, + { + "epoch": 0.6921692584560011, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 2.2267381613822482e-06, + "logits/chosen": 1658723474.2857144, + "logits/rejected": 1486572657.7777777, + "logps/chosen": -374.79310825892856, + "logps/rejected": -412.43055555555554, + "loss": 0.1506, + "rewards/chosen": 0.9434152330671038, + "rewards/margins": 7.285788975064717, + "rewards/rejected": -6.342373741997613, + "step": 1875 + }, + { + "epoch": 0.6925384153938443, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 2.2218430000126858e-06, + "logits/chosen": 1371146922.6666667, + "logits/rejected": 1833887451.4285715, + "logps/chosen": -235.55593532986111, + "logps/rejected": -464.63560267857144, + "loss": 0.1644, + "rewards/chosen": 1.7837965223524306, + "rewards/margins": 9.127891479976594, + "rewards/rejected": -7.344094957624163, + "step": 1876 + }, + { + "epoch": 0.6929075723316875, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 2.2169516877443487e-06, + "logits/chosen": 1972519936.0, + "logits/rejected": 2291934720.0, + "logps/chosen": -333.4064025878906, + "logps/rejected": -458.2525634765625, + "loss": 0.1921, + "rewards/chosen": 0.5849500894546509, + "rewards/margins": 9.045790314674377, + "rewards/rejected": -8.460840225219727, + "step": 1877 + }, + { + "epoch": 0.6932767292695307, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.2120642313540906e-06, + "logits/chosen": 1925660672.0, + "logits/rejected": 1576332950.5882354, + "logps/chosen": -289.5315755208333, + "logps/rejected": -493.17170266544116, + "loss": 0.1023, + "rewards/chosen": 1.8047245025634766, + "rewards/margins": 9.993597793579102, + "rewards/rejected": -8.188873291015625, + "step": 1878 + }, + { + "epoch": 0.6936458862073739, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.207180637613421e-06, + "logits/chosen": 2256676278.857143, + "logits/rejected": 2766745827.5555553, + "logps/chosen": -249.97063337053572, + "logps/rejected": -528.5993923611111, + "loss": 0.1418, + "rewards/chosen": 1.7629266466413225, + "rewards/margins": 9.658927826654343, + "rewards/rejected": -7.8960011800130205, + "step": 1879 + }, + { + "epoch": 0.6940150431452171, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.202300913288494e-06, + "logits/chosen": 1400941312.0, + "logits/rejected": 1906510028.8, + "logps/chosen": -340.58412679036456, + "logps/rejected": -420.857470703125, + "loss": 0.0957, + "rewards/chosen": 1.4305424690246582, + "rewards/margins": 8.944724750518798, + "rewards/rejected": -7.514182281494141, + "step": 1880 + }, + { + "epoch": 0.6943842000830603, + "grad_norm": 12.5, + "kl": 0.09846115112304688, + "learning_rate": 2.197425065140107e-06, + "logits/chosen": 1479614008.8888888, + "logits/rejected": 1758102966.857143, + "logps/chosen": -223.58641221788196, + "logps/rejected": -412.44168526785717, + "loss": 0.185, + "rewards/chosen": 1.2820663452148438, + "rewards/margins": 8.536558968680247, + "rewards/rejected": -7.254492623465402, + "step": 1881 + }, + { + "epoch": 0.6947533570209035, + "grad_norm": 13.625, + "kl": 0.11667346954345703, + "learning_rate": 2.1925530999236875e-06, + "logits/chosen": 2112605388.8, + "logits/rejected": 2510091203.7647057, + "logps/chosen": -341.1023763020833, + "logps/rejected": -471.14869600183823, + "loss": 0.1505, + "rewards/chosen": 1.478588612874349, + "rewards/margins": 8.908110061346315, + "rewards/rejected": -7.4295214484719665, + "step": 1882 + }, + { + "epoch": 0.6951225139587467, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 2.1876850243892787e-06, + "logits/chosen": 1589485999.1578948, + "logits/rejected": 2109097038.7692308, + "logps/chosen": -269.87913754111844, + "logps/rejected": -425.1867487980769, + "loss": 0.1276, + "rewards/chosen": 1.847877903988487, + "rewards/margins": 9.74156053056601, + "rewards/rejected": -7.893682626577524, + "step": 1883 + }, + { + "epoch": 0.6954916708965899, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.182820845281538e-06, + "logits/chosen": 1456064819.2, + "logits/rejected": 1322483200.0, + "logps/chosen": -225.683447265625, + "logps/rejected": -412.6587727864583, + "loss": 0.1799, + "rewards/chosen": 1.534384059906006, + "rewards/margins": 9.885791428883872, + "rewards/rejected": -8.351407368977865, + "step": 1884 + }, + { + "epoch": 0.6958608278344331, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 2.1779605693397264e-06, + "logits/chosen": 1591956086.1538463, + "logits/rejected": 1660338176.0, + "logps/chosen": -199.99547400841345, + "logps/rejected": -407.19384765625, + "loss": 0.1115, + "rewards/chosen": 1.7219452491173377, + "rewards/margins": 9.653917552005908, + "rewards/rejected": -7.931972302888569, + "step": 1885 + }, + { + "epoch": 0.6962299847722763, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 2.1731042032976903e-06, + "logits/chosen": 1911606613.3333333, + "logits/rejected": 2018521380.5714285, + "logps/chosen": -340.94707573784723, + "logps/rejected": -389.50697544642856, + "loss": 0.1821, + "rewards/chosen": 1.033693101671007, + "rewards/margins": 7.973703717428541, + "rewards/rejected": -6.940010615757534, + "step": 1886 + }, + { + "epoch": 0.6965991417101195, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.1682517538838648e-06, + "logits/chosen": 1992220086.857143, + "logits/rejected": 1637858190.2222223, + "logps/chosen": -300.9009486607143, + "logps/rejected": -354.62456597222223, + "loss": 0.1386, + "rewards/chosen": 1.2108236040387834, + "rewards/margins": 7.134843311612569, + "rewards/rejected": -5.924019707573785, + "step": 1887 + }, + { + "epoch": 0.6969682986479627, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 2.1634032278212597e-06, + "logits/chosen": 1752878830.9333334, + "logits/rejected": 1892052992.0, + "logps/chosen": -280.23896484375, + "logps/rejected": -390.1344784007353, + "loss": 0.1517, + "rewards/chosen": 1.23019650777181, + "rewards/margins": 8.16774632323022, + "rewards/rejected": -6.93754981545841, + "step": 1888 + }, + { + "epoch": 0.697337455585806, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.1585586318274423e-06, + "logits/chosen": 1318021688.8888888, + "logits/rejected": 1362137673.142857, + "logps/chosen": -219.36673990885416, + "logps/rejected": -537.4291294642857, + "loss": 0.1626, + "rewards/chosen": 1.2826719284057617, + "rewards/margins": 11.069444520132881, + "rewards/rejected": -9.78677259172712, + "step": 1889 + }, + { + "epoch": 0.6977066125236491, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 2.1537179726145395e-06, + "logits/chosen": 2038681600.0, + "logits/rejected": 1719642219.7894738, + "logps/chosen": -315.2042893629808, + "logps/rejected": -274.4374486019737, + "loss": 0.129, + "rewards/chosen": 1.50281128516564, + "rewards/margins": 7.149033851469094, + "rewards/rejected": -5.646222566303454, + "step": 1890 + }, + { + "epoch": 0.6980757694614923, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 2.1488812568892263e-06, + "logits/chosen": 1586156088.8888888, + "logits/rejected": 1418195382.857143, + "logps/chosen": -333.17323133680554, + "logps/rejected": -466.0956333705357, + "loss": 0.1535, + "rewards/chosen": 1.260075569152832, + "rewards/margins": 10.405581201825823, + "rewards/rejected": -9.145505632672991, + "step": 1891 + }, + { + "epoch": 0.6984449263993355, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 2.1440484913527066e-06, + "logits/chosen": 1733926092.8, + "logits/rejected": 1651784523.2941177, + "logps/chosen": -284.1832682291667, + "logps/rejected": -548.9617417279412, + "loss": 0.1531, + "rewards/chosen": 1.169210433959961, + "rewards/margins": 9.932644451365752, + "rewards/rejected": -8.76343401740579, + "step": 1892 + }, + { + "epoch": 0.6988140833371788, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 2.1392196827007193e-06, + "logits/chosen": 1669742136.8888888, + "logits/rejected": 2175218834.285714, + "logps/chosen": -217.19047037760416, + "logps/rejected": -666.4724469866071, + "loss": 0.152, + "rewards/chosen": 1.4164294136895075, + "rewards/margins": 10.599738696264843, + "rewards/rejected": -9.183309282575335, + "step": 1893 + }, + { + "epoch": 0.6991832402750219, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 2.1343948376235146e-06, + "logits/chosen": 1656357156.5714285, + "logits/rejected": 1435724686.2222223, + "logps/chosen": -350.31912667410717, + "logps/rejected": -405.4563259548611, + "loss": 0.0969, + "rewards/chosen": 1.9493780136108398, + "rewards/margins": 9.898159344991047, + "rewards/rejected": -7.948781331380208, + "step": 1894 + }, + { + "epoch": 0.6995523972128651, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 2.1295739628058567e-06, + "logits/chosen": 1657913002.6666667, + "logits/rejected": 1286941900.8, + "logps/chosen": -356.0221761067708, + "logps/rejected": -473.81650390625, + "loss": 0.1531, + "rewards/chosen": 0.5884482065836588, + "rewards/margins": 9.396152369181314, + "rewards/rejected": -8.807704162597656, + "step": 1895 + }, + { + "epoch": 0.6999215541507083, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.1247570649270027e-06, + "logits/chosen": 2171823872.0, + "logits/rejected": 2134447104.0, + "logps/chosen": -274.44744873046875, + "logps/rejected": -396.210205078125, + "loss": 0.1654, + "rewards/chosen": 1.4456597566604614, + "rewards/margins": 8.261679768562317, + "rewards/rejected": -6.8160200119018555, + "step": 1896 + }, + { + "epoch": 0.7002907110885516, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 2.119944150660706e-06, + "logits/chosen": 1654459943.3846154, + "logits/rejected": 1246348988.631579, + "logps/chosen": -313.78699669471155, + "logps/rejected": -476.9000308388158, + "loss": 0.0936, + "rewards/chosen": 1.9036068549522986, + "rewards/margins": 10.151361403677628, + "rewards/rejected": -8.247754548725329, + "step": 1897 + }, + { + "epoch": 0.7006598680263947, + "grad_norm": 14.0, + "kl": 0.16172218322753906, + "learning_rate": 2.1151352266751996e-06, + "logits/chosen": 1716693333.3333333, + "logits/rejected": 1476853467.4285715, + "logps/chosen": -286.23643663194446, + "logps/rejected": -505.4270717075893, + "loss": 0.1705, + "rewards/chosen": 1.2921305762396917, + "rewards/margins": 9.423718013460675, + "rewards/rejected": -8.131587437220983, + "step": 1898 + }, + { + "epoch": 0.7010290249642379, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 2.1103302996331832e-06, + "logits/chosen": 1696733476.5714285, + "logits/rejected": 1702422983.1111112, + "logps/chosen": -264.6835239955357, + "logps/rejected": -536.92919921875, + "loss": 0.1689, + "rewards/chosen": 0.7013754844665527, + "rewards/margins": 8.668458885616726, + "rewards/rejected": -7.967083401150173, + "step": 1899 + }, + { + "epoch": 0.7013981819020811, + "grad_norm": 14.0, + "kl": 0.2617521286010742, + "learning_rate": 2.105529376191824e-06, + "logits/chosen": 1652546218.6666667, + "logits/rejected": 2057999540.7058823, + "logps/chosen": -371.07138671875, + "logps/rejected": -544.0606617647059, + "loss": 0.1668, + "rewards/chosen": 0.985341199239095, + "rewards/margins": 10.418890968023561, + "rewards/rejected": -9.433549768784466, + "step": 1900 + }, + { + "epoch": 0.7017673388399244, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 2.1007324630027416e-06, + "logits/chosen": 1790270259.2, + "logits/rejected": 1870257212.235294, + "logps/chosen": -291.78294270833334, + "logps/rejected": -504.85472196691177, + "loss": 0.1468, + "rewards/chosen": 1.140659713745117, + "rewards/margins": 7.371274768604952, + "rewards/rejected": -6.230615054859834, + "step": 1901 + }, + { + "epoch": 0.7021364957777675, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 2.0959395667119946e-06, + "logits/chosen": 1825670272.0, + "logits/rejected": 1857402880.0, + "logps/chosen": -378.6161193847656, + "logps/rejected": -435.9293518066406, + "loss": 0.1509, + "rewards/chosen": 1.1476335525512695, + "rewards/margins": 8.153486728668213, + "rewards/rejected": -7.005853176116943, + "step": 1902 + }, + { + "epoch": 0.7025056527156107, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 2.091150693960083e-06, + "logits/chosen": 1793436250.3529413, + "logits/rejected": 1672517768.5333333, + "logps/chosen": -272.1263212316176, + "logps/rejected": -440.91100260416664, + "loss": 0.1649, + "rewards/chosen": 1.0941445967730354, + "rewards/margins": 8.619170028088138, + "rewards/rejected": -7.525025431315104, + "step": 1903 + }, + { + "epoch": 0.702874809653454, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 2.0863658513819296e-06, + "logits/chosen": 1435449344.0, + "logits/rejected": 1662761283.368421, + "logps/chosen": -342.20688100961536, + "logps/rejected": -591.3154296875, + "loss": 0.1459, + "rewards/chosen": 0.8898434272179236, + "rewards/margins": 11.045775401930095, + "rewards/rejected": -10.155931974712171, + "step": 1904 + }, + { + "epoch": 0.7032439665912972, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 2.0815850456068703e-06, + "logits/chosen": 1414495817.142857, + "logits/rejected": 1939060963.5555556, + "logps/chosen": -300.76548549107144, + "logps/rejected": -364.98084852430554, + "loss": 0.1555, + "rewards/chosen": 1.001375947679792, + "rewards/margins": 7.7880924088614325, + "rewards/rejected": -6.786716461181641, + "step": 1905 + }, + { + "epoch": 0.7036131235291403, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 2.0768082832586524e-06, + "logits/chosen": 2437488230.4, + "logits/rejected": 1745021610.6666667, + "logps/chosen": -286.238671875, + "logps/rejected": -468.1820475260417, + "loss": 0.1165, + "rewards/chosen": 1.8910602569580077, + "rewards/margins": 9.540869776407877, + "rewards/rejected": -7.64980951944987, + "step": 1906 + }, + { + "epoch": 0.7039822804669835, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 2.072035570955421e-06, + "logits/chosen": 2057695547.0769231, + "logits/rejected": 2149988244.2105265, + "logps/chosen": -302.2595778245192, + "logps/rejected": -508.1533717105263, + "loss": 0.1407, + "rewards/chosen": 0.8845767974853516, + "rewards/margins": 10.58692681161981, + "rewards/rejected": -9.702350014134458, + "step": 1907 + }, + { + "epoch": 0.7043514374048268, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 2.067266915309704e-06, + "logits/chosen": 2132096341.3333333, + "logits/rejected": 2309416345.6, + "logps/chosen": -267.05731201171875, + "logps/rejected": -532.04599609375, + "loss": 0.1275, + "rewards/chosen": 1.1825052897135417, + "rewards/margins": 8.603927866617838, + "rewards/rejected": -7.4214225769042965, + "step": 1908 + }, + { + "epoch": 0.70472059434267, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 2.062502322928417e-06, + "logits/chosen": 1818644616.5333333, + "logits/rejected": 2401961863.529412, + "logps/chosen": -271.43004557291664, + "logps/rejected": -492.32278262867646, + "loss": 0.1222, + "rewards/chosen": 1.6339256286621093, + "rewards/margins": 9.716375148997587, + "rewards/rejected": -8.082449520335478, + "step": 1909 + }, + { + "epoch": 0.7050897512805131, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 2.057741800412844e-06, + "logits/chosen": 1640614980.2666667, + "logits/rejected": 1614905223.5294118, + "logps/chosen": -308.77805989583334, + "logps/rejected": -348.58312270220586, + "loss": 0.1704, + "rewards/chosen": 0.8724853515625, + "rewards/margins": 8.667066147748162, + "rewards/rejected": -7.794580796185662, + "step": 1910 + }, + { + "epoch": 0.7054589082183563, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 2.052985354358622e-06, + "logits/chosen": 2263245255.111111, + "logits/rejected": 1640947533.9130435, + "logps/chosen": -387.05560980902777, + "logps/rejected": -398.48445991847825, + "loss": 0.0784, + "rewards/chosen": 1.6075640784369574, + "rewards/margins": 8.718190299140083, + "rewards/rejected": -7.110626220703125, + "step": 1911 + }, + { + "epoch": 0.7058280651561996, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 2.04823299135575e-06, + "logits/chosen": 1558276864.0, + "logits/rejected": 1677312128.0, + "logps/chosen": -332.3336486816406, + "logps/rejected": -475.0021667480469, + "loss": 0.1038, + "rewards/chosen": 1.9191839694976807, + "rewards/margins": 11.742841958999634, + "rewards/rejected": -9.823657989501953, + "step": 1912 + }, + { + "epoch": 0.7061972220940427, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.0434847179885687e-06, + "logits/chosen": 1552974643.2, + "logits/rejected": 1361277952.0, + "logps/chosen": -218.5765869140625, + "logps/rejected": -367.0594889322917, + "loss": 0.1549, + "rewards/chosen": 1.6129720687866211, + "rewards/margins": 8.593300946553548, + "rewards/rejected": -6.980328877766927, + "step": 1913 + }, + { + "epoch": 0.7065663790318859, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 2.0387405408357464e-06, + "logits/chosen": 1574236267.7894738, + "logits/rejected": 1482231650.4615386, + "logps/chosen": -254.90041632401315, + "logps/rejected": -492.60366586538464, + "loss": 0.1728, + "rewards/chosen": 1.2843106922350431, + "rewards/margins": 9.178459298755476, + "rewards/rejected": -7.8941486065204325, + "step": 1914 + }, + { + "epoch": 0.7069355359697291, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 2.034000466470283e-06, + "logits/chosen": 2036793990.7368422, + "logits/rejected": 2300653568.0, + "logps/chosen": -183.27181846217104, + "logps/rejected": -477.9987605168269, + "loss": 0.1649, + "rewards/chosen": 1.3297780689440275, + "rewards/margins": 8.585768317404064, + "rewards/rejected": -7.255990248460036, + "step": 1915 + }, + { + "epoch": 0.7073046929075724, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.0292645014594917e-06, + "logits/chosen": 1460736000.0, + "logits/rejected": 1417178112.0, + "logps/chosen": -207.85072544642858, + "logps/rejected": -424.74793836805554, + "loss": 0.1379, + "rewards/chosen": 1.1513370786394392, + "rewards/margins": 9.365322945609925, + "rewards/rejected": -8.213985866970486, + "step": 1916 + }, + { + "epoch": 0.7076738498454155, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 2.0245326523649896e-06, + "logits/chosen": 1773496966.7368422, + "logits/rejected": 1255198798.7692308, + "logps/chosen": -287.7201377467105, + "logps/rejected": -454.7179612379808, + "loss": 0.1603, + "rewards/chosen": 1.6773944653962787, + "rewards/margins": 9.07902574732236, + "rewards/rejected": -7.401631281926082, + "step": 1917 + }, + { + "epoch": 0.7080430067832587, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.0198049257426943e-06, + "logits/chosen": 2290179510.857143, + "logits/rejected": 1690197333.3333333, + "logps/chosen": -314.53763253348217, + "logps/rejected": -469.83197699652777, + "loss": 0.1428, + "rewards/chosen": 1.454674448285784, + "rewards/margins": 9.241072594173371, + "rewards/rejected": -7.786398145887587, + "step": 1918 + }, + { + "epoch": 0.7084121637211019, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 2.015081328142813e-06, + "logits/chosen": 1419544234.6666667, + "logits/rejected": 1735777572.5714285, + "logps/chosen": -296.6409505208333, + "logps/rejected": -806.8405412946429, + "loss": 0.1462, + "rewards/chosen": 1.5860153834025066, + "rewards/margins": 44.3154334567842, + "rewards/rejected": -42.729418073381694, + "step": 1919 + }, + { + "epoch": 0.7087813206589452, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 2.0103618661098274e-06, + "logits/chosen": 2056919401.4117646, + "logits/rejected": 2041047995.7333333, + "logps/chosen": -286.4535271139706, + "logps/rejected": -475.4564453125, + "loss": 0.1646, + "rewards/chosen": 1.2640297833610983, + "rewards/margins": 11.918227984858493, + "rewards/rejected": -10.654198201497396, + "step": 1920 + }, + { + "epoch": 0.7091504775967883, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 2.0056465461824932e-06, + "logits/chosen": 2317811858.285714, + "logits/rejected": 1654886172.4444444, + "logps/chosen": -378.30172293526783, + "logps/rejected": -463.59532335069446, + "loss": 0.1491, + "rewards/chosen": 0.9713711057390485, + "rewards/margins": 8.253738388182626, + "rewards/rejected": -7.282367282443577, + "step": 1921 + }, + { + "epoch": 0.7095196345346315, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 2.0009353748938277e-06, + "logits/chosen": 1700974807.5789473, + "logits/rejected": 1813095502.7692308, + "logps/chosen": -279.61256167763156, + "logps/rejected": -441.0225360576923, + "loss": 0.2042, + "rewards/chosen": 0.9066262496145148, + "rewards/margins": 7.862616380699251, + "rewards/rejected": -6.955990131084736, + "step": 1922 + }, + { + "epoch": 0.7098887914724747, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.9962283587710962e-06, + "logits/chosen": 2050765209.6, + "logits/rejected": 1758490744.4705882, + "logps/chosen": -246.40986328125, + "logps/rejected": -465.47883157169116, + "loss": 0.1595, + "rewards/chosen": 1.1059247334798177, + "rewards/margins": 8.774218405929266, + "rewards/rejected": -7.668293672449448, + "step": 1923 + }, + { + "epoch": 0.710257948410318, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 1.9915255043358105e-06, + "logits/chosen": 2148307712.0, + "logits/rejected": 2098973696.0, + "logps/chosen": -362.0143737792969, + "logps/rejected": -425.70147705078125, + "loss": 0.1345, + "rewards/chosen": 1.4800760746002197, + "rewards/margins": 8.72295880317688, + "rewards/rejected": -7.24288272857666, + "step": 1924 + }, + { + "epoch": 0.7106271053481611, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.9868268181037186e-06, + "logits/chosen": 1863070558.3157895, + "logits/rejected": 1818719153.2307692, + "logps/chosen": -277.6617495888158, + "logps/rejected": -545.0937124399038, + "loss": 0.1734, + "rewards/chosen": 1.2317462720369037, + "rewards/margins": 9.815220064479814, + "rewards/rejected": -8.58347379244291, + "step": 1925 + }, + { + "epoch": 0.7109962622860043, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 1.9821323065847866e-06, + "logits/chosen": 1378512018.2857144, + "logits/rejected": 1600150072.8888888, + "logps/chosen": -247.19184221540178, + "logps/rejected": -412.59288194444446, + "loss": 0.1365, + "rewards/chosen": 1.6329451969691686, + "rewards/margins": 8.06268398345463, + "rewards/rejected": -6.42973878648546, + "step": 1926 + }, + { + "epoch": 0.7113654192238476, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 1.9774419762832035e-06, + "logits/chosen": 1608499042.4615386, + "logits/rejected": 1297378142.3157895, + "logps/chosen": -225.78560697115384, + "logps/rejected": -400.9488589638158, + "loss": 0.1572, + "rewards/chosen": 0.9178163088285006, + "rewards/margins": 9.081232318028748, + "rewards/rejected": -8.163416009200247, + "step": 1927 + }, + { + "epoch": 0.7117345761616908, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 1.9727558336973594e-06, + "logits/chosen": 2301651236.571429, + "logits/rejected": 1689635043.5555556, + "logps/chosen": -286.926025390625, + "logps/rejected": -518.5090060763889, + "loss": 0.0876, + "rewards/chosen": 1.873828342982701, + "rewards/margins": 9.742938450404576, + "rewards/rejected": -7.869110107421875, + "step": 1928 + }, + { + "epoch": 0.7121037330995339, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 1.968073885319849e-06, + "logits/chosen": 1610965594.3529413, + "logits/rejected": 2143219302.4, + "logps/chosen": -332.7050206801471, + "logps/rejected": -641.8518229166667, + "loss": 0.123, + "rewards/chosen": 1.6709298526539522, + "rewards/margins": 12.88289986404718, + "rewards/rejected": -11.211970011393229, + "step": 1929 + }, + { + "epoch": 0.7124728900373771, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.963396137637448e-06, + "logits/chosen": 1810259727.0588236, + "logits/rejected": 1478793079.4666667, + "logps/chosen": -251.48150275735293, + "logps/rejected": -504.90966796875, + "loss": 0.149, + "rewards/chosen": 1.4181889926686007, + "rewards/margins": 9.8518071567311, + "rewards/rejected": -8.4336181640625, + "step": 1930 + }, + { + "epoch": 0.7128420469752204, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.958722597131119e-06, + "logits/chosen": 1786585497.6, + "logits/rejected": 1719712225.8823528, + "logps/chosen": -269.5969563802083, + "logps/rejected": -323.3721564797794, + "loss": 0.1306, + "rewards/chosen": 1.3635360717773437, + "rewards/margins": 8.099677052217372, + "rewards/rejected": -6.736140980440028, + "step": 1931 + }, + { + "epoch": 0.7132112039130636, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 1.9540532702759944e-06, + "logits/chosen": 1641085337.6, + "logits/rejected": 1685637601.8823528, + "logps/chosen": -287.16741536458335, + "logps/rejected": -476.3626493566176, + "loss": 0.1153, + "rewards/chosen": 1.9814954121907553, + "rewards/margins": 10.127055284088733, + "rewards/rejected": -8.145559871897978, + "step": 1932 + }, + { + "epoch": 0.7135803608509067, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 1.949388163541364e-06, + "logits/chosen": 2037827704.4705882, + "logits/rejected": 2080951910.4, + "logps/chosen": -233.10085880055146, + "logps/rejected": -444.0278645833333, + "loss": 0.1376, + "rewards/chosen": 1.9236059749827665, + "rewards/margins": 8.631775530646829, + "rewards/rejected": -6.708169555664062, + "step": 1933 + }, + { + "epoch": 0.7139495177887499, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.944727283390675e-06, + "logits/chosen": 2300319864.470588, + "logits/rejected": 2813908036.266667, + "logps/chosen": -284.33547794117646, + "logps/rejected": -529.2115234375, + "loss": 0.152, + "rewards/chosen": 1.5343275631175322, + "rewards/margins": 10.294247810513365, + "rewards/rejected": -8.759920247395833, + "step": 1934 + }, + { + "epoch": 0.7143186747265932, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 1.9400706362815195e-06, + "logits/chosen": 953342634.6666666, + "logits/rejected": 1194313914.1818182, + "logps/chosen": -241.8696754092262, + "logps/rejected": -429.70414595170456, + "loss": 0.1541, + "rewards/chosen": 1.9226666405087425, + "rewards/margins": 9.707560716769397, + "rewards/rejected": -7.784894076260653, + "step": 1935 + }, + { + "epoch": 0.7146878316644364, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 1.9354182286656203e-06, + "logits/chosen": 1802902072.8888888, + "logits/rejected": 1575879826.2857144, + "logps/chosen": -318.8416341145833, + "logps/rejected": -364.26827566964283, + "loss": 0.2316, + "rewards/chosen": 0.8005232281155057, + "rewards/margins": 7.999288029140896, + "rewards/rejected": -7.198764801025391, + "step": 1936 + }, + { + "epoch": 0.7150569886022795, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.9307700669888303e-06, + "logits/chosen": 1613121415.5294118, + "logits/rejected": 1697954065.0666666, + "logps/chosen": -286.9732306985294, + "logps/rejected": -539.4375651041667, + "loss": 0.1365, + "rewards/chosen": 1.7996118208941292, + "rewards/margins": 10.583148604748295, + "rewards/rejected": -8.783536783854167, + "step": 1937 + }, + { + "epoch": 0.7154261455401227, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.9261261576911196e-06, + "logits/chosen": 2106319985.7777777, + "logits/rejected": 1747999597.7142856, + "logps/chosen": -220.62504069010416, + "logps/rejected": -481.69771902901783, + "loss": 0.1549, + "rewards/chosen": 1.358289400736491, + "rewards/margins": 11.131718181428454, + "rewards/rejected": -9.773428780691964, + "step": 1938 + }, + { + "epoch": 0.715795302477966, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.921486507206562e-06, + "logits/chosen": 1894059068.235294, + "logits/rejected": 1670049382.4, + "logps/chosen": -276.94019990808823, + "logps/rejected": -460.56272786458334, + "loss": 0.1437, + "rewards/chosen": 1.3763928132898666, + "rewards/margins": 10.091425772274242, + "rewards/rejected": -8.715032958984375, + "step": 1939 + }, + { + "epoch": 0.7161644594158092, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 1.916851121963337e-06, + "logits/chosen": 1679232377.2631578, + "logits/rejected": 1650773543.3846154, + "logps/chosen": -314.20078638980266, + "logps/rejected": -480.4426457331731, + "loss": 0.1747, + "rewards/chosen": 1.1818823563425165, + "rewards/margins": 8.916722162532421, + "rewards/rejected": -7.734839806189904, + "step": 1940 + }, + { + "epoch": 0.7165336163536523, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.9122200083837134e-06, + "logits/chosen": 2269000310.1538463, + "logits/rejected": 1888344171.7894738, + "logps/chosen": -437.8488957331731, + "logps/rejected": -590.7824835526316, + "loss": 0.0963, + "rewards/chosen": 1.8094172844519982, + "rewards/margins": 10.737810196664169, + "rewards/rejected": -8.928392912212171, + "step": 1941 + }, + { + "epoch": 0.7169027732914955, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 1.907593172884037e-06, + "logits/chosen": 2332334710.1538463, + "logits/rejected": 2097592643.368421, + "logps/chosen": -256.2663762019231, + "logps/rejected": -404.2579409950658, + "loss": 0.1511, + "rewards/chosen": 0.8979165003849909, + "rewards/margins": 8.76681940951328, + "rewards/rejected": -7.868902909128289, + "step": 1942 + }, + { + "epoch": 0.7172719302293388, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.9029706218747302e-06, + "logits/chosen": 1897584913.0666666, + "logits/rejected": 2137610601.4117646, + "logps/chosen": -244.54768880208334, + "logps/rejected": -445.67443129595586, + "loss": 0.1694, + "rewards/chosen": 1.0271358489990234, + "rewards/margins": 8.182770560769473, + "rewards/rejected": -7.155634711770451, + "step": 1943 + }, + { + "epoch": 0.717641087167182, + "grad_norm": 11.375, + "kl": 1.6294357776641846, + "learning_rate": 1.8983523617602834e-06, + "logits/chosen": 1655840904.5333333, + "logits/rejected": 1624459866.3529413, + "logps/chosen": -345.1699544270833, + "logps/rejected": -521.7017463235294, + "loss": 0.1424, + "rewards/chosen": 1.5841512044270833, + "rewards/margins": 9.186178648705576, + "rewards/rejected": -7.602027444278493, + "step": 1944 + }, + { + "epoch": 0.7180102441050251, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 1.8937383989392294e-06, + "logits/chosen": 1643813546.6666667, + "logits/rejected": 1804966731.2941177, + "logps/chosen": -315.864453125, + "logps/rejected": -592.1216681985294, + "loss": 0.1544, + "rewards/chosen": 0.9769868850708008, + "rewards/margins": 9.040445496054257, + "rewards/rejected": -8.063458610983457, + "step": 1945 + }, + { + "epoch": 0.7183794010428683, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 1.8891287398041591e-06, + "logits/chosen": 2341956608.0, + "logits/rejected": 2444383232.0, + "logps/chosen": -294.866943359375, + "logps/rejected": -509.8988342285156, + "loss": 0.1655, + "rewards/chosen": 1.0673905611038208, + "rewards/margins": 8.549897313117981, + "rewards/rejected": -7.48250675201416, + "step": 1946 + }, + { + "epoch": 0.7187485579807116, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 1.8845233907416987e-06, + "logits/chosen": 1854718598.7368422, + "logits/rejected": 2121042550.1538463, + "logps/chosen": -264.3248355263158, + "logps/rejected": -457.5680588942308, + "loss": 0.1767, + "rewards/chosen": 1.4926294025621916, + "rewards/margins": 9.43411605464302, + "rewards/rejected": -7.94148665208083, + "step": 1947 + }, + { + "epoch": 0.7191177149185547, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 1.8799223581324965e-06, + "logits/chosen": 2182441870.2222223, + "logits/rejected": 2086491428.5714285, + "logps/chosen": -319.7055935329861, + "logps/rejected": -424.88058035714283, + "loss": 0.2121, + "rewards/chosen": 1.1118122736612956, + "rewards/margins": 8.782249132792154, + "rewards/rejected": -7.670436859130859, + "step": 1948 + }, + { + "epoch": 0.7194868718563979, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 1.8753256483512272e-06, + "logits/chosen": 1796765137.4545455, + "logits/rejected": 1717684633.6, + "logps/chosen": -340.73903586647725, + "logps/rejected": -519.499462890625, + "loss": 0.1825, + "rewards/chosen": 1.471113378351385, + "rewards/margins": 9.500176793878728, + "rewards/rejected": -8.029063415527343, + "step": 1949 + }, + { + "epoch": 0.7198560287942412, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.8707332677665752e-06, + "logits/chosen": 1801618733.1764705, + "logits/rejected": 1558627669.3333333, + "logps/chosen": -244.0266544117647, + "logps/rejected": -473.43678385416666, + "loss": 0.1529, + "rewards/chosen": 1.7565014783073873, + "rewards/margins": 10.620112318151136, + "rewards/rejected": -8.86361083984375, + "step": 1950 + }, + { + "epoch": 0.7202251857320844, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 1.866145222741222e-06, + "logits/chosen": 1750420426.1052632, + "logits/rejected": 1559778540.3076923, + "logps/chosen": -265.71761924342104, + "logps/rejected": -476.34825721153845, + "loss": 0.2046, + "rewards/chosen": 1.2228604366904812, + "rewards/margins": 10.121510385984353, + "rewards/rejected": -8.89864994929387, + "step": 1951 + }, + { + "epoch": 0.7205943426699275, + "grad_norm": 13.625, + "kl": 0.16300010681152344, + "learning_rate": 1.8615615196318476e-06, + "logits/chosen": 3335250782.3157897, + "logits/rejected": 1821355716.9230769, + "logps/chosen": -337.06997841282896, + "logps/rejected": -370.25259164663464, + "loss": 0.1649, + "rewards/chosen": 1.2425981822766756, + "rewards/margins": 7.5505481287535385, + "rewards/rejected": -6.307949946476863, + "step": 1952 + }, + { + "epoch": 0.7209634996077707, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 1.8569821647891163e-06, + "logits/chosen": 3016571810.909091, + "logits/rejected": 2088263289.9047618, + "logps/chosen": -251.8709383877841, + "logps/rejected": -515.7583705357143, + "loss": 0.1036, + "rewards/chosen": 1.4255327744917436, + "rewards/margins": 10.05871511847426, + "rewards/rejected": -8.633182343982515, + "step": 1953 + }, + { + "epoch": 0.721332656545614, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.8524071645576613e-06, + "logits/chosen": 1505678056.7272727, + "logits/rejected": 1688193843.2, + "logps/chosen": -265.09614701704544, + "logps/rejected": -488.6462890625, + "loss": 0.1662, + "rewards/chosen": 1.684290799227628, + "rewards/margins": 7.892694958773526, + "rewards/rejected": -6.208404159545898, + "step": 1954 + }, + { + "epoch": 0.7217018134834572, + "grad_norm": 9.4375, + "kl": 0.10736751556396484, + "learning_rate": 1.847836525276091e-06, + "logits/chosen": 2564772181.3333335, + "logits/rejected": 1535913984.0, + "logps/chosen": -317.8386637369792, + "logps/rejected": -517.48359375, + "loss": 0.0919, + "rewards/chosen": 1.9416996637980144, + "rewards/margins": 9.696629015604655, + "rewards/rejected": -7.7549293518066404, + "step": 1955 + }, + { + "epoch": 0.7220709704213003, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 1.8432702532769685e-06, + "logits/chosen": 2254675285.3333335, + "logits/rejected": 2294868845.714286, + "logps/chosen": -289.9012044270833, + "logps/rejected": -469.20445033482144, + "loss": 0.196, + "rewards/chosen": 1.057506455315484, + "rewards/margins": 7.667009520152258, + "rewards/rejected": -6.609503064836774, + "step": 1956 + }, + { + "epoch": 0.7224401273591435, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 1.8387083548868023e-06, + "logits/chosen": 2111328665.6, + "logits/rejected": 1558254080.0, + "logps/chosen": -330.7250732421875, + "logps/rejected": -422.3018391927083, + "loss": 0.2005, + "rewards/chosen": 1.1782587051391602, + "rewards/margins": 10.825766690572104, + "rewards/rejected": -9.647507985432943, + "step": 1957 + }, + { + "epoch": 0.7228092842969868, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.8341508364260469e-06, + "logits/chosen": 1468615137.8823528, + "logits/rejected": 1528532172.8, + "logps/chosen": -259.65455537683823, + "logps/rejected": -463.82041015625, + "loss": 0.1458, + "rewards/chosen": 2.007230422076057, + "rewards/margins": 8.587151381548713, + "rewards/rejected": -6.579920959472656, + "step": 1958 + }, + { + "epoch": 0.72317844123483, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 1.829597704209088e-06, + "logits/chosen": 1602757973.3333333, + "logits/rejected": 1422459611.4285715, + "logps/chosen": -309.1552734375, + "logps/rejected": -432.13710239955356, + "loss": 0.1779, + "rewards/chosen": 1.2044161690606012, + "rewards/margins": 8.572245083157979, + "rewards/rejected": -7.367828914097378, + "step": 1959 + }, + { + "epoch": 0.7235475981726731, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 1.8250489645442283e-06, + "logits/chosen": 2895638983.111111, + "logits/rejected": 2392358473.142857, + "logps/chosen": -310.15858289930554, + "logps/rejected": -448.1923828125, + "loss": 0.1729, + "rewards/chosen": 1.5156158871120877, + "rewards/margins": 7.772471988011921, + "rewards/rejected": -6.2568561008998325, + "step": 1960 + }, + { + "epoch": 0.7239167551105163, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.820504623733692e-06, + "logits/chosen": 2301157616.9411764, + "logits/rejected": 1861658624.0, + "logps/chosen": -287.59426700367646, + "logps/rejected": -416.1691080729167, + "loss": 0.1819, + "rewards/chosen": 1.1785736083984375, + "rewards/margins": 8.121446736653645, + "rewards/rejected": -6.942873128255209, + "step": 1961 + }, + { + "epoch": 0.7242859120483596, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 1.8159646880736036e-06, + "logits/chosen": 2024950897.7777777, + "logits/rejected": 1540329618.2857144, + "logps/chosen": -327.9093967013889, + "logps/rejected": -600.7601143973214, + "loss": 0.1465, + "rewards/chosen": 1.5519801245795355, + "rewards/margins": 10.267729592701745, + "rewards/rejected": -8.71574946812221, + "step": 1962 + }, + { + "epoch": 0.7246550689862028, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 1.8114291638539883e-06, + "logits/chosen": 1911320360.4210527, + "logits/rejected": 1761113166.7692308, + "logps/chosen": -300.3303865131579, + "logps/rejected": -461.1847956730769, + "loss": 0.1655, + "rewards/chosen": 1.1692043103669818, + "rewards/margins": 8.655910970711032, + "rewards/rejected": -7.48670666034405, + "step": 1963 + }, + { + "epoch": 0.7250242259240459, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.8068980573587547e-06, + "logits/chosen": 1108500864.0, + "logits/rejected": 1744081280.0, + "logps/chosen": -195.7095947265625, + "logps/rejected": -432.3100280761719, + "loss": 0.1649, + "rewards/chosen": 1.1288038492202759, + "rewards/margins": 9.264139771461487, + "rewards/rejected": -8.135335922241211, + "step": 1964 + }, + { + "epoch": 0.7253933828618891, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.8023713748656946e-06, + "logits/chosen": 3105950464.0, + "logits/rejected": 2268626432.0, + "logps/chosen": -238.43032836914062, + "logps/rejected": -570.9690551757812, + "loss": 0.1522, + "rewards/chosen": 1.047446370124817, + "rewards/margins": 9.650784611701965, + "rewards/rejected": -8.603338241577148, + "step": 1965 + }, + { + "epoch": 0.7257625397997324, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 1.7978491226464706e-06, + "logits/chosen": 1981421086.1176472, + "logits/rejected": 1704447863.4666667, + "logps/chosen": -258.20490579044116, + "logps/rejected": -323.4916015625, + "loss": 0.1504, + "rewards/chosen": 1.6247109805836397, + "rewards/margins": 6.885661704867494, + "rewards/rejected": -5.2609507242838545, + "step": 1966 + }, + { + "epoch": 0.7261316967375756, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 1.7933313069666026e-06, + "logits/chosen": 2212240725.3333335, + "logits/rejected": 1686240870.4, + "logps/chosen": -251.3861287434896, + "logps/rejected": -402.7097900390625, + "loss": 0.1305, + "rewards/chosen": 1.073053201039632, + "rewards/margins": 9.363820107777913, + "rewards/rejected": -8.29076690673828, + "step": 1967 + }, + { + "epoch": 0.7265008536754187, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 1.788817934085469e-06, + "logits/chosen": 1091048448.0, + "logits/rejected": 2065439278.5454545, + "logps/chosen": -262.1652587890625, + "logps/rejected": -480.84330610795456, + "loss": 0.1124, + "rewards/chosen": 1.0068857192993164, + "rewards/margins": 9.69625353379683, + "rewards/rejected": -8.689367814497514, + "step": 1968 + }, + { + "epoch": 0.726870010613262, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.784309010256291e-06, + "logits/chosen": 1779878707.2, + "logits/rejected": 1390983770.3529413, + "logps/chosen": -267.0896809895833, + "logps/rejected": -472.68244485294116, + "loss": 0.0945, + "rewards/chosen": 1.7913057963053385, + "rewards/margins": 10.772409132415174, + "rewards/rejected": -8.981103336109834, + "step": 1969 + }, + { + "epoch": 0.7272391675511052, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.7798045417261234e-06, + "logits/chosen": 1638250057.142857, + "logits/rejected": 2132506851.5555556, + "logps/chosen": -349.80859375, + "logps/rejected": -463.2126193576389, + "loss": 0.1573, + "rewards/chosen": 1.1879049028669084, + "rewards/margins": 8.080611879863437, + "rewards/rejected": -6.892706976996528, + "step": 1970 + }, + { + "epoch": 0.7276083244889484, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.7753045347358505e-06, + "logits/chosen": 1563980800.0, + "logits/rejected": 1647994368.0, + "logps/chosen": -214.0360107421875, + "logps/rejected": -481.29803466796875, + "loss": 0.132, + "rewards/chosen": 1.6789591312408447, + "rewards/margins": 8.39842677116394, + "rewards/rejected": -6.719467639923096, + "step": 1971 + }, + { + "epoch": 0.7279774814267915, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 1.7708089955201773e-06, + "logits/chosen": 1548362820.2666667, + "logits/rejected": 1410255209.4117646, + "logps/chosen": -268.935888671875, + "logps/rejected": -417.40696806066177, + "loss": 0.1567, + "rewards/chosen": 1.0627482732137044, + "rewards/margins": 9.516142545961866, + "rewards/rejected": -8.453394272748161, + "step": 1972 + }, + { + "epoch": 0.7283466383646348, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.7663179303076127e-06, + "logits/chosen": 1400535625.142857, + "logits/rejected": 1682018531.5555556, + "logps/chosen": -260.8867710658482, + "logps/rejected": -554.8662109375, + "loss": 0.1622, + "rewards/chosen": 0.840252126966204, + "rewards/margins": 10.732905698201014, + "rewards/rejected": -9.89265357123481, + "step": 1973 + }, + { + "epoch": 0.728715795302478, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.7618313453204723e-06, + "logits/chosen": 1436244104.5333333, + "logits/rejected": 3391752071.529412, + "logps/chosen": -230.3421875, + "logps/rejected": -507.4557674632353, + "loss": 0.1613, + "rewards/chosen": 1.1802563985188803, + "rewards/margins": 9.079345538569433, + "rewards/rejected": -7.899089140050552, + "step": 1974 + }, + { + "epoch": 0.7290849522403212, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.7573492467748636e-06, + "logits/chosen": 2130440601.6, + "logits/rejected": 1857340717.1764705, + "logps/chosen": -202.19210611979167, + "logps/rejected": -424.02326516544116, + "loss": 0.1677, + "rewards/chosen": 1.0567240397135416, + "rewards/margins": 8.55725809733073, + "rewards/rejected": -7.5005340576171875, + "step": 1975 + }, + { + "epoch": 0.7294541091781643, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 1.752871640880675e-06, + "logits/chosen": 2653426176.0, + "logits/rejected": 1987994752.0, + "logps/chosen": -295.0529479980469, + "logps/rejected": -550.6559448242188, + "loss": 0.1122, + "rewards/chosen": 1.8300837278366089, + "rewards/margins": 9.745269656181335, + "rewards/rejected": -7.915185928344727, + "step": 1976 + }, + { + "epoch": 0.7298232661160076, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 1.7483985338415731e-06, + "logits/chosen": 1856091687.3846154, + "logits/rejected": 1684825357.4736843, + "logps/chosen": -275.28675255408655, + "logps/rejected": -427.3822985197368, + "loss": 0.0951, + "rewards/chosen": 1.7936730018028846, + "rewards/margins": 8.83259014175971, + "rewards/rejected": -7.038917139956825, + "step": 1977 + }, + { + "epoch": 0.7301924230538508, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.7439299318549936e-06, + "logits/chosen": 1271976832.0, + "logits/rejected": 1841462400.0, + "logps/chosen": -297.3148193359375, + "logps/rejected": -391.4338073730469, + "loss": 0.1276, + "rewards/chosen": 1.7633472681045532, + "rewards/margins": 7.89594042301178, + "rewards/rejected": -6.132593154907227, + "step": 1978 + }, + { + "epoch": 0.730561579991694, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 1.739465841112125e-06, + "logits/chosen": 2065496320.0, + "logits/rejected": 1578620160.0, + "logps/chosen": -231.57823181152344, + "logps/rejected": -446.24609375, + "loss": 0.1351, + "rewards/chosen": 1.5963077545166016, + "rewards/margins": 8.9649019241333, + "rewards/rejected": -7.368594169616699, + "step": 1979 + }, + { + "epoch": 0.7309307369295371, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 1.7350062677979075e-06, + "logits/chosen": 1290629601.8823528, + "logits/rejected": 1801100902.4, + "logps/chosen": -270.06310317095586, + "logps/rejected": -413.9748046875, + "loss": 0.1197, + "rewards/chosen": 1.798466401941636, + "rewards/margins": 9.442689933028875, + "rewards/rejected": -7.644223531087239, + "step": 1980 + }, + { + "epoch": 0.7312998938673804, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 1.7305512180910244e-06, + "logits/chosen": 1460973568.0, + "logits/rejected": 2414173952.0, + "logps/chosen": -225.66567993164062, + "logps/rejected": -438.7323913574219, + "loss": 0.148, + "rewards/chosen": 1.3848485946655273, + "rewards/margins": 9.28318452835083, + "rewards/rejected": -7.898335933685303, + "step": 1981 + }, + { + "epoch": 0.7316690508052236, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 1.726100698163893e-06, + "logits/chosen": 1643839488.0, + "logits/rejected": 2188574105.6, + "logps/chosen": -276.30908203125, + "logps/rejected": -538.66064453125, + "loss": 0.1502, + "rewards/chosen": 0.6614227692286173, + "rewards/margins": 9.97179721991221, + "rewards/rejected": -9.310374450683593, + "step": 1982 + }, + { + "epoch": 0.7320382077430668, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 1.7216547141826472e-06, + "logits/chosen": 1538422237.8666666, + "logits/rejected": 1472822452.7058823, + "logps/chosen": -280.83046875, + "logps/rejected": -536.107421875, + "loss": 0.1438, + "rewards/chosen": 1.5416072845458983, + "rewards/margins": 9.715738790175495, + "rewards/rejected": -8.174131505629596, + "step": 1983 + }, + { + "epoch": 0.73240736468091, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.7172132723071444e-06, + "logits/chosen": 1572798610.2857144, + "logits/rejected": 2665436501.3333335, + "logps/chosen": -294.79115513392856, + "logps/rejected": -548.3828125, + "loss": 0.148, + "rewards/chosen": 1.2043771743774414, + "rewards/margins": 9.734153005811903, + "rewards/rejected": -8.529775831434462, + "step": 1984 + }, + { + "epoch": 0.7327765216187532, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 1.7127763786909474e-06, + "logits/chosen": 1569829956.2666667, + "logits/rejected": 1748294716.235294, + "logps/chosen": -288.2527669270833, + "logps/rejected": -486.24626608455884, + "loss": 0.1083, + "rewards/chosen": 1.6621631622314452, + "rewards/margins": 10.251312188541188, + "rewards/rejected": -8.589149026309743, + "step": 1985 + }, + { + "epoch": 0.7331456785565964, + "grad_norm": 11.75, + "kl": 0.5053348541259766, + "learning_rate": 1.7083440394813116e-06, + "logits/chosen": 2612277833.142857, + "logits/rejected": 1866176739.5555556, + "logps/chosen": -302.57041713169644, + "logps/rejected": -551.5469835069445, + "loss": 0.1349, + "rewards/chosen": 1.3359718322753906, + "rewards/margins": 9.616308000352648, + "rewards/rejected": -8.280336168077257, + "step": 1986 + }, + { + "epoch": 0.7335148354944395, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 1.7039162608191895e-06, + "logits/chosen": 1569158736.8421052, + "logits/rejected": 1438770727.3846154, + "logps/chosen": -295.4756630345395, + "logps/rejected": -498.9400165264423, + "loss": 0.1779, + "rewards/chosen": 1.285874316566869, + "rewards/margins": 11.02053043041152, + "rewards/rejected": -9.734656113844652, + "step": 1987 + }, + { + "epoch": 0.7338839924322828, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 1.6994930488392135e-06, + "logits/chosen": 1582624621.7142856, + "logits/rejected": 1540500707.5555556, + "logps/chosen": -309.56815011160717, + "logps/rejected": -475.3017578125, + "loss": 0.1427, + "rewards/chosen": 1.300595419747489, + "rewards/margins": 8.643313756064764, + "rewards/rejected": -7.342718336317274, + "step": 1988 + }, + { + "epoch": 0.734253149370126, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.6950744096696843e-06, + "logits/chosen": 2603780352.0, + "logits/rejected": 1597924864.0, + "logps/chosen": -322.58282470703125, + "logps/rejected": -397.26483154296875, + "loss": 0.1333, + "rewards/chosen": 1.5857160091400146, + "rewards/margins": 8.600787878036499, + "rewards/rejected": -7.015071868896484, + "step": 1989 + }, + { + "epoch": 0.7346223063079692, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 1.690660349432573e-06, + "logits/chosen": 1862380657.7777777, + "logits/rejected": 1625645933.7142856, + "logps/chosen": -264.57904730902777, + "logps/rejected": -437.2371303013393, + "loss": 0.2125, + "rewards/chosen": 0.7193364567226834, + "rewards/margins": 8.869306920066713, + "rewards/rejected": -8.14997046334403, + "step": 1990 + }, + { + "epoch": 0.7349914632458123, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 1.6862508742435014e-06, + "logits/chosen": 2167651388.2352943, + "logits/rejected": 1732830549.3333333, + "logps/chosen": -324.41779641544116, + "logps/rejected": -466.04938151041665, + "loss": 0.162, + "rewards/chosen": 1.1615176481359146, + "rewards/margins": 9.714138090844248, + "rewards/rejected": -8.552620442708333, + "step": 1991 + }, + { + "epoch": 0.7353606201836556, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 1.6818459902117429e-06, + "logits/chosen": 1851232142.2222223, + "logits/rejected": 1706098541.7142856, + "logps/chosen": -240.57893880208334, + "logps/rejected": -579.1620396205357, + "loss": 0.1604, + "rewards/chosen": 1.4452656639946833, + "rewards/margins": 9.546194924248589, + "rewards/rejected": -8.100929260253906, + "step": 1992 + }, + { + "epoch": 0.7357297771214988, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 1.6774457034402097e-06, + "logits/chosen": 1523688789.3333333, + "logits/rejected": 2261735862.857143, + "logps/chosen": -295.7059733072917, + "logps/rejected": -440.19248744419644, + "loss": 0.1774, + "rewards/chosen": 1.396969371371799, + "rewards/margins": 9.62642777155316, + "rewards/rejected": -8.229458400181361, + "step": 1993 + }, + { + "epoch": 0.736098934059342, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.67305002002544e-06, + "logits/chosen": 1685844260.5714285, + "logits/rejected": 1479366883.5555556, + "logps/chosen": -272.11648995535717, + "logps/rejected": -536.4620225694445, + "loss": 0.1297, + "rewards/chosen": 1.1988229751586914, + "rewards/margins": 10.012544525994194, + "rewards/rejected": -8.813721550835503, + "step": 1994 + }, + { + "epoch": 0.7364680909971851, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 1.6686589460575992e-06, + "logits/chosen": 2031185920.0, + "logits/rejected": 2010303658.6666667, + "logps/chosen": -265.165576171875, + "logps/rejected": -602.2980550130209, + "loss": 0.1816, + "rewards/chosen": 1.4395484924316406, + "rewards/margins": 10.83706792195638, + "rewards/rejected": -9.39751942952474, + "step": 1995 + }, + { + "epoch": 0.7368372479350284, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.6642724876204658e-06, + "logits/chosen": 1800537673.142857, + "logits/rejected": 2436081208.888889, + "logps/chosen": -380.09451729910717, + "logps/rejected": -569.7732747395834, + "loss": 0.1274, + "rewards/chosen": 1.8291357585362025, + "rewards/margins": 10.603065778338719, + "rewards/rejected": -8.773930019802517, + "step": 1996 + }, + { + "epoch": 0.7372064048728716, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 1.6598906507914214e-06, + "logits/chosen": 1521241088.0, + "logits/rejected": 1121740288.0, + "logps/chosen": -324.784423828125, + "logps/rejected": -531.5628662109375, + "loss": 0.1759, + "rewards/chosen": 0.8331305980682373, + "rewards/margins": 8.79560112953186, + "rewards/rejected": -7.962470531463623, + "step": 1997 + }, + { + "epoch": 0.7375755618107148, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 1.6555134416414426e-06, + "logits/chosen": 2049920682.6666667, + "logits/rejected": 2295576576.0, + "logps/chosen": -348.6402180989583, + "logps/rejected": -395.8947265625, + "loss": 0.0976, + "rewards/chosen": 1.5350672403971355, + "rewards/margins": 9.258140818277994, + "rewards/rejected": -7.723073577880859, + "step": 1998 + }, + { + "epoch": 0.7379447187485579, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 1.6511408662350993e-06, + "logits/chosen": 2031306547.2, + "logits/rejected": 1580439381.3333333, + "logps/chosen": -215.437451171875, + "logps/rejected": -427.0163981119792, + "loss": 0.1877, + "rewards/chosen": 1.4528703689575195, + "rewards/margins": 8.554362297058105, + "rewards/rejected": -7.101491928100586, + "step": 1999 + }, + { + "epoch": 0.7383138756864012, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.6467729306305408e-06, + "logits/chosen": 2088893751.652174, + "logits/rejected": 1765136497.7777777, + "logps/chosen": -260.0537109375, + "logps/rejected": -504.19688585069446, + "loss": 0.1455, + "rewards/chosen": 2.097767042077106, + "rewards/margins": 10.403045396298026, + "rewards/rejected": -8.30527835422092, + "step": 2000 + }, + { + "epoch": 0.7386830326242444, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 1.6424096408794825e-06, + "logits/chosen": 1534491554.909091, + "logits/rejected": 1392271750.0952382, + "logps/chosen": -390.44633345170456, + "logps/rejected": -535.5016276041666, + "loss": 0.1452, + "rewards/chosen": 0.4332693706859242, + "rewards/margins": 9.512482104363379, + "rewards/rejected": -9.079212733677455, + "step": 2001 + }, + { + "epoch": 0.7390521895620876, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 1.6380510030272089e-06, + "logits/chosen": 2060291120.7619047, + "logits/rejected": 2122214493.090909, + "logps/chosen": -286.66420200892856, + "logps/rejected": -486.3800159801136, + "loss": 0.1548, + "rewards/chosen": 1.9916581653413319, + "rewards/margins": 10.759363909304401, + "rewards/rejected": -8.767705743963068, + "step": 2002 + }, + { + "epoch": 0.7394213464999307, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 1.63369702311256e-06, + "logits/chosen": 1191412326.4, + "logits/rejected": 1645715269.8181818, + "logps/chosen": -150.720654296875, + "logps/rejected": -644.9925426136364, + "loss": 0.0628, + "rewards/chosen": 2.0184791564941404, + "rewards/margins": 11.345816941694778, + "rewards/rejected": -9.327337785200639, + "step": 2003 + }, + { + "epoch": 0.739790503437774, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.6293477071679147e-06, + "logits/chosen": 1411427805.8666666, + "logits/rejected": 2137001984.0, + "logps/chosen": -299.84739583333334, + "logps/rejected": -505.0784122242647, + "loss": 0.126, + "rewards/chosen": 1.2816637674967448, + "rewards/margins": 8.658625868255017, + "rewards/rejected": -7.376962100758272, + "step": 2004 + }, + { + "epoch": 0.7401596603756172, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.6250030612191974e-06, + "logits/chosen": 1804686637.1764705, + "logits/rejected": 2237966199.4666667, + "logps/chosen": -228.74974149816177, + "logps/rejected": -399.57923177083336, + "loss": 0.1607, + "rewards/chosen": 1.270489075604607, + "rewards/margins": 9.36920835457596, + "rewards/rejected": -8.098719278971354, + "step": 2005 + }, + { + "epoch": 0.7405288173134604, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.6206630912858618e-06, + "logits/chosen": 1340897761.8823528, + "logits/rejected": 1486663680.0, + "logps/chosen": -307.7419864430147, + "logps/rejected": -460.2329427083333, + "loss": 0.1605, + "rewards/chosen": 1.2877806495217716, + "rewards/margins": 9.026524240830366, + "rewards/rejected": -7.738743591308594, + "step": 2006 + }, + { + "epoch": 0.7408979742513035, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 1.6163278033808777e-06, + "logits/chosen": 2088985804.8, + "logits/rejected": 1280757162.6666667, + "logps/chosen": -289.086572265625, + "logps/rejected": -389.0430908203125, + "loss": 0.2279, + "rewards/chosen": 0.8304847717285156, + "rewards/margins": 6.396250152587891, + "rewards/rejected": -5.565765380859375, + "step": 2007 + }, + { + "epoch": 0.7412671311891468, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 1.6119972035107328e-06, + "logits/chosen": 1404605203.6923077, + "logits/rejected": 2662176444.631579, + "logps/chosen": -283.82703575721155, + "logps/rejected": -508.6012027138158, + "loss": 0.1125, + "rewards/chosen": 1.6501397352952223, + "rewards/margins": 10.106007301855666, + "rewards/rejected": -8.455867566560444, + "step": 2008 + }, + { + "epoch": 0.74163628812699, + "grad_norm": 15.5625, + "kl": 0.42487335205078125, + "learning_rate": 1.6076712976754199e-06, + "logits/chosen": 1712959683.047619, + "logits/rejected": 1902837387.6363637, + "logps/chosen": -349.7591843377976, + "logps/rejected": -503.59818892045456, + "loss": 0.2502, + "rewards/chosen": 0.931639898390997, + "rewards/margins": 7.613548245780912, + "rewards/rejected": -6.681908347389915, + "step": 2009 + }, + { + "epoch": 0.7420054450648332, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 1.6033500918684232e-06, + "logits/chosen": 1770638713.2631578, + "logits/rejected": 1559546328.6153846, + "logps/chosen": -229.06059827302633, + "logps/rejected": -443.6920823317308, + "loss": 0.1536, + "rewards/chosen": 1.702586525364926, + "rewards/margins": 9.672177519392871, + "rewards/rejected": -7.969590994027945, + "step": 2010 + }, + { + "epoch": 0.7423746020026764, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.5990335920767202e-06, + "logits/chosen": 1589511314.2857144, + "logits/rejected": 1378913507.5555556, + "logps/chosen": -359.80447823660717, + "logps/rejected": -423.61347113715277, + "loss": 0.1127, + "rewards/chosen": 1.510901996067592, + "rewards/margins": 8.955688067844935, + "rewards/rejected": -7.444786071777344, + "step": 2011 + }, + { + "epoch": 0.7427437589405196, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 1.5947218042807682e-06, + "logits/chosen": 2169736325.5652175, + "logits/rejected": 2284491434.6666665, + "logps/chosen": -283.2922469429348, + "logps/rejected": -520.5833333333334, + "loss": 0.2245, + "rewards/chosen": 1.21649692369544, + "rewards/margins": 7.530188804663322, + "rewards/rejected": -6.313691880967882, + "step": 2012 + }, + { + "epoch": 0.7431129158783628, + "grad_norm": 12.625, + "kl": 0.05138587951660156, + "learning_rate": 1.5904147344544928e-06, + "logits/chosen": 2191944434.5263157, + "logits/rejected": 1736126148.9230769, + "logps/chosen": -274.99051706414474, + "logps/rejected": -425.6302959735577, + "loss": 0.1694, + "rewards/chosen": 1.4637787467554997, + "rewards/margins": 7.3842452431497305, + "rewards/rejected": -5.920466496394231, + "step": 2013 + }, + { + "epoch": 0.743482072816206, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 1.5861123885652829e-06, + "logits/chosen": 1949744936.4210527, + "logits/rejected": 2425127384.6153846, + "logps/chosen": -237.39627878289474, + "logps/rejected": -478.59652944711536, + "loss": 0.1769, + "rewards/chosen": 1.5944910551372327, + "rewards/margins": 9.194780033127016, + "rewards/rejected": -7.600288977989783, + "step": 2014 + }, + { + "epoch": 0.7438512297540492, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 1.5818147725739858e-06, + "logits/chosen": 1770025080.4705882, + "logits/rejected": 1967467724.8, + "logps/chosen": -254.45068359375, + "logps/rejected": -579.8020833333334, + "loss": 0.1512, + "rewards/chosen": 1.3310606339398552, + "rewards/margins": 9.311398158353917, + "rewards/rejected": -7.980337524414063, + "step": 2015 + }, + { + "epoch": 0.7442203866918924, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 1.577521892434895e-06, + "logits/chosen": 1740072528.8421052, + "logits/rejected": 1595317956.9230769, + "logps/chosen": -323.8920641447368, + "logps/rejected": -589.3928786057693, + "loss": 0.1862, + "rewards/chosen": 1.1277326282701994, + "rewards/margins": 8.983541766641594, + "rewards/rejected": -7.855809138371394, + "step": 2016 + }, + { + "epoch": 0.7445895436297356, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 1.573233754095738e-06, + "logits/chosen": 1792351027.2, + "logits/rejected": 1697371557.6470587, + "logps/chosen": -226.50751953125, + "logps/rejected": -394.9206112132353, + "loss": 0.1657, + "rewards/chosen": 0.9657628377278645, + "rewards/margins": 8.215194672229243, + "rewards/rejected": -7.249431834501379, + "step": 2017 + }, + { + "epoch": 0.7449587005675788, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 1.5689503634976788e-06, + "logits/chosen": 1904832950.857143, + "logits/rejected": 1873826474.6666667, + "logps/chosen": -239.10124860491072, + "logps/rejected": -469.3585611979167, + "loss": 0.1241, + "rewards/chosen": 1.7657978875296456, + "rewards/margins": 9.499736316620357, + "rewards/rejected": -7.733938429090712, + "step": 2018 + }, + { + "epoch": 0.745327857505422, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.5646717265753013e-06, + "logits/chosen": 1986450701.4736843, + "logits/rejected": 2396135739.076923, + "logps/chosen": -241.61657072368422, + "logps/rejected": -397.73031850961536, + "loss": 0.1678, + "rewards/chosen": 1.6886261387875205, + "rewards/margins": 10.509963325160719, + "rewards/rejected": -8.821337186373198, + "step": 2019 + }, + { + "epoch": 0.7456970144432652, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.5603978492566002e-06, + "logits/chosen": 1873859523.764706, + "logits/rejected": 2265760699.733333, + "logps/chosen": -331.1399931066176, + "logps/rejected": -456.55817057291665, + "loss": 0.1201, + "rewards/chosen": 1.8466377258300781, + "rewards/margins": 9.006876881917318, + "rewards/rejected": -7.16023915608724, + "step": 2020 + }, + { + "epoch": 0.7460661713811084, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 1.5561287374629786e-06, + "logits/chosen": 1831538102.857143, + "logits/rejected": 1750228536.8888888, + "logps/chosen": -305.8779296875, + "logps/rejected": -470.52777777777777, + "loss": 0.1671, + "rewards/chosen": 0.705066340310233, + "rewards/margins": 8.358361221495128, + "rewards/rejected": -7.6532948811848955, + "step": 2021 + }, + { + "epoch": 0.7464353283189515, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.551864397109239e-06, + "logits/chosen": 1880796702.1176472, + "logits/rejected": 1113266176.0, + "logps/chosen": -277.3734777113971, + "logps/rejected": -412.78704427083335, + "loss": 0.1737, + "rewards/chosen": 1.074118782492245, + "rewards/margins": 9.428783902934954, + "rewards/rejected": -8.354665120442709, + "step": 2022 + }, + { + "epoch": 0.7468044852567948, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 1.5476048341035678e-06, + "logits/chosen": 1784248922.3529413, + "logits/rejected": 2011540957.8666666, + "logps/chosen": -286.40929457720586, + "logps/rejected": -348.04306640625, + "loss": 0.1315, + "rewards/chosen": 1.775191587560317, + "rewards/margins": 8.26601749494964, + "rewards/rejected": -6.490825907389323, + "step": 2023 + }, + { + "epoch": 0.747173642194638, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 1.5433500543475361e-06, + "logits/chosen": 2063790552.6153846, + "logits/rejected": 2513774915.368421, + "logps/chosen": -236.9729942908654, + "logps/rejected": -529.3488384046053, + "loss": 0.0763, + "rewards/chosen": 2.3255245502178488, + "rewards/margins": 9.9062740619366, + "rewards/rejected": -7.58074951171875, + "step": 2024 + }, + { + "epoch": 0.7475427991324812, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.5391000637360898e-06, + "logits/chosen": 1637925156.5714285, + "logits/rejected": 1864353223.1111112, + "logps/chosen": -353.87259347098217, + "logps/rejected": -482.75672743055554, + "loss": 0.1078, + "rewards/chosen": 1.789870125906808, + "rewards/margins": 9.744940621512276, + "rewards/rejected": -7.955070495605469, + "step": 2025 + }, + { + "epoch": 0.7479119560703243, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 1.5348548681575332e-06, + "logits/chosen": 1775353472.0, + "logits/rejected": 1954466048.0, + "logps/chosen": -341.3355407714844, + "logps/rejected": -482.01177978515625, + "loss": 0.1673, + "rewards/chosen": 0.940376877784729, + "rewards/margins": 9.433802247047424, + "rewards/rejected": -8.493425369262695, + "step": 2026 + }, + { + "epoch": 0.7482811130081676, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.530614473493533e-06, + "logits/chosen": 2241257231.0588236, + "logits/rejected": 2097583035.7333333, + "logps/chosen": -305.94485294117646, + "logps/rejected": -464.1955078125, + "loss": 0.1286, + "rewards/chosen": 1.9090230605181526, + "rewards/margins": 10.341946441052006, + "rewards/rejected": -8.432923380533854, + "step": 2027 + }, + { + "epoch": 0.7486502699460108, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.5263788856191026e-06, + "logits/chosen": 2165522659.5555553, + "logits/rejected": 1601105773.7142856, + "logps/chosen": -249.18337673611111, + "logps/rejected": -476.3527134486607, + "loss": 0.1567, + "rewards/chosen": 1.611591551038954, + "rewards/margins": 8.853612839229523, + "rewards/rejected": -7.2420212881905695, + "step": 2028 + }, + { + "epoch": 0.749019426883854, + "grad_norm": 14.4375, + "kl": 0.6446971893310547, + "learning_rate": 1.522148110402593e-06, + "logits/chosen": 2082465792.0, + "logits/rejected": 3665182479.0588236, + "logps/chosen": -340.22942708333335, + "logps/rejected": -403.16661879595586, + "loss": 0.1903, + "rewards/chosen": 0.7115572611490886, + "rewards/margins": 7.92915028590782, + "rewards/rejected": -7.217593024758732, + "step": 2029 + }, + { + "epoch": 0.7493885838216972, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 1.517922153705692e-06, + "logits/chosen": 2066237440.0, + "logits/rejected": 1974291894.857143, + "logps/chosen": -306.0081232244318, + "logps/rejected": -428.7064267113095, + "loss": 0.0932, + "rewards/chosen": 2.1237402829256924, + "rewards/margins": 9.525141621048832, + "rewards/rejected": -7.40140133812314, + "step": 2030 + }, + { + "epoch": 0.7497577407595404, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 1.5137010213834086e-06, + "logits/chosen": 1198102118.4, + "logits/rejected": 1693244355.764706, + "logps/chosen": -251.52625325520833, + "logps/rejected": -447.50143612132354, + "loss": 0.1438, + "rewards/chosen": 1.4787413279215496, + "rewards/margins": 9.701842775531842, + "rewards/rejected": -8.223101447610293, + "step": 2031 + }, + { + "epoch": 0.7501268976973836, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 1.5094847192840644e-06, + "logits/chosen": 2283451090.8235292, + "logits/rejected": 2115396266.6666667, + "logps/chosen": -293.87643612132354, + "logps/rejected": -528.1277994791667, + "loss": 0.1661, + "rewards/chosen": 1.1253389470717485, + "rewards/margins": 10.392732949350394, + "rewards/rejected": -9.267394002278646, + "step": 2032 + }, + { + "epoch": 0.7504960546352268, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.5052732532492959e-06, + "logits/chosen": 1601040657.0666666, + "logits/rejected": 1389713648.9411764, + "logps/chosen": -252.74925130208334, + "logps/rejected": -503.89832261029414, + "loss": 0.1383, + "rewards/chosen": 1.1622403462727864, + "rewards/margins": 9.236667333864698, + "rewards/rejected": -8.074426987591911, + "step": 2033 + }, + { + "epoch": 0.75086521157307, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 1.5010666291140363e-06, + "logits/chosen": 1674565955.368421, + "logits/rejected": 1944174434.4615386, + "logps/chosen": -298.4876130756579, + "logps/rejected": -501.1081730769231, + "loss": 0.1675, + "rewards/chosen": 1.4137258027729236, + "rewards/margins": 7.747316970516312, + "rewards/rejected": -6.333591167743389, + "step": 2034 + }, + { + "epoch": 0.75086521157307, + "eval_kl": 0.0, + "eval_logits/chosen": 3491091431.5023923, + "eval_logits/rejected": 3514796679.203463, + "eval_logps/chosen": -291.61257102272725, + "eval_logps/rejected": -480.46617965367966, + "eval_loss": 0.12946222722530365, + "eval_rewards/chosen": 1.5402224600029903, + "eval_rewards/margins": 9.813795769831183, + "eval_rewards/rejected": -8.273573309828192, + "eval_runtime": 110.0144, + "eval_samples_per_second": 7.963, + "eval_steps_per_second": 0.5, + "step": 2034 + }, + { + "epoch": 0.7512343685109132, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.4968648527065066e-06, + "logits/chosen": 2100772382.1176472, + "logits/rejected": 1255694336.0, + "logps/chosen": -291.55592256433823, + "logps/rejected": -348.9048177083333, + "loss": 0.15, + "rewards/chosen": 1.5768876917221968, + "rewards/margins": 7.9804368860581345, + "rewards/rejected": -6.4035491943359375, + "step": 2035 + }, + { + "epoch": 0.7516035254487564, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.4926679298482172e-06, + "logits/chosen": 1998166016.0, + "logits/rejected": 1682031885.4736843, + "logps/chosen": -328.45620492788464, + "logps/rejected": -569.9995888157895, + "loss": 0.1187, + "rewards/chosen": 1.2175022271963267, + "rewards/margins": 11.157281408425767, + "rewards/rejected": -9.93977918122944, + "step": 2036 + }, + { + "epoch": 0.7519726823865996, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 1.4884758663539517e-06, + "logits/chosen": 1458324540.235294, + "logits/rejected": 1294229230.9333334, + "logps/chosen": -268.3307674632353, + "logps/rejected": -432.7127278645833, + "loss": 0.1841, + "rewards/chosen": 0.9029093910666073, + "rewards/margins": 8.799043322544472, + "rewards/rejected": -7.8961339314778645, + "step": 2037 + }, + { + "epoch": 0.7523418393244428, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 1.4842886680317592e-06, + "logits/chosen": 1548305106.8235295, + "logits/rejected": 1867160098.1333334, + "logps/chosen": -217.9685776654412, + "logps/rejected": -394.34137369791665, + "loss": 0.1536, + "rewards/chosen": 1.3390962937298942, + "rewards/margins": 7.6884177862429155, + "rewards/rejected": -6.349321492513021, + "step": 2038 + }, + { + "epoch": 0.752710996262286, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 1.4801063406829497e-06, + "logits/chosen": 1770052788.7058823, + "logits/rejected": 1684754295.4666667, + "logps/chosen": -264.49264705882354, + "logps/rejected": -519.18349609375, + "loss": 0.1418, + "rewards/chosen": 1.7266181497012867, + "rewards/margins": 10.8214942483341, + "rewards/rejected": -9.094876098632813, + "step": 2039 + }, + { + "epoch": 0.7530801532001292, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 1.4759288901020875e-06, + "logits/chosen": 2506167040.0, + "logits/rejected": 1770246784.0, + "logps/chosen": -257.1165466308594, + "logps/rejected": -480.35791015625, + "loss": 0.1473, + "rewards/chosen": 1.169373631477356, + "rewards/margins": 10.302106022834778, + "rewards/rejected": -9.132732391357422, + "step": 2040 + }, + { + "epoch": 0.7534493101379725, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 1.4717563220769733e-06, + "logits/chosen": 1428244736.0, + "logits/rejected": 1451381504.0, + "logps/chosen": -237.78933715820312, + "logps/rejected": -572.7919311523438, + "loss": 0.1527, + "rewards/chosen": 1.1213841438293457, + "rewards/margins": 11.713635921478271, + "rewards/rejected": -10.592251777648926, + "step": 2041 + }, + { + "epoch": 0.7538184670758156, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 1.4675886423886488e-06, + "logits/chosen": 1468843315.2, + "logits/rejected": 1445589643.6363637, + "logps/chosen": -335.6010009765625, + "logps/rejected": -391.7439630681818, + "loss": 0.0791, + "rewards/chosen": 1.41378173828125, + "rewards/margins": 7.7074121648615055, + "rewards/rejected": -6.293630426580256, + "step": 2042 + }, + { + "epoch": 0.7541876240136588, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 1.4634258568113835e-06, + "logits/chosen": 2037932168.5333333, + "logits/rejected": 2468108649.4117646, + "logps/chosen": -342.67760416666664, + "logps/rejected": -503.5645105698529, + "loss": 0.1572, + "rewards/chosen": 1.4779627482096354, + "rewards/margins": 10.26282818364162, + "rewards/rejected": -8.784865435431986, + "step": 2043 + }, + { + "epoch": 0.754556780951502, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 1.45926797111266e-06, + "logits/chosen": 1640859501.7142856, + "logits/rejected": 2216230684.4444447, + "logps/chosen": -261.72769601004467, + "logps/rejected": -505.19151475694446, + "loss": 0.1521, + "rewards/chosen": 0.9498520578656878, + "rewards/margins": 11.127851766253274, + "rewards/rejected": -10.177999708387587, + "step": 2044 + }, + { + "epoch": 0.7549259378893453, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 1.4551149910531781e-06, + "logits/chosen": 1538776405.3333333, + "logits/rejected": 2650349568.0, + "logps/chosen": -224.544384765625, + "logps/rejected": -570.9022288602941, + "loss": 0.1191, + "rewards/chosen": 1.6731047312418619, + "rewards/margins": 9.69268804063984, + "rewards/rejected": -8.019583309397978, + "step": 2045 + }, + { + "epoch": 0.7552950948271884, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.45096692238684e-06, + "logits/chosen": 1687230190.9333334, + "logits/rejected": 1461072112.9411764, + "logps/chosen": -303.60817057291666, + "logps/rejected": -365.3150850183824, + "loss": 0.1942, + "rewards/chosen": 0.8164337158203125, + "rewards/margins": 7.65706823012408, + "rewards/rejected": -6.840634514303768, + "step": 2046 + }, + { + "epoch": 0.7556642517650316, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 1.4468237708607397e-06, + "logits/chosen": 2416697825.882353, + "logits/rejected": 2606436078.9333334, + "logps/chosen": -287.55778952205884, + "logps/rejected": -446.8732096354167, + "loss": 0.1797, + "rewards/chosen": 0.960392671472886, + "rewards/margins": 8.126960664636949, + "rewards/rejected": -7.166567993164063, + "step": 2047 + }, + { + "epoch": 0.7560334087028748, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 1.4426855422151636e-06, + "logits/chosen": 1780912742.4, + "logits/rejected": 1892006570.6666667, + "logps/chosen": -258.7445068359375, + "logps/rejected": -526.1732991536459, + "loss": 0.1824, + "rewards/chosen": 1.2071907997131348, + "rewards/margins": 10.445830631256104, + "rewards/rejected": -9.238639831542969, + "step": 2048 + }, + { + "epoch": 0.7564025656407181, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 1.4385522421835724e-06, + "logits/chosen": 1906040263.1111112, + "logits/rejected": 1742791094.857143, + "logps/chosen": -328.1267903645833, + "logps/rejected": -427.4940708705357, + "loss": 0.1384, + "rewards/chosen": 1.5462820265028212, + "rewards/margins": 9.13619117131309, + "rewards/rejected": -7.589909144810268, + "step": 2049 + }, + { + "epoch": 0.7567717225785612, + "grad_norm": 11.25, + "kl": 0.1808032989501953, + "learning_rate": 1.4344238764926032e-06, + "logits/chosen": 1492639914.6666667, + "logits/rejected": 1535617638.4, + "logps/chosen": -346.338623046875, + "logps/rejected": -520.3287109375, + "loss": 0.1465, + "rewards/chosen": 1.1984612147013347, + "rewards/margins": 9.242656008402506, + "rewards/rejected": -8.044194793701172, + "step": 2050 + }, + { + "epoch": 0.7571408795164044, + "grad_norm": 12.5625, + "kl": 0.10809516906738281, + "learning_rate": 1.4303004508620515e-06, + "logits/chosen": 2027193524.7058823, + "logits/rejected": 1960737860.2666667, + "logps/chosen": -262.01217830882354, + "logps/rejected": -379.46497395833336, + "loss": 0.1713, + "rewards/chosen": 1.272620257209329, + "rewards/margins": 7.531946429084329, + "rewards/rejected": -6.259326171875, + "step": 2051 + }, + { + "epoch": 0.7575100364542476, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 1.4261819710048725e-06, + "logits/chosen": 2007357667.5555556, + "logits/rejected": 2046024411.4285715, + "logps/chosen": -290.27734375, + "logps/rejected": -421.71651785714283, + "loss": 0.1714, + "rewards/chosen": 1.2697999742296007, + "rewards/margins": 8.894061012873573, + "rewards/rejected": -7.624261038643973, + "step": 2052 + }, + { + "epoch": 0.7578791933920909, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 1.4220684426271692e-06, + "logits/chosen": 1473697792.0, + "logits/rejected": 1691422378.6666667, + "logps/chosen": -282.3156982421875, + "logps/rejected": -480.6361490885417, + "loss": 0.1507, + "rewards/chosen": 1.824575424194336, + "rewards/margins": 9.094924290974934, + "rewards/rejected": -7.270348866780599, + "step": 2053 + }, + { + "epoch": 0.758248350329934, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.41795987142818e-06, + "logits/chosen": 1496330649.6, + "logits/rejected": 1555092138.6666667, + "logps/chosen": -271.495751953125, + "logps/rejected": -467.8869222005208, + "loss": 0.1458, + "rewards/chosen": 1.6860584259033202, + "rewards/margins": 11.180962626139323, + "rewards/rejected": -9.494904200236002, + "step": 2054 + }, + { + "epoch": 0.7586175072677772, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 1.4138562631002794e-06, + "logits/chosen": 1401350729.142857, + "logits/rejected": 1531460266.6666667, + "logps/chosen": -325.7025669642857, + "logps/rejected": -593.95751953125, + "loss": 0.1293, + "rewards/chosen": 1.4312733241489954, + "rewards/margins": 10.00260985843719, + "rewards/rejected": -8.571336534288195, + "step": 2055 + }, + { + "epoch": 0.7589866642056204, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.4097576233289662e-06, + "logits/chosen": 2113031296.0, + "logits/rejected": 1966601728.0, + "logps/chosen": -255.28561401367188, + "logps/rejected": -472.4173583984375, + "loss": 0.1624, + "rewards/chosen": 1.329160213470459, + "rewards/margins": 11.008141994476318, + "rewards/rejected": -9.67898178100586, + "step": 2056 + }, + { + "epoch": 0.7593558211434636, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 1.405663957792851e-06, + "logits/chosen": 1640980821.3333333, + "logits/rejected": 1596189696.0, + "logps/chosen": -298.97222222222223, + "logps/rejected": -633.6383231026786, + "loss": 0.1555, + "rewards/chosen": 1.2846007876925998, + "rewards/margins": 10.625713197011796, + "rewards/rejected": -9.341112409319196, + "step": 2057 + }, + { + "epoch": 0.7597249780813068, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.4015752721636573e-06, + "logits/chosen": 1596920934.4, + "logits/rejected": 3072759808.0, + "logps/chosen": -358.5586669921875, + "logps/rejected": -527.5802408854166, + "loss": 0.1599, + "rewards/chosen": 1.7826927185058594, + "rewards/margins": 10.626574961344401, + "rewards/rejected": -8.843882242838541, + "step": 2058 + }, + { + "epoch": 0.76009413501915, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 1.397491572106207e-06, + "logits/chosen": 3162062848.0, + "logits/rejected": 2265030070.857143, + "logps/chosen": -313.0281032986111, + "logps/rejected": -690.3570731026786, + "loss": 0.1198, + "rewards/chosen": 1.8450406392415364, + "rewards/margins": 13.688767751057943, + "rewards/rejected": -11.843727111816406, + "step": 2059 + }, + { + "epoch": 0.7604632919569932, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 1.3934128632784132e-06, + "logits/chosen": 1589545506.1333334, + "logits/rejected": 1876692028.235294, + "logps/chosen": -274.78489583333334, + "logps/rejected": -557.6839384191177, + "loss": 0.1589, + "rewards/chosen": 1.1055254618326822, + "rewards/margins": 9.000105180927351, + "rewards/rejected": -7.894579719094669, + "step": 2060 + }, + { + "epoch": 0.7608324488948364, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 1.3893391513312759e-06, + "logits/chosen": 1586611541.3333333, + "logits/rejected": 2146018304.0, + "logps/chosen": -229.23879665798611, + "logps/rejected": -552.7053571428571, + "loss": 0.2123, + "rewards/chosen": 0.7878450287712945, + "rewards/margins": 8.72503385846577, + "rewards/rejected": -7.937188829694476, + "step": 2061 + }, + { + "epoch": 0.7612016058326796, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 1.385270441908873e-06, + "logits/chosen": 2005175040.0, + "logits/rejected": 1790334720.0, + "logps/chosen": -391.4029846191406, + "logps/rejected": -509.5302429199219, + "loss": 0.1585, + "rewards/chosen": 1.1778837442398071, + "rewards/margins": 9.970008492469788, + "rewards/rejected": -8.79212474822998, + "step": 2062 + }, + { + "epoch": 0.7615707627705228, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 1.381206740648347e-06, + "logits/chosen": 1897054412.8, + "logits/rejected": 1368102144.0, + "logps/chosen": -271.8548583984375, + "logps/rejected": -414.8579915364583, + "loss": 0.1779, + "rewards/chosen": 1.3230349540710449, + "rewards/margins": 8.796514542897542, + "rewards/rejected": -7.473479588826497, + "step": 2063 + }, + { + "epoch": 0.761939919708366, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 1.3771480531799054e-06, + "logits/chosen": 1872396151.4666667, + "logits/rejected": 1711762130.8235295, + "logps/chosen": -360.7239583333333, + "logps/rejected": -469.14358340992646, + "loss": 0.1803, + "rewards/chosen": 0.8167257308959961, + "rewards/margins": 7.89181064156925, + "rewards/rejected": -7.075084910673254, + "step": 2064 + }, + { + "epoch": 0.7623090766462092, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.3730943851268109e-06, + "logits/chosen": 1480819200.0, + "logits/rejected": 1683455627.6363637, + "logps/chosen": -199.68333740234374, + "logps/rejected": -429.24405184659093, + "loss": 0.0783, + "rewards/chosen": 2.0715450286865233, + "rewards/margins": 11.533458397605203, + "rewards/rejected": -9.46191336891868, + "step": 2065 + }, + { + "epoch": 0.7626782335840524, + "grad_norm": 12.75, + "kl": 0.0015287399291992188, + "learning_rate": 1.3690457421053638e-06, + "logits/chosen": 1568928426.6666667, + "logits/rejected": 1999498902.5882354, + "logps/chosen": -317.79296875, + "logps/rejected": -404.32306985294116, + "loss": 0.1472, + "rewards/chosen": 1.2369155883789062, + "rewards/margins": 8.202629986931296, + "rewards/rejected": -6.9657143985523895, + "step": 2066 + }, + { + "epoch": 0.7630473905218956, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.36500212972491e-06, + "logits/chosen": 2008749312.0, + "logits/rejected": 2969949184.0, + "logps/chosen": -264.8365478515625, + "logps/rejected": -497.43597412109375, + "loss": 0.1665, + "rewards/chosen": 1.323774814605713, + "rewards/margins": 9.320192813873291, + "rewards/rejected": -7.996417999267578, + "step": 2067 + }, + { + "epoch": 0.7634165474597389, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.3609635535878246e-06, + "logits/chosen": 1551519528.4210527, + "logits/rejected": 1609918779.0769231, + "logps/chosen": -247.6013055098684, + "logps/rejected": -442.3217022235577, + "loss": 0.1541, + "rewards/chosen": 1.5751354819849919, + "rewards/margins": 9.576055117464259, + "rewards/rejected": -8.000919635479267, + "step": 2068 + }, + { + "epoch": 0.763785704397582, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.3569300192895006e-06, + "logits/chosen": 1492523165.5384614, + "logits/rejected": 1513475880.4210527, + "logps/chosen": -315.8913386418269, + "logps/rejected": -467.3920127467105, + "loss": 0.1288, + "rewards/chosen": 1.0183073190542369, + "rewards/margins": 9.488321389263941, + "rewards/rejected": -8.470014070209704, + "step": 2069 + }, + { + "epoch": 0.7641548613354252, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 1.3529015324183509e-06, + "logits/chosen": 1752607683.764706, + "logits/rejected": 2396700672.0, + "logps/chosen": -377.1692325367647, + "logps/rejected": -497.57861328125, + "loss": 0.1939, + "rewards/chosen": 0.7621687720803654, + "rewards/margins": 8.058951710719688, + "rewards/rejected": -7.296782938639323, + "step": 2070 + }, + { + "epoch": 0.7645240182732684, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 1.348878098555793e-06, + "logits/chosen": 1547665046.5882354, + "logits/rejected": 1414015658.6666667, + "logps/chosen": -292.2236328125, + "logps/rejected": -449.35032552083334, + "loss": 0.1449, + "rewards/chosen": 1.3376334695255054, + "rewards/margins": 9.316419175091912, + "rewards/rejected": -7.978785705566406, + "step": 2071 + }, + { + "epoch": 0.7648931752111117, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 1.344859723276241e-06, + "logits/chosen": 1783757172.3636363, + "logits/rejected": 1629238988.8, + "logps/chosen": -249.5548428622159, + "logps/rejected": -440.015380859375, + "loss": 0.2039, + "rewards/chosen": 1.303561037236994, + "rewards/margins": 8.530482118779963, + "rewards/rejected": -7.226921081542969, + "step": 2072 + }, + { + "epoch": 0.7652623321489548, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 1.3408464121471048e-06, + "logits/chosen": 1644504244.7058823, + "logits/rejected": 2378587613.866667, + "logps/chosen": -255.50927734375, + "logps/rejected": -465.95270182291665, + "loss": 0.1649, + "rewards/chosen": 1.2305217069738053, + "rewards/margins": 11.362267108992034, + "rewards/rejected": -10.131745402018229, + "step": 2073 + }, + { + "epoch": 0.765631489086798, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 1.3368381707287764e-06, + "logits/chosen": 1493595264.0, + "logits/rejected": 1680534272.0, + "logps/chosen": -307.7608337402344, + "logps/rejected": -369.1847229003906, + "loss": 0.1839, + "rewards/chosen": 0.7893334627151489, + "rewards/margins": 7.378596901893616, + "rewards/rejected": -6.589263439178467, + "step": 2074 + }, + { + "epoch": 0.7660006460246412, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 1.3328350045746213e-06, + "logits/chosen": 1971079314.2857144, + "logits/rejected": 1937878812.4444444, + "logps/chosen": -284.118408203125, + "logps/rejected": -457.79871961805554, + "loss": 0.1451, + "rewards/chosen": 1.730255126953125, + "rewards/margins": 8.756823221842449, + "rewards/rejected": -7.026568094889323, + "step": 2075 + }, + { + "epoch": 0.7663698029624845, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 1.3288369192309764e-06, + "logits/chosen": 2036104078.2222223, + "logits/rejected": 2459109083.428571, + "logps/chosen": -243.297119140625, + "logps/rejected": -463.45556640625, + "loss": 0.1669, + "rewards/chosen": 1.3403686947292752, + "rewards/margins": 8.601618781922356, + "rewards/rejected": -7.261250087193081, + "step": 2076 + }, + { + "epoch": 0.7667389599003276, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 1.3248439202371399e-06, + "logits/chosen": 1673170944.0, + "logits/rejected": 2513821696.0, + "logps/chosen": -304.8301478794643, + "logps/rejected": -474.85590277777777, + "loss": 0.1424, + "rewards/chosen": 1.0497095925467355, + "rewards/margins": 8.946515249827552, + "rewards/rejected": -7.896805657280816, + "step": 2077 + }, + { + "epoch": 0.7671081168381708, + "grad_norm": 12.75, + "kl": 0.49268054962158203, + "learning_rate": 1.3208560131253578e-06, + "logits/chosen": 2571503977.4117646, + "logits/rejected": 1727598045.8666666, + "logps/chosen": -274.81603285845586, + "logps/rejected": -384.95729166666666, + "loss": 0.1716, + "rewards/chosen": 1.342420690199908, + "rewards/margins": 9.15631280038871, + "rewards/rejected": -7.813892110188802, + "step": 2078 + }, + { + "epoch": 0.767477273776014, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.3168732034208264e-06, + "logits/chosen": 1499237444.2666667, + "logits/rejected": 1592785498.3529413, + "logps/chosen": -223.54334309895833, + "logps/rejected": -550.0683019301471, + "loss": 0.1209, + "rewards/chosen": 1.7906943003336588, + "rewards/margins": 10.69718126035204, + "rewards/rejected": -8.906486960018382, + "step": 2079 + }, + { + "epoch": 0.7678464307138573, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.3128954966416801e-06, + "logits/chosen": 1664074112.0, + "logits/rejected": 1747345280.0, + "logps/chosen": -299.033203125, + "logps/rejected": -451.5663757324219, + "loss": 0.1694, + "rewards/chosen": 0.853320300579071, + "rewards/margins": 7.871069133281708, + "rewards/rejected": -7.017748832702637, + "step": 2080 + }, + { + "epoch": 0.7682155876517004, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 1.3089228982989771e-06, + "logits/chosen": 2248553764.571429, + "logits/rejected": 1865484970.6666667, + "logps/chosen": -405.90980747767856, + "logps/rejected": -360.74517144097223, + "loss": 0.1314, + "rewards/chosen": 1.4083318710327148, + "rewards/margins": 7.361144913567437, + "rewards/rejected": -5.952813042534722, + "step": 2081 + }, + { + "epoch": 0.7685847445895436, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 1.3049554138967052e-06, + "logits/chosen": 2476451384.888889, + "logits/rejected": 2028000987.4285715, + "logps/chosen": -288.8951009114583, + "logps/rejected": -542.3028738839286, + "loss": 0.1231, + "rewards/chosen": 1.947161144680447, + "rewards/margins": 11.208701148865714, + "rewards/rejected": -9.261540004185267, + "step": 2082 + }, + { + "epoch": 0.7689539015273869, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 1.3009930489317613e-06, + "logits/chosen": 1328423424.0, + "logits/rejected": 1596221696.0, + "logps/chosen": -244.1783447265625, + "logps/rejected": -495.1112976074219, + "loss": 0.1544, + "rewards/chosen": 1.1717948913574219, + "rewards/margins": 8.98978853225708, + "rewards/rejected": -7.817993640899658, + "step": 2083 + }, + { + "epoch": 0.7693230584652301, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 1.2970358088939534e-06, + "logits/chosen": 1486632960.0, + "logits/rejected": 1730396160.0, + "logps/chosen": -299.4842418323864, + "logps/rejected": -484.91629464285717, + "loss": 0.1062, + "rewards/chosen": 1.6065597534179688, + "rewards/margins": 11.049230303083148, + "rewards/rejected": -9.442670549665179, + "step": 2084 + }, + { + "epoch": 0.7696922154030732, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.2930836992659857e-06, + "logits/chosen": 1279730688.0, + "logits/rejected": 1605823180.8, + "logps/chosen": -250.98199462890625, + "logps/rejected": -464.480224609375, + "loss": 0.1069, + "rewards/chosen": 1.5622483889261882, + "rewards/margins": 9.997207800547281, + "rewards/rejected": -8.434959411621094, + "step": 2085 + }, + { + "epoch": 0.7700613723409164, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 1.2891367255234566e-06, + "logits/chosen": 1335847219.2, + "logits/rejected": 1339099818.6666667, + "logps/chosen": -303.011376953125, + "logps/rejected": -328.13055419921875, + "loss": 0.1736, + "rewards/chosen": 1.7935924530029297, + "rewards/margins": 7.458396275838216, + "rewards/rejected": -5.664803822835286, + "step": 2086 + }, + { + "epoch": 0.7704305292787597, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.2851948931348495e-06, + "logits/chosen": 1627452793.2631578, + "logits/rejected": 2052854232.6153846, + "logps/chosen": -305.1001747532895, + "logps/rejected": -372.1571514423077, + "loss": 0.131, + "rewards/chosen": 2.13705103020919, + "rewards/margins": 8.923292414862134, + "rewards/rejected": -6.786241384652945, + "step": 2087 + }, + { + "epoch": 0.7707996862166029, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.281258207561521e-06, + "logits/chosen": 1731540992.0, + "logits/rejected": 1913558196.7058823, + "logps/chosen": -252.41598307291667, + "logps/rejected": -502.98713235294116, + "loss": 0.1365, + "rewards/chosen": 1.398080062866211, + "rewards/margins": 8.820497198665842, + "rewards/rejected": -7.422417135799632, + "step": 2088 + }, + { + "epoch": 0.771168843154446, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 1.277326674257699e-06, + "logits/chosen": 2199679407.1578946, + "logits/rejected": 1497409851.0769231, + "logps/chosen": -309.81558388157896, + "logps/rejected": -540.4041090745193, + "loss": 0.1603, + "rewards/chosen": 1.2597451460988898, + "rewards/margins": 11.915035016140957, + "rewards/rejected": -10.655289870042067, + "step": 2089 + }, + { + "epoch": 0.7715380000922892, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 1.2734002986704757e-06, + "logits/chosen": 1619760469.3333333, + "logits/rejected": 1690481152.0, + "logps/chosen": -344.0538330078125, + "logps/rejected": -330.18740234375, + "loss": 0.1318, + "rewards/chosen": 1.4841979344685872, + "rewards/margins": 7.232413419087727, + "rewards/rejected": -5.7482154846191404, + "step": 2090 + }, + { + "epoch": 0.7719071570301325, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 1.269479086239791e-06, + "logits/chosen": 793173643.6363636, + "logits/rejected": 1610658279.6190476, + "logps/chosen": -231.5226384943182, + "logps/rejected": -502.23409598214283, + "loss": 0.0526, + "rewards/chosen": 2.5993451205166904, + "rewards/margins": 10.306494807784176, + "rewards/rejected": -7.707149687267485, + "step": 2091 + }, + { + "epoch": 0.7722763139679756, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 1.2655630423984367e-06, + "logits/chosen": 1286810555.7333333, + "logits/rejected": 2071296843.2941177, + "logps/chosen": -293.77600911458336, + "logps/rejected": -484.50373391544116, + "loss": 0.1338, + "rewards/chosen": 1.0917074839274088, + "rewards/margins": 9.716355634203143, + "rewards/rejected": -8.624648150275736, + "step": 2092 + }, + { + "epoch": 0.7726454709058188, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.2616521725720427e-06, + "logits/chosen": 1779963392.0, + "logits/rejected": 1474359936.0, + "logps/chosen": -216.84512329101562, + "logps/rejected": -435.30267333984375, + "loss": 0.149, + "rewards/chosen": 1.2801272869110107, + "rewards/margins": 8.830106973648071, + "rewards/rejected": -7.5499796867370605, + "step": 2093 + }, + { + "epoch": 0.773014627843662, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.2577464821790675e-06, + "logits/chosen": 2363963830.857143, + "logits/rejected": 1482057728.0, + "logps/chosen": -328.80967494419644, + "logps/rejected": -423.0745442708333, + "loss": 0.1226, + "rewards/chosen": 1.53678526197161, + "rewards/margins": 10.072484894404337, + "rewards/rejected": -8.535699632432726, + "step": 2094 + }, + { + "epoch": 0.7733837847815053, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 1.253845976630796e-06, + "logits/chosen": 1957459090.2857144, + "logits/rejected": 1779946951.1111112, + "logps/chosen": -311.94126674107144, + "logps/rejected": -410.6589626736111, + "loss": 0.1451, + "rewards/chosen": 1.3760772432599748, + "rewards/margins": 8.385124857463534, + "rewards/rejected": -7.009047614203559, + "step": 2095 + }, + { + "epoch": 0.7737529417193484, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 1.2499506613313307e-06, + "logits/chosen": 2149391360.0, + "logits/rejected": 2805846471.111111, + "logps/chosen": -244.37953404017858, + "logps/rejected": -481.7652994791667, + "loss": 0.1355, + "rewards/chosen": 1.4698817389351981, + "rewards/margins": 9.458992897518097, + "rewards/rejected": -7.989111158582899, + "step": 2096 + }, + { + "epoch": 0.7741220986571916, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.2460605416775789e-06, + "logits/chosen": 1702555520.0, + "logits/rejected": 1686884992.0, + "logps/chosen": -268.7099304199219, + "logps/rejected": -439.53253173828125, + "loss": 0.1603, + "rewards/chosen": 1.0775701999664307, + "rewards/margins": 8.424110174179077, + "rewards/rejected": -7.3465399742126465, + "step": 2097 + }, + { + "epoch": 0.7744912555950348, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 1.2421756230592535e-06, + "logits/chosen": 2036433377.8823528, + "logits/rejected": 1577710114.1333334, + "logps/chosen": -288.525390625, + "logps/rejected": -461.3525065104167, + "loss": 0.1435, + "rewards/chosen": 1.7544322294347428, + "rewards/margins": 8.672567808861826, + "rewards/rejected": -6.918135579427084, + "step": 2098 + }, + { + "epoch": 0.7748604125328781, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 1.2382959108588627e-06, + "logits/chosen": 1568905352.5333333, + "logits/rejected": 1454831736.4705882, + "logps/chosen": -323.3774739583333, + "logps/rejected": -526.5220588235294, + "loss": 0.1535, + "rewards/chosen": 1.3379816691080728, + "rewards/margins": 9.048392292097503, + "rewards/rejected": -7.71041062298943, + "step": 2099 + }, + { + "epoch": 0.7752295694707212, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 1.2344214104516921e-06, + "logits/chosen": 1760174720.0, + "logits/rejected": 1455342208.0, + "logps/chosen": -273.86212158203125, + "logps/rejected": -470.70526123046875, + "loss": 0.1488, + "rewards/chosen": 1.162919521331787, + "rewards/margins": 9.845232486724854, + "rewards/rejected": -8.682312965393066, + "step": 2100 + }, + { + "epoch": 0.7755987264085644, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 1.2305521272058163e-06, + "logits/chosen": 1755282909.8666666, + "logits/rejected": 1513678727.5294118, + "logps/chosen": -214.956298828125, + "logps/rejected": -419.9303193933824, + "loss": 0.1233, + "rewards/chosen": 1.5693906148274739, + "rewards/margins": 9.188648822260838, + "rewards/rejected": -7.619258207433364, + "step": 2101 + }, + { + "epoch": 0.7759678833464076, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 1.2266880664820797e-06, + "logits/chosen": 2703942178.133333, + "logits/rejected": 2495907237.647059, + "logps/chosen": -294.95579427083334, + "logps/rejected": -613.66796875, + "loss": 0.1227, + "rewards/chosen": 1.5480325063069662, + "rewards/margins": 12.783603077308804, + "rewards/rejected": -11.235570571001839, + "step": 2102 + }, + { + "epoch": 0.7763370402842509, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.2228292336340857e-06, + "logits/chosen": 1814881673.8461537, + "logits/rejected": 1860712986.9473684, + "logps/chosen": -338.79233022836536, + "logps/rejected": -574.5554070723684, + "loss": 0.1339, + "rewards/chosen": 1.7570706881009615, + "rewards/margins": 10.463628112545862, + "rewards/rejected": -8.7065574244449, + "step": 2103 + }, + { + "epoch": 0.776706197222094, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 1.2189756340082004e-06, + "logits/chosen": 1294910008.8888888, + "logits/rejected": 1503529837.7142856, + "logps/chosen": -203.624755859375, + "logps/rejected": -459.20912388392856, + "loss": 0.1955, + "rewards/chosen": 1.1149960623847113, + "rewards/margins": 9.422772922213117, + "rewards/rejected": -8.307776859828405, + "step": 2104 + }, + { + "epoch": 0.7770753541599372, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.2151272729435376e-06, + "logits/chosen": 1434282496.0, + "logits/rejected": 1420347904.0, + "logps/chosen": -222.31857299804688, + "logps/rejected": -398.11553955078125, + "loss": 0.177, + "rewards/chosen": 1.0690759420394897, + "rewards/margins": 8.211897730827332, + "rewards/rejected": -7.142821788787842, + "step": 2105 + }, + { + "epoch": 0.7774445110977805, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 1.2112841557719506e-06, + "logits/chosen": 1729533590.5882354, + "logits/rejected": 1500298444.8, + "logps/chosen": -207.35436293658088, + "logps/rejected": -522.6194010416667, + "loss": 0.1668, + "rewards/chosen": 1.234585818122415, + "rewards/margins": 9.450026186774759, + "rewards/rejected": -8.215440368652343, + "step": 2106 + }, + { + "epoch": 0.7778136680356237, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 1.207446287818031e-06, + "logits/chosen": 1418827946.6666667, + "logits/rejected": 1519582310.4, + "logps/chosen": -332.44016520182294, + "logps/rejected": -516.4619140625, + "loss": 0.0997, + "rewards/chosen": 1.5088043212890625, + "rewards/margins": 10.407460784912109, + "rewards/rejected": -8.898656463623047, + "step": 2107 + }, + { + "epoch": 0.7781828249734668, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 1.2036136743990968e-06, + "logits/chosen": 1924216960.0, + "logits/rejected": 2107853184.0, + "logps/chosen": -275.3818054199219, + "logps/rejected": -493.56317138671875, + "loss": 0.1448, + "rewards/chosen": 1.3006627559661865, + "rewards/margins": 10.011147260665894, + "rewards/rejected": -8.710484504699707, + "step": 2108 + }, + { + "epoch": 0.77855198191131, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.1997863208251825e-06, + "logits/chosen": 1430259832.4705882, + "logits/rejected": 1578486169.6, + "logps/chosen": -207.3798828125, + "logps/rejected": -474.17981770833336, + "loss": 0.1809, + "rewards/chosen": 1.0920757966883041, + "rewards/margins": 8.39398619707893, + "rewards/rejected": -7.301910400390625, + "step": 2109 + }, + { + "epoch": 0.7789211388491533, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 1.19596423239904e-06, + "logits/chosen": 1889715584.0, + "logits/rejected": 1589796096.0, + "logps/chosen": -293.8381652832031, + "logps/rejected": -454.4057922363281, + "loss": 0.1482, + "rewards/chosen": 1.903711199760437, + "rewards/margins": 9.954961657524109, + "rewards/rejected": -8.051250457763672, + "step": 2110 + }, + { + "epoch": 0.7792902957869965, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.1921474144161249e-06, + "logits/chosen": 1412733771.2941177, + "logits/rejected": 1862752802.1333334, + "logps/chosen": -236.1786678538603, + "logps/rejected": -398.0810872395833, + "loss": 0.1921, + "rewards/chosen": 0.9859764996696921, + "rewards/margins": 7.683327573888442, + "rewards/rejected": -6.69735107421875, + "step": 2111 + }, + { + "epoch": 0.7796594527248396, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.1883358721645876e-06, + "logits/chosen": 1999403795.6923077, + "logits/rejected": 1782930378.1052632, + "logps/chosen": -183.85516826923077, + "logps/rejected": -515.3681126644736, + "loss": 0.0931, + "rewards/chosen": 1.9577616178072417, + "rewards/margins": 11.10010202693553, + "rewards/rejected": -9.14234040912829, + "step": 2112 + }, + { + "epoch": 0.7800286096626828, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 1.1845296109252724e-06, + "logits/chosen": 1378603654.7368422, + "logits/rejected": 2240638345.8461537, + "logps/chosen": -280.9996916118421, + "logps/rejected": -516.7759540264423, + "loss": 0.2098, + "rewards/chosen": 0.9379185124447471, + "rewards/margins": 9.39060452882095, + "rewards/rejected": -8.452686016376202, + "step": 2113 + }, + { + "epoch": 0.7803977666005261, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 1.180728635971708e-06, + "logits/chosen": 1947232737.8823528, + "logits/rejected": 1392377309.8666666, + "logps/chosen": -244.28478285845588, + "logps/rejected": -562.0152994791666, + "loss": 0.1033, + "rewards/chosen": 2.499034881591797, + "rewards/margins": 11.266665903727214, + "rewards/rejected": -8.767631022135417, + "step": 2114 + }, + { + "epoch": 0.7807669235383693, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.1769329525700934e-06, + "logits/chosen": 1636103899.4285715, + "logits/rejected": 1678361372.4444444, + "logps/chosen": -286.09702845982144, + "logps/rejected": -500.0782877604167, + "loss": 0.1206, + "rewards/chosen": 1.3569446291242326, + "rewards/margins": 9.93461271316286, + "rewards/rejected": -8.577668084038628, + "step": 2115 + }, + { + "epoch": 0.7811360804762124, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 1.1731425659793028e-06, + "logits/chosen": 2203224356.571429, + "logits/rejected": 1597955537.4545455, + "logps/chosen": -301.1664574032738, + "logps/rejected": -371.5704456676136, + "loss": 0.2115, + "rewards/chosen": 1.0718748910086495, + "rewards/margins": 7.948622715937627, + "rewards/rejected": -6.8767478249289775, + "step": 2116 + }, + { + "epoch": 0.7815052374140556, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 1.1693574814508657e-06, + "logits/chosen": 2271681024.0, + "logits/rejected": 2306313011.2, + "logps/chosen": -354.0397542317708, + "logps/rejected": -551.590234375, + "loss": 0.1052, + "rewards/chosen": 1.256086270014445, + "rewards/margins": 10.313297573725382, + "rewards/rejected": -9.057211303710938, + "step": 2117 + }, + { + "epoch": 0.7818743943518989, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 1.1655777042289724e-06, + "logits/chosen": 1273346420.3636363, + "logits/rejected": 1362985369.6, + "logps/chosen": -265.50978781960225, + "logps/rejected": -645.742138671875, + "loss": 0.1593, + "rewards/chosen": 1.7483104359019885, + "rewards/margins": 15.131628001819957, + "rewards/rejected": -13.383317565917968, + "step": 2118 + }, + { + "epoch": 0.7822435512897421, + "grad_norm": 14.625, + "kl": 0.40540599822998047, + "learning_rate": 1.161803239550452e-06, + "logits/chosen": 1912421760.0, + "logits/rejected": 2155990272.0, + "logps/chosen": -352.4815673828125, + "logps/rejected": -501.4020080566406, + "loss": 0.1414, + "rewards/chosen": 1.4339648485183716, + "rewards/margins": 9.227373719215393, + "rewards/rejected": -7.7934088706970215, + "step": 2119 + }, + { + "epoch": 0.7826127082275852, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 1.1580340926447797e-06, + "logits/chosen": 1634771316.3636363, + "logits/rejected": 1515953395.8095238, + "logps/chosen": -240.21775124289772, + "logps/rejected": -481.4310360863095, + "loss": 0.1114, + "rewards/chosen": 1.2349912470037288, + "rewards/margins": 8.543245670599338, + "rewards/rejected": -7.30825442359561, + "step": 2120 + }, + { + "epoch": 0.7829818651654284, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.1542702687340612e-06, + "logits/chosen": 1582313881.6, + "logits/rejected": 2011793769.4117646, + "logps/chosen": -289.14296875, + "logps/rejected": -516.9860983455883, + "loss": 0.1766, + "rewards/chosen": 0.6836233139038086, + "rewards/margins": 8.638850436491126, + "rewards/rejected": -7.955227122587316, + "step": 2121 + }, + { + "epoch": 0.7833510221032717, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 1.150511773033025e-06, + "logits/chosen": 1659531776.0, + "logits/rejected": 1617297612.8, + "logps/chosen": -313.4281005859375, + "logps/rejected": -550.017578125, + "loss": 0.0891, + "rewards/chosen": 2.096837361653646, + "rewards/margins": 10.141339619954428, + "rewards/rejected": -8.044502258300781, + "step": 2122 + }, + { + "epoch": 0.7837201790411149, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 1.1467586107490202e-06, + "logits/chosen": 1916028245.3333333, + "logits/rejected": 2139765579.2941177, + "logps/chosen": -405.83424479166666, + "logps/rejected": -455.9104434742647, + "loss": 0.1498, + "rewards/chosen": 1.2896779378255208, + "rewards/margins": 9.014205633425245, + "rewards/rejected": -7.724527695599725, + "step": 2123 + }, + { + "epoch": 0.784089335978958, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 1.143010787082006e-06, + "logits/chosen": 1666315392.0, + "logits/rejected": 2108586752.0, + "logps/chosen": -185.77969360351562, + "logps/rejected": -556.0291137695312, + "loss": 0.147, + "rewards/chosen": 1.490041971206665, + "rewards/margins": 10.566503763198853, + "rewards/rejected": -9.076461791992188, + "step": 2124 + }, + { + "epoch": 0.7844584929168013, + "grad_norm": 14.6875, + "kl": 0.09320878982543945, + "learning_rate": 1.139268307224543e-06, + "logits/chosen": 2287349581.9130435, + "logits/rejected": 1767121123.5555556, + "logps/chosen": -255.02662194293478, + "logps/rejected": -445.91663953993054, + "loss": 0.2055, + "rewards/chosen": 1.3409176702084749, + "rewards/margins": 7.798617763795714, + "rewards/rejected": -6.457700093587239, + "step": 2125 + }, + { + "epoch": 0.7848276498546445, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.13553117636179e-06, + "logits/chosen": 1668549812.7058823, + "logits/rejected": 1356975172.2666667, + "logps/chosen": -290.9557100183824, + "logps/rejected": -452.66949869791665, + "loss": 0.1289, + "rewards/chosen": 2.160562851849724, + "rewards/margins": 9.526698512657015, + "rewards/rejected": -7.366135660807291, + "step": 2126 + }, + { + "epoch": 0.7851968067924877, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.131799399671496e-06, + "logits/chosen": 1735426304.0, + "logits/rejected": 2424764672.0, + "logps/chosen": -248.26498413085938, + "logps/rejected": -604.7933959960938, + "loss": 0.1319, + "rewards/chosen": 1.7885369062423706, + "rewards/margins": 9.70484721660614, + "rewards/rejected": -7.9163103103637695, + "step": 2127 + }, + { + "epoch": 0.7855659637303308, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 1.1280729823239872e-06, + "logits/chosen": 1929404656.9411764, + "logits/rejected": 2402213341.866667, + "logps/chosen": -305.65076401654414, + "logps/rejected": -499.26588541666666, + "loss": 0.1802, + "rewards/chosen": 0.863837859209846, + "rewards/margins": 8.923859831866096, + "rewards/rejected": -8.06002197265625, + "step": 2128 + }, + { + "epoch": 0.7859351206681741, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.1243519294821693e-06, + "logits/chosen": 2158078510.5454545, + "logits/rejected": 1751424243.8095238, + "logps/chosen": -335.9339710582386, + "logps/rejected": -398.8560267857143, + "loss": 0.1, + "rewards/chosen": 1.7850001942027698, + "rewards/margins": 8.567956272141759, + "rewards/rejected": -6.782956077938988, + "step": 2129 + }, + { + "epoch": 0.7863042776060173, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 1.1206362463015146e-06, + "logits/chosen": 1641980691.6923077, + "logits/rejected": 1933989133.4736843, + "logps/chosen": -254.8580603966346, + "logps/rejected": -355.37240439967104, + "loss": 0.1431, + "rewards/chosen": 1.0959196090698242, + "rewards/margins": 6.935558168511641, + "rewards/rejected": -5.839638559441817, + "step": 2130 + }, + { + "epoch": 0.7866734345438604, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.1169259379300524e-06, + "logits/chosen": 1612942336.0, + "logits/rejected": 1408450332.4444444, + "logps/chosen": -285.51754324776783, + "logps/rejected": -414.4538845486111, + "loss": 0.146, + "rewards/chosen": 1.0097789083208357, + "rewards/margins": 8.292112282344274, + "rewards/rejected": -7.2823333740234375, + "step": 2131 + }, + { + "epoch": 0.7870425914817036, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.1132210095083696e-06, + "logits/chosen": 1398566353.4545455, + "logits/rejected": 1432116019.2, + "logps/chosen": -214.5286532315341, + "logps/rejected": -348.2657958984375, + "loss": 0.2253, + "rewards/chosen": 1.0093212994662197, + "rewards/margins": 7.837964335354892, + "rewards/rejected": -6.828643035888672, + "step": 2132 + }, + { + "epoch": 0.7874117484195469, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.1095214661695985e-06, + "logits/chosen": 1566108020.3636363, + "logits/rejected": 1917627801.6, + "logps/chosen": -231.8875177556818, + "logps/rejected": -524.026416015625, + "loss": 0.1608, + "rewards/chosen": 1.717980298128995, + "rewards/margins": 10.55288630398837, + "rewards/rejected": -8.834906005859375, + "step": 2133 + }, + { + "epoch": 0.7877809053573901, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 1.1058273130394075e-06, + "logits/chosen": 1344210944.0, + "logits/rejected": 1989409738.1052632, + "logps/chosen": -270.01695838341345, + "logps/rejected": -549.825349506579, + "loss": 0.084, + "rewards/chosen": 1.9236715756929839, + "rewards/margins": 10.743314295162556, + "rewards/rejected": -8.819642719469572, + "step": 2134 + }, + { + "epoch": 0.7881500622952332, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 1.1021385552359982e-06, + "logits/chosen": 1615123163.4285715, + "logits/rejected": 1665027185.7777777, + "logps/chosen": -277.03995186941967, + "logps/rejected": -530.6915147569445, + "loss": 0.1293, + "rewards/chosen": 1.2842340469360352, + "rewards/margins": 10.039756880866157, + "rewards/rejected": -8.755522833930122, + "step": 2135 + }, + { + "epoch": 0.7885192192330764, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 1.0984551978701001e-06, + "logits/chosen": 1762699036.4444444, + "logits/rejected": 1873581056.0, + "logps/chosen": -264.49574110243054, + "logps/rejected": -609.1109095982143, + "loss": 0.1611, + "rewards/chosen": 1.5279376771714952, + "rewards/margins": 9.101227820865692, + "rewards/rejected": -7.573290143694196, + "step": 2136 + }, + { + "epoch": 0.7888883761709197, + "grad_norm": 11.3125, + "kl": 0.4724884033203125, + "learning_rate": 1.0947772460449558e-06, + "logits/chosen": 1430769800.5333333, + "logits/rejected": 1625487239.5294118, + "logps/chosen": -315.79521484375, + "logps/rejected": -458.32634420955884, + "loss": 0.0963, + "rewards/chosen": 2.2747952779134115, + "rewards/margins": 10.52536703371534, + "rewards/rejected": -8.25057175580193, + "step": 2137 + }, + { + "epoch": 0.7892575331087629, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.0911047048563212e-06, + "logits/chosen": 1564248576.0, + "logits/rejected": 1318154922.6666667, + "logps/chosen": -293.43603515625, + "logps/rejected": -567.2138264973959, + "loss": 0.1662, + "rewards/chosen": 1.2930603981018067, + "rewards/margins": 8.966344292958578, + "rewards/rejected": -7.6732838948567705, + "step": 2138 + }, + { + "epoch": 0.789626690046606, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 1.0874375793924575e-06, + "logits/chosen": 1731448149.3333333, + "logits/rejected": 1711015731.2, + "logps/chosen": -198.1775919596354, + "logps/rejected": -465.154736328125, + "loss": 0.14, + "rewards/chosen": 0.7425759633382162, + "rewards/margins": 7.519910558064779, + "rewards/rejected": -6.777334594726563, + "step": 2139 + }, + { + "epoch": 0.7899958469844492, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 1.0837758747341176e-06, + "logits/chosen": 2301348924.2352943, + "logits/rejected": 1284777437.8666666, + "logps/chosen": -286.23583984375, + "logps/rejected": -418.6337565104167, + "loss": 0.1938, + "rewards/chosen": 0.9447941499597886, + "rewards/margins": 8.383119141821767, + "rewards/rejected": -7.438324991861979, + "step": 2140 + }, + { + "epoch": 0.7903650039222925, + "grad_norm": 13.4375, + "kl": 0.47329235076904297, + "learning_rate": 1.0801195959545486e-06, + "logits/chosen": 1811425159.5294118, + "logits/rejected": 1666418278.4, + "logps/chosen": -326.9568876378676, + "logps/rejected": -550.8907552083333, + "loss": 0.1599, + "rewards/chosen": 1.1159837386187386, + "rewards/margins": 10.5276689192828, + "rewards/rejected": -9.411685180664062, + "step": 2141 + }, + { + "epoch": 0.7907341608601357, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.0764687481194786e-06, + "logits/chosen": 1920406869.3333333, + "logits/rejected": 2057775689.142857, + "logps/chosen": -307.5798068576389, + "logps/rejected": -463.77396065848217, + "loss": 0.1324, + "rewards/chosen": 1.8217654758029513, + "rewards/margins": 9.698861924428789, + "rewards/rejected": -7.877096448625837, + "step": 2142 + }, + { + "epoch": 0.7911033177979788, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 1.0728233362871087e-06, + "logits/chosen": 1423374823.6190476, + "logits/rejected": 1459926295.2727273, + "logps/chosen": -212.15518043154762, + "logps/rejected": -427.9959605823864, + "loss": 0.1896, + "rewards/chosen": 1.770999545142764, + "rewards/margins": 9.740399430840561, + "rewards/rejected": -7.969399885697798, + "step": 2143 + }, + { + "epoch": 0.791472474735822, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.0691833655081124e-06, + "logits/chosen": 2652236137.4117646, + "logits/rejected": 1466845320.5333333, + "logps/chosen": -307.37353515625, + "logps/rejected": -479.64964192708334, + "loss": 0.1351, + "rewards/chosen": 1.6141483082490808, + "rewards/margins": 9.017488457174862, + "rewards/rejected": -7.403340148925781, + "step": 2144 + }, + { + "epoch": 0.7918416316736653, + "grad_norm": 11.625, + "kl": 2.956151008605957, + "learning_rate": 1.0655488408256243e-06, + "logits/chosen": 1751860292.2666667, + "logits/rejected": 1917332660.7058823, + "logps/chosen": -218.73834635416668, + "logps/rejected": -515.8008961397059, + "loss": 0.2008, + "rewards/chosen": 0.6363428115844727, + "rewards/margins": 8.50607224632712, + "rewards/rejected": -7.869729434742647, + "step": 2145 + }, + { + "epoch": 0.7922107886115085, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.0619197672752285e-06, + "logits/chosen": 1462889585.7777777, + "logits/rejected": 1584917459.4782608, + "logps/chosen": -319.048583984375, + "logps/rejected": -534.2581946331521, + "loss": 0.0572, + "rewards/chosen": 2.354755401611328, + "rewards/margins": 11.368802775507389, + "rewards/rejected": -9.01404737389606, + "step": 2146 + }, + { + "epoch": 0.7925799455493516, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 1.058296149884963e-06, + "logits/chosen": 1167286551.2727273, + "logits/rejected": 1528973994.6666667, + "logps/chosen": -231.3524724786932, + "logps/rejected": -512.0945870535714, + "loss": 0.0811, + "rewards/chosen": 2.1403995860706675, + "rewards/margins": 10.883541123691575, + "rewards/rejected": -8.743141537620907, + "step": 2147 + }, + { + "epoch": 0.7929491024871949, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 1.0546779936753037e-06, + "logits/chosen": 1556781056.0, + "logits/rejected": 1519836091.7333333, + "logps/chosen": -321.21875, + "logps/rejected": -393.6167317708333, + "loss": 0.1583, + "rewards/chosen": 1.4540210050695084, + "rewards/margins": 8.97680949790805, + "rewards/rejected": -7.522788492838542, + "step": 2148 + }, + { + "epoch": 0.7933182594250381, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 1.0510653036591583e-06, + "logits/chosen": 1133023697.4545455, + "logits/rejected": 1987925528.3809524, + "logps/chosen": -247.78486772017047, + "logps/rejected": -633.7120535714286, + "loss": 0.1026, + "rewards/chosen": 1.3745264573530718, + "rewards/margins": 18.112008437449795, + "rewards/rejected": -16.737481980096724, + "step": 2149 + }, + { + "epoch": 0.7936874163628813, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.0474580848418643e-06, + "logits/chosen": 1400624790.5882354, + "logits/rejected": 1580587827.2, + "logps/chosen": -285.51754940257354, + "logps/rejected": -535.4962890625, + "loss": 0.1562, + "rewards/chosen": 1.0662593841552734, + "rewards/margins": 10.124297714233398, + "rewards/rejected": -9.058038330078125, + "step": 2150 + }, + { + "epoch": 0.7940565733007244, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 1.0438563422211784e-06, + "logits/chosen": 1974573875.2, + "logits/rejected": 2143817728.0, + "logps/chosen": -270.6696533203125, + "logps/rejected": -740.5305989583334, + "loss": 0.1409, + "rewards/chosen": 1.5236953735351562, + "rewards/margins": 13.086130523681641, + "rewards/rejected": -11.562435150146484, + "step": 2151 + }, + { + "epoch": 0.7944257302385677, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 1.0402600807872676e-06, + "logits/chosen": 1588163291.4285715, + "logits/rejected": 1572457358.2222223, + "logps/chosen": -245.01032366071428, + "logps/rejected": -546.9867621527778, + "loss": 0.1273, + "rewards/chosen": 1.3038132531302316, + "rewards/margins": 9.981727342756967, + "rewards/rejected": -8.677914089626736, + "step": 2152 + }, + { + "epoch": 0.7947948871764109, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 1.0366693055227063e-06, + "logits/chosen": 1527320883.2, + "logits/rejected": 1914518528.0, + "logps/chosen": -284.924658203125, + "logps/rejected": -417.2930094401042, + "loss": 0.1582, + "rewards/chosen": 1.4870843887329102, + "rewards/margins": 7.901366233825684, + "rewards/rejected": -6.414281845092773, + "step": 2153 + }, + { + "epoch": 0.7951640441142541, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 1.033084021402468e-06, + "logits/chosen": 1535388525.7142856, + "logits/rejected": 2052172458.6666667, + "logps/chosen": -256.46534946986606, + "logps/rejected": -731.9357096354166, + "loss": 0.132, + "rewards/chosen": 1.18899781363351, + "rewards/margins": 27.36370755755712, + "rewards/rejected": -26.17470974392361, + "step": 2154 + }, + { + "epoch": 0.7955332010520972, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 1.0295042333939204e-06, + "logits/chosen": 2378730642.285714, + "logits/rejected": 1467363100.4444444, + "logps/chosen": -310.45472935267856, + "logps/rejected": -468.06776258680554, + "loss": 0.1565, + "rewards/chosen": 0.7424507822309222, + "rewards/margins": 9.46723817643665, + "rewards/rejected": -8.724787394205729, + "step": 2155 + }, + { + "epoch": 0.7959023579899405, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 1.0259299464568112e-06, + "logits/chosen": 1370636408.4705882, + "logits/rejected": 1223687782.4, + "logps/chosen": -321.8867761948529, + "logps/rejected": -363.42421875, + "loss": 0.1564, + "rewards/chosen": 1.9227891809800093, + "rewards/margins": 9.280984167959176, + "rewards/rejected": -7.358194986979167, + "step": 2156 + }, + { + "epoch": 0.7962715149277837, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 1.0223611655432713e-06, + "logits/chosen": 2038635373.7142856, + "logits/rejected": 2011491048.7272727, + "logps/chosen": -323.2655319940476, + "logps/rejected": -440.01802201704544, + "loss": 0.1916, + "rewards/chosen": 1.5339293706984747, + "rewards/margins": 10.545698058553588, + "rewards/rejected": -9.011768687855113, + "step": 2157 + }, + { + "epoch": 0.7966406718656269, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 1.0187978955978028e-06, + "logits/chosen": 1505409994.1052632, + "logits/rejected": 1601845878.1538463, + "logps/chosen": -183.44148334703948, + "logps/rejected": -451.71011117788464, + "loss": 0.1971, + "rewards/chosen": 1.2321148922568874, + "rewards/margins": 10.67469137400268, + "rewards/rejected": -9.442576481745792, + "step": 2158 + }, + { + "epoch": 0.79700982880347, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 1.0152401415572677e-06, + "logits/chosen": 1761092186.3529413, + "logits/rejected": 1483552631.4666667, + "logps/chosen": -301.11038028492646, + "logps/rejected": -465.6580403645833, + "loss": 0.1186, + "rewards/chosen": 2.018575107350069, + "rewards/margins": 10.140547254973766, + "rewards/rejected": -8.121972147623698, + "step": 2159 + }, + { + "epoch": 0.7973789857413133, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 1.0116879083508908e-06, + "logits/chosen": 1572166144.0, + "logits/rejected": 2132218197.3333333, + "logps/chosen": -287.95224609375, + "logps/rejected": -355.5868733723958, + "loss": 0.1776, + "rewards/chosen": 1.3507423400878906, + "rewards/margins": 7.554182688395183, + "rewards/rejected": -6.203440348307292, + "step": 2160 + }, + { + "epoch": 0.7977481426791565, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 1.0081412009002466e-06, + "logits/chosen": 1735533568.0, + "logits/rejected": 1361812480.0, + "logps/chosen": -251.6065673828125, + "logps/rejected": -460.9575602213542, + "loss": 0.1664, + "rewards/chosen": 1.8688716888427734, + "rewards/margins": 10.90865707397461, + "rewards/rejected": -9.039785385131836, + "step": 2161 + }, + { + "epoch": 0.7981172996169997, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.0046000241192516e-06, + "logits/chosen": 1642267105.8823528, + "logits/rejected": 1597251310.9333334, + "logps/chosen": -280.70826631433823, + "logps/rejected": -527.4910481770834, + "loss": 0.1156, + "rewards/chosen": 1.8576963088091683, + "rewards/margins": 9.999160033581305, + "rewards/rejected": -8.141463724772136, + "step": 2162 + }, + { + "epoch": 0.7984864565548428, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.0010643829141624e-06, + "logits/chosen": 1335953134.9333334, + "logits/rejected": 1254956694.5882354, + "logps/chosen": -224.051806640625, + "logps/rejected": -397.82453469669116, + "loss": 0.1634, + "rewards/chosen": 1.0238357543945313, + "rewards/margins": 8.973808468089384, + "rewards/rejected": -7.949972713694853, + "step": 2163 + }, + { + "epoch": 0.7988556134926861, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.975342821835654e-07, + "logits/chosen": 2821326555.428571, + "logits/rejected": 2708558734.2222223, + "logps/chosen": -308.8710239955357, + "logps/rejected": -505.3097330729167, + "loss": 0.1107, + "rewards/chosen": 1.4779868807111467, + "rewards/margins": 10.321690256633456, + "rewards/rejected": -8.84370337592231, + "step": 2164 + }, + { + "epoch": 0.7992247704305293, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.94009726818369e-07, + "logits/chosen": 1935049547.2941177, + "logits/rejected": 1586535765.3333333, + "logps/chosen": -308.8475126378676, + "logps/rejected": -399.35966796875, + "loss": 0.1446, + "rewards/chosen": 1.3767098819508272, + "rewards/margins": 8.534067670036764, + "rewards/rejected": -7.157357788085937, + "step": 2165 + }, + { + "epoch": 0.7995939273683724, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.904907217018e-07, + "logits/chosen": 1521098922.6666667, + "logits/rejected": 1893562368.0, + "logps/chosen": -217.7696329752604, + "logps/rejected": -469.833349609375, + "loss": 0.0864, + "rewards/chosen": 1.9619135856628418, + "rewards/margins": 9.757936382293702, + "rewards/rejected": -7.79602279663086, + "step": 2166 + }, + { + "epoch": 0.7999630843062157, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.869772717093974e-07, + "logits/chosen": 1583157504.0, + "logits/rejected": 1364822272.0, + "logps/chosen": -293.1698913574219, + "logps/rejected": -422.4199523925781, + "loss": 0.1736, + "rewards/chosen": 0.9990107417106628, + "rewards/margins": 8.982476890087128, + "rewards/rejected": -7.983466148376465, + "step": 2167 + }, + { + "epoch": 0.8003322412440589, + "grad_norm": 13.3125, + "kl": 0.6712570190429688, + "learning_rate": 9.834693817089996e-07, + "logits/chosen": 1889860985.2631578, + "logits/rejected": 1911802801.2307692, + "logps/chosen": -276.3702456825658, + "logps/rejected": -452.89107572115387, + "loss": 0.1909, + "rewards/chosen": 0.9691461261950041, + "rewards/margins": 9.611243012463031, + "rewards/rejected": -8.642096886268028, + "step": 2168 + }, + { + "epoch": 0.8007013981819021, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.799670565607427e-07, + "logits/chosen": 2044885401.6, + "logits/rejected": 1788227463.5294118, + "logps/chosen": -270.6243489583333, + "logps/rejected": -566.0275160845588, + "loss": 0.1525, + "rewards/chosen": 0.956959597269694, + "rewards/margins": 10.343701684241202, + "rewards/rejected": -9.386742086971507, + "step": 2169 + }, + { + "epoch": 0.8010705551197452, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.764703011170568e-07, + "logits/chosen": 1777840947.2, + "logits/rejected": 1896904944.9411764, + "logps/chosen": -292.8211263020833, + "logps/rejected": -526.5515854779412, + "loss": 0.0862, + "rewards/chosen": 2.147962697347005, + "rewards/margins": 10.438210895014743, + "rewards/rejected": -8.290248197667738, + "step": 2170 + }, + { + "epoch": 0.8014397120575885, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.729791202226484e-07, + "logits/chosen": 2517132083.2, + "logits/rejected": 2666882730.6666665, + "logps/chosen": -359.5236328125, + "logps/rejected": -417.093994140625, + "loss": 0.2224, + "rewards/chosen": 0.9769222259521484, + "rewards/margins": 7.254006576538086, + "rewards/rejected": -6.2770843505859375, + "step": 2171 + }, + { + "epoch": 0.8018088689954317, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.69493518714506e-07, + "logits/chosen": 1838926896.7619047, + "logits/rejected": 1888096256.0, + "logps/chosen": -283.52320498511904, + "logps/rejected": -321.61148348721593, + "loss": 0.2038, + "rewards/chosen": 1.2469618661063058, + "rewards/margins": 6.5190798276430595, + "rewards/rejected": -5.272117961536754, + "step": 2172 + }, + { + "epoch": 0.8021780259332749, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.66013501421888e-07, + "logits/chosen": 1077729652.3636363, + "logits/rejected": 1587120225.5238094, + "logps/chosen": -228.80495383522728, + "logps/rejected": -593.3181733630952, + "loss": 0.0841, + "rewards/chosen": 1.8013621243563565, + "rewards/margins": 11.423359775956058, + "rewards/rejected": -9.621997651599703, + "step": 2173 + }, + { + "epoch": 0.802547182871118, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.625390731663114e-07, + "logits/chosen": 1583933337.6, + "logits/rejected": 1776379904.0, + "logps/chosen": -334.64931640625, + "logps/rejected": -573.6937255859375, + "loss": 0.162, + "rewards/chosen": 1.567155933380127, + "rewards/margins": 12.24744234085083, + "rewards/rejected": -10.680286407470703, + "step": 2174 + }, + { + "epoch": 0.8029163398089613, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.59070238761553e-07, + "logits/chosen": 1783710573.7142856, + "logits/rejected": 2434910435.5555553, + "logps/chosen": -310.02835518973217, + "logps/rejected": -529.7412651909722, + "loss": 0.1171, + "rewards/chosen": 1.5648961748395647, + "rewards/margins": 11.594220176575675, + "rewards/rejected": -10.02932400173611, + "step": 2175 + }, + { + "epoch": 0.8032854967468045, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.556070030136411e-07, + "logits/chosen": 1764773497.9047618, + "logits/rejected": 2411590376.7272725, + "logps/chosen": -324.5560360863095, + "logps/rejected": -555.7627840909091, + "loss": 0.1648, + "rewards/chosen": 1.6364704313732328, + "rewards/margins": 12.708194369361514, + "rewards/rejected": -11.071723937988281, + "step": 2176 + }, + { + "epoch": 0.8036546536846477, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.521493707208412e-07, + "logits/chosen": 1521069959.5294118, + "logits/rejected": 1329541393.0666666, + "logps/chosen": -266.5746495863971, + "logps/rejected": -535.1810872395833, + "loss": 0.1938, + "rewards/chosen": 0.849915111766142, + "rewards/margins": 9.023707632924996, + "rewards/rejected": -8.173792521158854, + "step": 2177 + }, + { + "epoch": 0.8040238106224908, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.48697346673661e-07, + "logits/chosen": 2360387268.923077, + "logits/rejected": 1485234499.368421, + "logps/chosen": -340.94200721153845, + "logps/rejected": -427.8266344572368, + "loss": 0.1101, + "rewards/chosen": 1.681122559767503, + "rewards/margins": 8.714964144625645, + "rewards/rejected": -7.033841584858141, + "step": 2178 + }, + { + "epoch": 0.8043929675603341, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.452509356548367e-07, + "logits/chosen": 1626941952.0, + "logits/rejected": 2307097395.2, + "logps/chosen": -374.3042399088542, + "logps/rejected": -515.6525390625, + "loss": 0.1188, + "rewards/chosen": 0.9493025143941244, + "rewards/margins": 9.339968713124593, + "rewards/rejected": -8.390666198730468, + "step": 2179 + }, + { + "epoch": 0.8047621244981773, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.41810142439325e-07, + "logits/chosen": 2318132709.0526314, + "logits/rejected": 2212241408.0, + "logps/chosen": -268.5389854029605, + "logps/rejected": -655.9038461538462, + "loss": 0.1611, + "rewards/chosen": 1.5220589888723273, + "rewards/margins": 14.303735061213073, + "rewards/rejected": -12.781676072340746, + "step": 2180 + }, + { + "epoch": 0.8051312814360205, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.383749717943024e-07, + "logits/chosen": 1539077771.6363637, + "logits/rejected": 1592852684.8, + "logps/chosen": -328.51273970170456, + "logps/rejected": -571.217822265625, + "loss": 0.178, + "rewards/chosen": 1.5040109807794744, + "rewards/margins": 7.802318399602717, + "rewards/rejected": -6.298307418823242, + "step": 2181 + }, + { + "epoch": 0.8055004383738636, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.349454284791549e-07, + "logits/chosen": 1643761348.9230769, + "logits/rejected": 2513979499.7894735, + "logps/chosen": -359.49819711538464, + "logps/rejected": -490.24830386513156, + "loss": 0.1347, + "rewards/chosen": 1.0618402774517353, + "rewards/margins": 8.206704008434466, + "rewards/rejected": -7.14486373098273, + "step": 2182 + }, + { + "epoch": 0.8058695953117069, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.315215172454689e-07, + "logits/chosen": 1557577591.4666667, + "logits/rejected": 1442498078.1176472, + "logps/chosen": -303.25891927083336, + "logps/rejected": -407.9158720128676, + "loss": 0.1318, + "rewards/chosen": 1.4630953470865886, + "rewards/margins": 8.160126704795688, + "rewards/rejected": -6.6970313577091, + "step": 2183 + }, + { + "epoch": 0.8062387522495501, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.281032428370318e-07, + "logits/chosen": 1936145920.0, + "logits/rejected": 1576789376.0, + "logps/chosen": -297.2735290527344, + "logps/rejected": -451.94500732421875, + "loss": 0.136, + "rewards/chosen": 1.5717835426330566, + "rewards/margins": 7.973571300506592, + "rewards/rejected": -6.401787757873535, + "step": 2184 + }, + { + "epoch": 0.8066079091873933, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.246906099898196e-07, + "logits/chosen": 1849713225.142857, + "logits/rejected": 2013171939.5555556, + "logps/chosen": -336.5219029017857, + "logps/rejected": -504.22705078125, + "loss": 0.1294, + "rewards/chosen": 1.5801647731236048, + "rewards/margins": 8.336325509207589, + "rewards/rejected": -6.756160736083984, + "step": 2185 + }, + { + "epoch": 0.8069770661252365, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.212836234319911e-07, + "logits/chosen": 1703882524.4444444, + "logits/rejected": 1552820516.5714285, + "logps/chosen": -226.53084309895834, + "logps/rejected": -330.9081333705357, + "loss": 0.1237, + "rewards/chosen": 2.101261986626519, + "rewards/margins": 8.644369155641586, + "rewards/rejected": -6.543107169015067, + "step": 2186 + }, + { + "epoch": 0.8073462230630797, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.178822878838817e-07, + "logits/chosen": 1955670289.0666666, + "logits/rejected": 1424360749.1764705, + "logps/chosen": -232.074609375, + "logps/rejected": -613.8466222426471, + "loss": 0.1301, + "rewards/chosen": 1.3803258260091147, + "rewards/margins": 12.347646885292203, + "rewards/rejected": -10.967321059283089, + "step": 2187 + }, + { + "epoch": 0.8077153800009229, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.144866080579995e-07, + "logits/chosen": 1680780288.0, + "logits/rejected": 2005036160.0, + "logps/chosen": -308.54656982421875, + "logps/rejected": -466.43658447265625, + "loss": 0.1402, + "rewards/chosen": 1.2740952968597412, + "rewards/margins": 9.326094388961792, + "rewards/rejected": -8.05199909210205, + "step": 2188 + }, + { + "epoch": 0.8080845369387661, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.110965886590167e-07, + "logits/chosen": 1762161664.0, + "logits/rejected": 1866357191.1111112, + "logps/chosen": -226.72739955357142, + "logps/rejected": -671.59326171875, + "loss": 0.1279, + "rewards/chosen": 1.2159690856933594, + "rewards/margins": 12.805528852674696, + "rewards/rejected": -11.589559766981337, + "step": 2189 + }, + { + "epoch": 0.8084536938766093, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.077122343837608e-07, + "logits/chosen": 1182996224.0, + "logits/rejected": 1165766784.0, + "logps/chosen": -230.0865020751953, + "logps/rejected": -527.5303955078125, + "loss": 0.0921, + "rewards/chosen": 2.1966946125030518, + "rewards/margins": 9.829431772232056, + "rewards/rejected": -7.632737159729004, + "step": 2190 + }, + { + "epoch": 0.8088228508144525, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.043335499212119e-07, + "logits/chosen": 1959007232.0, + "logits/rejected": 2572397909.3333335, + "logps/chosen": -272.0761474609375, + "logps/rejected": -562.954833984375, + "loss": 0.229, + "rewards/chosen": 0.6635544776916504, + "rewards/margins": 8.53498519261678, + "rewards/rejected": -7.87143071492513, + "step": 2191 + }, + { + "epoch": 0.8091920077522957, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.009605399524957e-07, + "logits/chosen": 1898074543.1578948, + "logits/rejected": 1440864413.5384614, + "logps/chosen": -297.2524157072368, + "logps/rejected": -560.0905573918269, + "loss": 0.1589, + "rewards/chosen": 1.4157122561806126, + "rewards/margins": 12.730312100306213, + "rewards/rejected": -11.3145998441256, + "step": 2192 + }, + { + "epoch": 0.809561164690139, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.975932091508727e-07, + "logits/chosen": 2167793956.571429, + "logits/rejected": 1351862365.090909, + "logps/chosen": -278.6380208333333, + "logps/rejected": -512.8408203125, + "loss": 0.1623, + "rewards/chosen": 1.6517321268717449, + "rewards/margins": 9.359802361690637, + "rewards/rejected": -7.708070234818892, + "step": 2193 + }, + { + "epoch": 0.8099303216279821, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.942315621817377e-07, + "logits/chosen": 1764804715.7894738, + "logits/rejected": 2246970131.6923075, + "logps/chosen": -280.04042454769734, + "logps/rejected": -521.5290715144231, + "loss": 0.1435, + "rewards/chosen": 1.6642152886641652, + "rewards/margins": 9.838086255648841, + "rewards/rejected": -8.173870966984676, + "step": 2194 + }, + { + "epoch": 0.8102994785658253, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 8.908756037026112e-07, + "logits/chosen": 2265536256.0, + "logits/rejected": 2820118016.0, + "logps/chosen": -329.4911804199219, + "logps/rejected": -449.8356018066406, + "loss": 0.1405, + "rewards/chosen": 1.644680142402649, + "rewards/margins": 10.285784840583801, + "rewards/rejected": -8.641104698181152, + "step": 2195 + }, + { + "epoch": 0.8106686355036685, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.875253383631288e-07, + "logits/chosen": 1601877284.5714285, + "logits/rejected": 1571152668.4444444, + "logps/chosen": -251.97898646763392, + "logps/rejected": -410.21137152777777, + "loss": 0.1491, + "rewards/chosen": 1.4518996647426061, + "rewards/margins": 7.869853579808795, + "rewards/rejected": -6.417953915066189, + "step": 2196 + }, + { + "epoch": 0.8110377924415118, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 8.841807708050415e-07, + "logits/chosen": 1690838129.7777777, + "logits/rejected": 1539441371.4285715, + "logps/chosen": -282.1701931423611, + "logps/rejected": -490.95912388392856, + "loss": 0.2061, + "rewards/chosen": 0.7340145111083984, + "rewards/margins": 7.417731421334403, + "rewards/rejected": -6.683716910226004, + "step": 2197 + }, + { + "epoch": 0.8114069493793549, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.808419056622064e-07, + "logits/chosen": 1722432090.3529413, + "logits/rejected": 1614984942.9333334, + "logps/chosen": -269.81275850183823, + "logps/rejected": -517.7045572916667, + "loss": 0.1621, + "rewards/chosen": 1.2260669259464039, + "rewards/margins": 8.148015168133904, + "rewards/rejected": -6.9219482421875, + "step": 2198 + }, + { + "epoch": 0.8117761063171981, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 8.775087475605765e-07, + "logits/chosen": 2413795141.818182, + "logits/rejected": 1552044032.0, + "logps/chosen": -226.38423295454547, + "logps/rejected": -480.3802083333333, + "loss": 0.0994, + "rewards/chosen": 1.5079777457497336, + "rewards/margins": 10.053525231101297, + "rewards/rejected": -8.545547485351562, + "step": 2199 + }, + { + "epoch": 0.8121452632550413, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.741813011182015e-07, + "logits/chosen": 1880479744.0, + "logits/rejected": 2317558852.266667, + "logps/chosen": -305.46923828125, + "logps/rejected": -408.01611328125, + "loss": 0.1402, + "rewards/chosen": 1.6687148599063648, + "rewards/margins": 9.109659277224074, + "rewards/rejected": -7.440944417317708, + "step": 2200 + }, + { + "epoch": 0.8125144201928844, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.708595709452166e-07, + "logits/chosen": 2129551600.9411764, + "logits/rejected": 1920789708.8, + "logps/chosen": -278.21725643382354, + "logps/rejected": -632.4718098958333, + "loss": 0.1681, + "rewards/chosen": 0.9954329097972197, + "rewards/margins": 9.884851448208678, + "rewards/rejected": -8.889418538411459, + "step": 2201 + }, + { + "epoch": 0.8128835771307277, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.675435616438349e-07, + "logits/chosen": 1211711624.5333333, + "logits/rejected": 1584061500.235294, + "logps/chosen": -249.432080078125, + "logps/rejected": -503.3366268382353, + "loss": 0.1626, + "rewards/chosen": 0.8829253514607748, + "rewards/margins": 9.953306067223643, + "rewards/rejected": -9.070380715762868, + "step": 2202 + }, + { + "epoch": 0.8132527340685709, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 8.642332778083473e-07, + "logits/chosen": 1954213660.4444444, + "logits/rejected": 2764190866.285714, + "logps/chosen": -261.58702256944446, + "logps/rejected": -486.32554408482144, + "loss": 0.1133, + "rewards/chosen": 1.8147739834255643, + "rewards/margins": 9.722403813922217, + "rewards/rejected": -7.907629830496652, + "step": 2203 + }, + { + "epoch": 0.8136218910064141, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 8.60928724025108e-07, + "logits/chosen": 1480958795.2941177, + "logits/rejected": 1393963281.0666666, + "logps/chosen": -220.18795955882354, + "logps/rejected": -438.9124348958333, + "loss": 0.1078, + "rewards/chosen": 2.4415696088005516, + "rewards/margins": 9.957842598709405, + "rewards/rejected": -7.516272989908854, + "step": 2204 + }, + { + "epoch": 0.8139910479442573, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.576299048725362e-07, + "logits/chosen": 2033661269.3333333, + "logits/rejected": 1870072738.909091, + "logps/chosen": -285.9882579985119, + "logps/rejected": -447.2080078125, + "loss": 0.1987, + "rewards/chosen": 1.070476350330171, + "rewards/margins": 7.903115177567387, + "rewards/rejected": -6.832638827237216, + "step": 2205 + }, + { + "epoch": 0.8143602048821005, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 8.543368249211015e-07, + "logits/chosen": 1615784228.5714285, + "logits/rejected": 1533081320.7272727, + "logps/chosen": -364.4990466889881, + "logps/rejected": -485.3890269886364, + "loss": 0.1972, + "rewards/chosen": 1.0916763487316312, + "rewards/margins": 8.917647778729856, + "rewards/rejected": -7.825971429998225, + "step": 2206 + }, + { + "epoch": 0.8147293618199437, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 8.510494887333276e-07, + "logits/chosen": 1383570578.2857144, + "logits/rejected": 1345683456.0, + "logps/chosen": -345.3506556919643, + "logps/rejected": -472.9454752604167, + "loss": 0.1145, + "rewards/chosen": 1.7457269941057478, + "rewards/margins": 11.161639198424325, + "rewards/rejected": -9.415912204318577, + "step": 2207 + }, + { + "epoch": 0.8150985187577869, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.477679008637735e-07, + "logits/chosen": 1662570268.4444444, + "logits/rejected": 1969219876.5714285, + "logps/chosen": -271.24205186631946, + "logps/rejected": -404.4187709263393, + "loss": 0.1444, + "rewards/chosen": 1.5695275200737848, + "rewards/margins": 9.031637222047836, + "rewards/rejected": -7.462109701974051, + "step": 2208 + }, + { + "epoch": 0.8154676756956301, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.444920658590388e-07, + "logits/chosen": 1322816102.4, + "logits/rejected": 1410954069.3333333, + "logps/chosen": -261.1537109375, + "logps/rejected": -433.2914632161458, + "loss": 0.2025, + "rewards/chosen": 1.2240297317504882, + "rewards/margins": 9.456203651428222, + "rewards/rejected": -8.232173919677734, + "step": 2209 + }, + { + "epoch": 0.8158368326334733, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.412219882577538e-07, + "logits/chosen": 1418360490.6666667, + "logits/rejected": 1699070390.857143, + "logps/chosen": -247.82579210069446, + "logps/rejected": -379.1536342075893, + "loss": 0.1747, + "rewards/chosen": 1.209618992275662, + "rewards/margins": 9.89083455100892, + "rewards/rejected": -8.681215558733259, + "step": 2210 + }, + { + "epoch": 0.8162059895713165, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 8.379576725905653e-07, + "logits/chosen": 2385824881.7777777, + "logits/rejected": 1722603373.7142856, + "logps/chosen": -288.4800075954861, + "logps/rejected": -427.7875279017857, + "loss": 0.1823, + "rewards/chosen": 1.1556208928426106, + "rewards/margins": 7.23282546088809, + "rewards/rejected": -6.07720456804548, + "step": 2211 + }, + { + "epoch": 0.8165751465091597, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.346991233801438e-07, + "logits/chosen": 1670833453.1764705, + "logits/rejected": 1688464179.2, + "logps/chosen": -262.5830652573529, + "logps/rejected": -474.76393229166666, + "loss": 0.161, + "rewards/chosen": 1.633112402523265, + "rewards/margins": 9.34011313494514, + "rewards/rejected": -7.707000732421875, + "step": 2212 + }, + { + "epoch": 0.8169443034470029, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.314463451411681e-07, + "logits/chosen": 1777829888.0, + "logits/rejected": 1907932160.0, + "logps/chosen": -248.737939453125, + "logps/rejected": -474.4279378255208, + "loss": 0.1204, + "rewards/chosen": 1.960628128051758, + "rewards/margins": 10.785418446858724, + "rewards/rejected": -8.824790318806967, + "step": 2213 + }, + { + "epoch": 0.8173134603848461, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.281993423803192e-07, + "logits/chosen": 1446401077.8947368, + "logits/rejected": 2573453784.6153846, + "logps/chosen": -231.93210320723685, + "logps/rejected": -549.6843449519231, + "loss": 0.2144, + "rewards/chosen": 0.8936365027176706, + "rewards/margins": 9.687941570513644, + "rewards/rejected": -8.794305067795973, + "step": 2214 + }, + { + "epoch": 0.8176826173226893, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 8.249581195962792e-07, + "logits/chosen": 1735290772.2105262, + "logits/rejected": 1856411805.5384614, + "logps/chosen": -279.69873046875, + "logps/rejected": -612.9707782451923, + "loss": 0.1744, + "rewards/chosen": 1.533705761558131, + "rewards/margins": 10.156111767417507, + "rewards/rejected": -8.622406005859375, + "step": 2215 + }, + { + "epoch": 0.8180517742605325, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.217226812797225e-07, + "logits/chosen": 1527774976.0, + "logits/rejected": 1858428672.0, + "logps/chosen": -289.64154052734375, + "logps/rejected": -391.8369445800781, + "loss": 0.1338, + "rewards/chosen": 1.5392236709594727, + "rewards/margins": 10.028531074523926, + "rewards/rejected": -8.489307403564453, + "step": 2216 + }, + { + "epoch": 0.8184209311983757, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.18493031913305e-07, + "logits/chosen": 1769122514.8235295, + "logits/rejected": 1586558839.4666667, + "logps/chosen": -312.6122472426471, + "logps/rejected": -550.786328125, + "loss": 0.136, + "rewards/chosen": 1.5038372488582836, + "rewards/margins": 10.766296558754117, + "rewards/rejected": -9.262459309895833, + "step": 2217 + }, + { + "epoch": 0.8187900881362189, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 8.152691759716657e-07, + "logits/chosen": 1886655829.3333333, + "logits/rejected": 2714516299.2941175, + "logps/chosen": -197.56936848958333, + "logps/rejected": -480.21447035845586, + "loss": 0.1275, + "rewards/chosen": 1.3449381510416667, + "rewards/margins": 8.204883111691942, + "rewards/rejected": -6.859944960650275, + "step": 2218 + }, + { + "epoch": 0.8191592450740621, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 8.120511179214186e-07, + "logits/chosen": 1828387225.6, + "logits/rejected": 1955632790.5882354, + "logps/chosen": -207.69446614583333, + "logps/rejected": -457.0968807444853, + "loss": 0.1189, + "rewards/chosen": 1.7029767354329428, + "rewards/margins": 9.33823153925877, + "rewards/rejected": -7.635254803825827, + "step": 2219 + }, + { + "epoch": 0.8195284020119054, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.088388622211401e-07, + "logits/chosen": 2052278656.0, + "logits/rejected": 1780944640.0, + "logps/chosen": -373.2431335449219, + "logps/rejected": -439.8788146972656, + "loss": 0.1174, + "rewards/chosen": 1.7659285068511963, + "rewards/margins": 10.81023907661438, + "rewards/rejected": -9.044310569763184, + "step": 2220 + }, + { + "epoch": 0.8198975589497485, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.056324133213689e-07, + "logits/chosen": 1784013677.7142856, + "logits/rejected": 2294774442.6666665, + "logps/chosen": -341.57373046875, + "logps/rejected": -410.1357150607639, + "loss": 0.1417, + "rewards/chosen": 1.0776114463806152, + "rewards/margins": 8.24187315834893, + "rewards/rejected": -7.164261711968316, + "step": 2221 + }, + { + "epoch": 0.8202667158875917, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.024317756645999e-07, + "logits/chosen": 1817935360.0, + "logits/rejected": 2071260416.0, + "logps/chosen": -236.49761962890625, + "logps/rejected": -457.85565185546875, + "loss": 0.1495, + "rewards/chosen": 1.5565900802612305, + "rewards/margins": 7.718361854553223, + "rewards/rejected": -6.161771774291992, + "step": 2222 + }, + { + "epoch": 0.8206358728254349, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 7.992369536852773e-07, + "logits/chosen": 1689066657.6842105, + "logits/rejected": 1943023773.5384614, + "logps/chosen": -308.01197574013156, + "logps/rejected": -458.80472506009613, + "loss": 0.1607, + "rewards/chosen": 1.5627341019479852, + "rewards/margins": 9.526541428044741, + "rewards/rejected": -7.963807326096755, + "step": 2223 + }, + { + "epoch": 0.8210050297632782, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 7.960479518097841e-07, + "logits/chosen": 2096289280.0, + "logits/rejected": 1975169024.0, + "logps/chosen": -298.0325622558594, + "logps/rejected": -486.5216369628906, + "loss": 0.1755, + "rewards/chosen": 0.8029987215995789, + "rewards/margins": 8.005668342113495, + "rewards/rejected": -7.202669620513916, + "step": 2224 + }, + { + "epoch": 0.8213741867011213, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 7.928647744564427e-07, + "logits/chosen": 1824500531.2, + "logits/rejected": 1802159766.5882354, + "logps/chosen": -251.69095052083333, + "logps/rejected": -502.05032169117646, + "loss": 0.143, + "rewards/chosen": 1.7092620849609375, + "rewards/margins": 10.617019114774816, + "rewards/rejected": -8.90775702981388, + "step": 2225 + }, + { + "epoch": 0.8217433436389645, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.896874260355064e-07, + "logits/chosen": 2015968195.764706, + "logits/rejected": 1759994948.2666667, + "logps/chosen": -256.6178768382353, + "logps/rejected": -649.9142578125, + "loss": 0.1185, + "rewards/chosen": 2.0329141055836395, + "rewards/margins": 14.179908124138327, + "rewards/rejected": -12.146994018554688, + "step": 2226 + }, + { + "epoch": 0.8221125005768077, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.865159109491488e-07, + "logits/chosen": 1511447859.2, + "logits/rejected": 1785684138.6666667, + "logps/chosen": -237.346142578125, + "logps/rejected": -687.1680501302084, + "loss": 0.2039, + "rewards/chosen": 1.0539806365966797, + "rewards/margins": 11.585580317179362, + "rewards/rejected": -10.531599680582682, + "step": 2227 + }, + { + "epoch": 0.822481657514651, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.83350233591465e-07, + "logits/chosen": 1542614357.3333333, + "logits/rejected": 1903648768.0, + "logps/chosen": -271.4041748046875, + "logps/rejected": -501.086328125, + "loss": 0.1411, + "rewards/chosen": 1.041278600692749, + "rewards/margins": 9.756065893173218, + "rewards/rejected": -8.71478729248047, + "step": 2228 + }, + { + "epoch": 0.8228508144524941, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 7.801903983484616e-07, + "logits/chosen": 1481533253.8181818, + "logits/rejected": 2351916236.8, + "logps/chosen": -310.4486194957386, + "logps/rejected": -554.77587890625, + "loss": 0.24, + "rewards/chosen": 1.2889210094105115, + "rewards/margins": 11.496300159801137, + "rewards/rejected": -10.207379150390626, + "step": 2229 + }, + { + "epoch": 0.8232199713903373, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.770364095980481e-07, + "logits/chosen": 3001854464.0, + "logits/rejected": 2286489088.0, + "logps/chosen": -326.70806884765625, + "logps/rejected": -569.0914916992188, + "loss": 0.1467, + "rewards/chosen": 1.1517821550369263, + "rewards/margins": 11.681254506111145, + "rewards/rejected": -10.529472351074219, + "step": 2230 + }, + { + "epoch": 0.8235891283281805, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.738882717100365e-07, + "logits/chosen": 2450497828.571429, + "logits/rejected": 2311506602.6666665, + "logps/chosen": -355.13099888392856, + "logps/rejected": -615.9925672743055, + "loss": 0.0959, + "rewards/chosen": 1.994966779436384, + "rewards/margins": 12.096206301734561, + "rewards/rejected": -10.101239522298178, + "step": 2231 + }, + { + "epoch": 0.8239582852660238, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 7.707459890461338e-07, + "logits/chosen": 1715175628.8, + "logits/rejected": 1724950869.3333333, + "logps/chosen": -331.457080078125, + "logps/rejected": -368.3006591796875, + "loss": 0.1558, + "rewards/chosen": 1.667253303527832, + "rewards/margins": 8.713086255391438, + "rewards/rejected": -7.0458329518636065, + "step": 2232 + }, + { + "epoch": 0.8243274422038669, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.676095659599298e-07, + "logits/chosen": 1752444791.4666667, + "logits/rejected": 1931317248.0, + "logps/chosen": -311.6541015625, + "logps/rejected": -435.64338235294116, + "loss": 0.099, + "rewards/chosen": 2.198530578613281, + "rewards/margins": 9.647948141659008, + "rewards/rejected": -7.449417563045726, + "step": 2233 + }, + { + "epoch": 0.8246965991417101, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 7.644790067969005e-07, + "logits/chosen": 1336517120.0, + "logits/rejected": 1424582400.0, + "logps/chosen": -278.1907958984375, + "logps/rejected": -440.922119140625, + "loss": 0.1663, + "rewards/chosen": 1.472585678100586, + "rewards/margins": 9.130938529968262, + "rewards/rejected": -7.658352851867676, + "step": 2234 + }, + { + "epoch": 0.8250657560795533, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.613543158943965e-07, + "logits/chosen": 2030438400.0, + "logits/rejected": 1626752292.5714285, + "logps/chosen": -387.67578125, + "logps/rejected": -502.9978724888393, + "loss": 0.1492, + "rewards/chosen": 1.766116460164388, + "rewards/margins": 10.090236572992234, + "rewards/rejected": -8.324120112827845, + "step": 2235 + }, + { + "epoch": 0.8254349130173965, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 7.582354975816348e-07, + "logits/chosen": 1713451463.1111112, + "logits/rejected": 2069828315.4285715, + "logps/chosen": -260.1090494791667, + "logps/rejected": -421.0834263392857, + "loss": 0.2027, + "rewards/chosen": 0.920766724480523, + "rewards/margins": 8.80778029608348, + "rewards/rejected": -7.8870135716029575, + "step": 2236 + }, + { + "epoch": 0.8258040699552397, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.551225561797021e-07, + "logits/chosen": 1574060236.8, + "logits/rejected": 2328303957.3333335, + "logps/chosen": -244.1660888671875, + "logps/rejected": -505.9058837890625, + "loss": 0.1565, + "rewards/chosen": 1.5037993431091308, + "rewards/margins": 10.08902858098348, + "rewards/rejected": -8.58522923787435, + "step": 2237 + }, + { + "epoch": 0.8261732268930829, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.520154960015352e-07, + "logits/chosen": 2010011420.4444444, + "logits/rejected": 2149238491.428571, + "logps/chosen": -194.66151258680554, + "logps/rejected": -367.1657017299107, + "loss": 0.1934, + "rewards/chosen": 1.157101525200738, + "rewards/margins": 7.4735230339898004, + "rewards/rejected": -6.3164215087890625, + "step": 2238 + }, + { + "epoch": 0.8265423838309262, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.489143213519301e-07, + "logits/chosen": 1940938270.1176472, + "logits/rejected": 1709131366.4, + "logps/chosen": -256.80353860294116, + "logps/rejected": -534.3838216145833, + "loss": 0.1507, + "rewards/chosen": 1.4641228844137752, + "rewards/margins": 12.254200602512734, + "rewards/rejected": -10.790077718098958, + "step": 2239 + }, + { + "epoch": 0.8269115407687693, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 7.45819036527522e-07, + "logits/chosen": 1680972458.6666667, + "logits/rejected": 1999016374.857143, + "logps/chosen": -249.33102756076389, + "logps/rejected": -658.4595424107143, + "loss": 0.1565, + "rewards/chosen": 1.3546455171373155, + "rewards/margins": 12.636276986863878, + "rewards/rejected": -11.281631469726562, + "step": 2240 + }, + { + "epoch": 0.8272806977066125, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.427296458167898e-07, + "logits/chosen": 1333599653.6470587, + "logits/rejected": 1574782429.8666666, + "logps/chosen": -275.9919002757353, + "logps/rejected": -474.84557291666664, + "loss": 0.1435, + "rewards/chosen": 1.9745828965130974, + "rewards/margins": 11.740627004586013, + "rewards/rejected": -9.766044108072917, + "step": 2241 + }, + { + "epoch": 0.8276498546444557, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 7.396461535000471e-07, + "logits/chosen": 1553605193.142857, + "logits/rejected": 1645875996.4444444, + "logps/chosen": -290.59549386160717, + "logps/rejected": -384.34174262152777, + "loss": 0.1642, + "rewards/chosen": 1.0511916024344308, + "rewards/margins": 8.613861144535125, + "rewards/rejected": -7.562669542100695, + "step": 2242 + }, + { + "epoch": 0.828019011582299, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 7.365685638494297e-07, + "logits/chosen": 2042314898.2857144, + "logits/rejected": 1706810254.2222223, + "logps/chosen": -223.51693289620536, + "logps/rejected": -454.52802191840277, + "loss": 0.1683, + "rewards/chosen": 1.540688923427037, + "rewards/margins": 8.273762263948955, + "rewards/rejected": -6.733073340521918, + "step": 2243 + }, + { + "epoch": 0.8283881685201421, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.334968811289006e-07, + "logits/chosen": 1510812364.8, + "logits/rejected": 2203834740.3636365, + "logps/chosen": -355.1525634765625, + "logps/rejected": -499.5017755681818, + "loss": 0.1248, + "rewards/chosen": 0.857939338684082, + "rewards/margins": 9.00126930583607, + "rewards/rejected": -8.143329967151988, + "step": 2244 + }, + { + "epoch": 0.8287573254579853, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 7.30431109594239e-07, + "logits/chosen": 1344304429.1764705, + "logits/rejected": 2197092352.0, + "logps/chosen": -268.4548770680147, + "logps/rejected": -468.85244140625, + "loss": 0.1428, + "rewards/chosen": 1.6857810300939224, + "rewards/margins": 9.270689579084808, + "rewards/rejected": -7.584908548990885, + "step": 2245 + }, + { + "epoch": 0.8291264823958285, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 7.273712534930294e-07, + "logits/chosen": 2218219264.0, + "logits/rejected": 1844122368.0, + "logps/chosen": -332.7561950683594, + "logps/rejected": -428.0511169433594, + "loss": 0.1502, + "rewards/chosen": 1.8059831857681274, + "rewards/margins": 9.278272271156311, + "rewards/rejected": -7.472289085388184, + "step": 2246 + }, + { + "epoch": 0.8294956393336718, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.243173170646644e-07, + "logits/chosen": 1772062281.142857, + "logits/rejected": 1625378702.2222223, + "logps/chosen": -248.6826171875, + "logps/rejected": -477.7037760416667, + "loss": 0.1515, + "rewards/chosen": 0.9959030832563128, + "rewards/margins": 9.428416850074889, + "rewards/rejected": -8.432513766818577, + "step": 2247 + }, + { + "epoch": 0.8298647962715149, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 7.212693045403363e-07, + "logits/chosen": 2155220129.6842103, + "logits/rejected": 2149847670.1538463, + "logps/chosen": -250.2693513569079, + "logps/rejected": -443.42784705528845, + "loss": 0.1863, + "rewards/chosen": 1.4341834218878495, + "rewards/margins": 9.371479188865013, + "rewards/rejected": -7.937295766977163, + "step": 2248 + }, + { + "epoch": 0.8302339532093581, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.182272201430246e-07, + "logits/chosen": 2500371200.0, + "logits/rejected": 2083336960.0, + "logps/chosen": -325.41796875, + "logps/rejected": -350.01251220703125, + "loss": 0.138, + "rewards/chosen": 1.6141879558563232, + "rewards/margins": 8.418343782424927, + "rewards/rejected": -6.8041558265686035, + "step": 2249 + }, + { + "epoch": 0.8306031101472013, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 7.151910680875001e-07, + "logits/chosen": 1679212544.0, + "logits/rejected": 2274791424.0, + "logps/chosen": -279.5523035386029, + "logps/rejected": -417.6330078125, + "loss": 0.1962, + "rewards/chosen": 0.8684907240026137, + "rewards/margins": 7.818286256229176, + "rewards/rejected": -6.949795532226562, + "step": 2250 + }, + { + "epoch": 0.8309722670850446, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.121608525803142e-07, + "logits/chosen": 2145978855.6190476, + "logits/rejected": 1575568104.7272727, + "logps/chosen": -288.26195126488096, + "logps/rejected": -580.3065962357955, + "loss": 0.1886, + "rewards/chosen": 1.4276245662144251, + "rewards/margins": 11.522498167954481, + "rewards/rejected": -10.094873601740057, + "step": 2251 + }, + { + "epoch": 0.8313414240228877, + "grad_norm": 13.4375, + "kl": 0.3970918655395508, + "learning_rate": 7.091365778197895e-07, + "logits/chosen": 1661443640.8888888, + "logits/rejected": 1688496420.5714285, + "logps/chosen": -257.4051106770833, + "logps/rejected": -387.55855887276783, + "loss": 0.1561, + "rewards/chosen": 2.1995357937282987, + "rewards/margins": 8.047606210859994, + "rewards/rejected": -5.848070417131696, + "step": 2252 + }, + { + "epoch": 0.8317105809607309, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.061182479960221e-07, + "logits/chosen": 1783270460.235294, + "logits/rejected": 1932910865.0666666, + "logps/chosen": -244.53917738970588, + "logps/rejected": -596.0245442708333, + "loss": 0.1722, + "rewards/chosen": 1.0268682592055376, + "rewards/margins": 11.692617364023246, + "rewards/rejected": -10.665749104817708, + "step": 2253 + }, + { + "epoch": 0.8320797378985741, + "grad_norm": 11.5, + "kl": 0.15639877319335938, + "learning_rate": 7.031058672908692e-07, + "logits/chosen": 1831960120.8888888, + "logits/rejected": 2200279771.428571, + "logps/chosen": -260.26543511284723, + "logps/rejected": -449.32059151785717, + "loss": 0.1686, + "rewards/chosen": 1.4063775804307725, + "rewards/margins": 8.266475738040985, + "rewards/rejected": -6.860098157610212, + "step": 2254 + }, + { + "epoch": 0.8324488948364174, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.000994398779443e-07, + "logits/chosen": 2037808014.2222223, + "logits/rejected": 2116052260.5714285, + "logps/chosen": -298.98643663194446, + "logps/rejected": -442.56277901785717, + "loss": 0.1485, + "rewards/chosen": 1.3477650748358831, + "rewards/margins": 10.167746786087278, + "rewards/rejected": -8.819981711251396, + "step": 2255 + }, + { + "epoch": 0.8328180517742605, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 6.970989699226161e-07, + "logits/chosen": 1210794954.1052632, + "logits/rejected": 1278605942.1538463, + "logps/chosen": -305.46597450657896, + "logps/rejected": -439.33451021634613, + "loss": 0.1348, + "rewards/chosen": 2.158742804276316, + "rewards/margins": 11.56932453974056, + "rewards/rejected": -9.410581735464243, + "step": 2256 + }, + { + "epoch": 0.8331872087121037, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.941044615819981e-07, + "logits/chosen": 1761028388.5714285, + "logits/rejected": 1472946858.6666667, + "logps/chosen": -309.5155552455357, + "logps/rejected": -509.05815972222223, + "loss": 0.0914, + "rewards/chosen": 1.801774569920131, + "rewards/margins": 9.888938707018655, + "rewards/rejected": -8.087164137098524, + "step": 2257 + }, + { + "epoch": 0.833556365649947, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.911159190049416e-07, + "logits/chosen": 1819402786.1333334, + "logits/rejected": 1148366968.4705882, + "logps/chosen": -296.436328125, + "logps/rejected": -351.01146024816177, + "loss": 0.1451, + "rewards/chosen": 1.091633097330729, + "rewards/margins": 8.356337184532016, + "rewards/rejected": -7.264704087201287, + "step": 2258 + }, + { + "epoch": 0.8339255225877902, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.881333463320355e-07, + "logits/chosen": 1714867768.8888888, + "logits/rejected": 1608915090.2857144, + "logps/chosen": -294.24620225694446, + "logps/rejected": -428.91500418526783, + "loss": 0.1232, + "rewards/chosen": 2.2495430840386286, + "rewards/margins": 10.087981390574623, + "rewards/rejected": -7.838438306535993, + "step": 2259 + }, + { + "epoch": 0.8342946795256333, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 6.851567476955978e-07, + "logits/chosen": 2508369488.8421054, + "logits/rejected": 1684449280.0, + "logps/chosen": -225.2302374588816, + "logps/rejected": -505.5003004807692, + "loss": 0.1624, + "rewards/chosen": 1.8263523704127262, + "rewards/margins": 10.869420889418135, + "rewards/rejected": -9.04306851900541, + "step": 2260 + }, + { + "epoch": 0.8346638364634765, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 6.821861272196651e-07, + "logits/chosen": 1509014479.2380953, + "logits/rejected": 1451778978.909091, + "logps/chosen": -282.4013904389881, + "logps/rejected": -366.25088778409093, + "loss": 0.1988, + "rewards/chosen": 1.0619749341692244, + "rewards/margins": 6.978625186077959, + "rewards/rejected": -5.916650251908735, + "step": 2261 + }, + { + "epoch": 0.8350329934013198, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 6.79221489019996e-07, + "logits/chosen": 1439655350.857143, + "logits/rejected": 1387478243.5555556, + "logps/chosen": -318.09095982142856, + "logps/rejected": -396.2658962673611, + "loss": 0.0994, + "rewards/chosen": 2.5313712528773715, + "rewards/margins": 9.773004350208101, + "rewards/rejected": -7.2416330973307295, + "step": 2262 + }, + { + "epoch": 0.835402150339163, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.762628372040603e-07, + "logits/chosen": 1652643840.0, + "logits/rejected": 1514966016.0, + "logps/chosen": -236.765380859375, + "logps/rejected": -393.9757486979167, + "loss": 0.2047, + "rewards/chosen": 1.059560775756836, + "rewards/margins": 10.381402969360352, + "rewards/rejected": -9.321842193603516, + "step": 2263 + }, + { + "epoch": 0.8357713072770061, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 6.733101758710297e-07, + "logits/chosen": 1309380608.0, + "logits/rejected": 1610171050.6666667, + "logps/chosen": -240.33006068638392, + "logps/rejected": -474.7548828125, + "loss": 0.1521, + "rewards/chosen": 0.9454497609819684, + "rewards/margins": 10.397929653288827, + "rewards/rejected": -9.452479892306858, + "step": 2264 + }, + { + "epoch": 0.8361404642148493, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.703635091117804e-07, + "logits/chosen": 2351378659.5555553, + "logits/rejected": 1939887104.0, + "logps/chosen": -287.46161566840277, + "logps/rejected": -634.2000558035714, + "loss": 0.1519, + "rewards/chosen": 1.5625042385525174, + "rewards/margins": 11.579037136501736, + "rewards/rejected": -10.016532897949219, + "step": 2265 + }, + { + "epoch": 0.8365096211526926, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 6.674228410088828e-07, + "logits/chosen": 1996783495.5294118, + "logits/rejected": 2338866790.4, + "logps/chosen": -260.94241153492646, + "logps/rejected": -451.6350911458333, + "loss": 0.1084, + "rewards/chosen": 1.8289047689998852, + "rewards/margins": 9.528080285764208, + "rewards/rejected": -7.699175516764323, + "step": 2266 + }, + { + "epoch": 0.8368787780905358, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.644881756365934e-07, + "logits/chosen": 2388092342.857143, + "logits/rejected": 2336725219.5555553, + "logps/chosen": -270.59498814174106, + "logps/rejected": -420.14686414930554, + "loss": 0.162, + "rewards/chosen": 0.8436859675816127, + "rewards/margins": 7.853812717256092, + "rewards/rejected": -7.0101267496744795, + "step": 2267 + }, + { + "epoch": 0.8372479350283789, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.615595170608541e-07, + "logits/chosen": 1655263641.6, + "logits/rejected": 1241290581.3333333, + "logps/chosen": -255.357861328125, + "logps/rejected": -365.0638020833333, + "loss": 0.2329, + "rewards/chosen": 0.9345117568969726, + "rewards/margins": 7.165428098042805, + "rewards/rejected": -6.230916341145833, + "step": 2268 + }, + { + "epoch": 0.8376170919662221, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.586368693392859e-07, + "logits/chosen": 1650495645.5384614, + "logits/rejected": 2303192872.4210525, + "logps/chosen": -280.4195087139423, + "logps/rejected": -357.1427580180921, + "loss": 0.1492, + "rewards/chosen": 0.8387352870060847, + "rewards/margins": 6.629460995013897, + "rewards/rejected": -5.7907257080078125, + "step": 2269 + }, + { + "epoch": 0.8379862489040654, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.557202365211778e-07, + "logits/chosen": 1870038716.631579, + "logits/rejected": 1635577698.4615386, + "logps/chosen": -264.03836862664474, + "logps/rejected": -400.4753605769231, + "loss": 0.1607, + "rewards/chosen": 1.785174520392167, + "rewards/margins": 8.646308976146374, + "rewards/rejected": -6.861134455754207, + "step": 2270 + }, + { + "epoch": 0.8383554058419086, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 6.528096226474894e-07, + "logits/chosen": 1973304050.5263157, + "logits/rejected": 1941550631.3846154, + "logps/chosen": -315.6776058799342, + "logps/rejected": -463.7675030048077, + "loss": 0.178, + "rewards/chosen": 1.3537374797620272, + "rewards/margins": 9.822354258795983, + "rewards/rejected": -8.468616779033955, + "step": 2271 + }, + { + "epoch": 0.8387245627797517, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.499050317508371e-07, + "logits/chosen": 1568253952.0, + "logits/rejected": 1467113267.2, + "logps/chosen": -271.73402913411456, + "logps/rejected": -427.102392578125, + "loss": 0.128, + "rewards/chosen": 1.2216994762420654, + "rewards/margins": 7.9822783946990965, + "rewards/rejected": -6.760578918457031, + "step": 2272 + }, + { + "epoch": 0.8390937197175949, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.470064678554971e-07, + "logits/chosen": 1699626037.8947368, + "logits/rejected": 1586837976.6153846, + "logps/chosen": -224.58778782894737, + "logps/rejected": -499.84780649038464, + "loss": 0.138, + "rewards/chosen": 2.0064111006887337, + "rewards/margins": 11.637732934372629, + "rewards/rejected": -9.631321833683895, + "step": 2273 + }, + { + "epoch": 0.8394628766554382, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.441139349773906e-07, + "logits/chosen": 2191700553.142857, + "logits/rejected": 2151814485.3333335, + "logps/chosen": -338.0355747767857, + "logps/rejected": -477.65760633680554, + "loss": 0.0999, + "rewards/chosen": 1.7375124522617884, + "rewards/margins": 10.032900568038697, + "rewards/rejected": -8.295388115776909, + "step": 2274 + }, + { + "epoch": 0.8398320335932813, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 6.412274371240867e-07, + "logits/chosen": 2393555375.1578946, + "logits/rejected": 2228170594.4615383, + "logps/chosen": -265.4726048519737, + "logps/rejected": -583.5596078725962, + "loss": 0.1556, + "rewards/chosen": 1.6023775401868319, + "rewards/margins": 10.430158630556424, + "rewards/rejected": -8.82778109036959, + "step": 2275 + }, + { + "epoch": 0.8402011905311245, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 6.383469782947915e-07, + "logits/chosen": 1647436868.2666667, + "logits/rejected": 1602225694.1176472, + "logps/chosen": -315.79759114583334, + "logps/rejected": -453.81910615808823, + "loss": 0.0968, + "rewards/chosen": 2.0998374938964846, + "rewards/margins": 9.243967527501724, + "rewards/rejected": -7.144130033605239, + "step": 2276 + }, + { + "epoch": 0.8405703474689677, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 6.354725624803426e-07, + "logits/chosen": 2499923694.9333334, + "logits/rejected": 2166016843.2941175, + "logps/chosen": -294.67493489583336, + "logps/rejected": -535.2101332720588, + "loss": 0.0982, + "rewards/chosen": 2.0412405649820964, + "rewards/margins": 11.084156358008292, + "rewards/rejected": -9.042915793026195, + "step": 2277 + }, + { + "epoch": 0.840939504406811, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 6.326041936632077e-07, + "logits/chosen": 2189545667.047619, + "logits/rejected": 1377370391.2727273, + "logps/chosen": -319.638671875, + "logps/rejected": -612.8331853693181, + "loss": 0.1766, + "rewards/chosen": 2.0741159348260787, + "rewards/margins": 12.561410912187585, + "rewards/rejected": -10.487294977361506, + "step": 2278 + }, + { + "epoch": 0.8413086613446541, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.297418758174767e-07, + "logits/chosen": 1580423936.0, + "logits/rejected": 1933603328.0, + "logps/chosen": -268.0578308105469, + "logps/rejected": -401.3442077636719, + "loss": 0.1514, + "rewards/chosen": 2.001235246658325, + "rewards/margins": 8.209280729293823, + "rewards/rejected": -6.208045482635498, + "step": 2279 + }, + { + "epoch": 0.8416778182824973, + "grad_norm": 8.875, + "kl": 0.2481985092163086, + "learning_rate": 6.268856129088518e-07, + "logits/chosen": 1341200793.6, + "logits/rejected": 1807166805.3333333, + "logps/chosen": -221.8482177734375, + "logps/rejected": -394.6148681640625, + "loss": 0.115, + "rewards/chosen": 2.4654794692993165, + "rewards/margins": 10.8001314163208, + "rewards/rejected": -8.334651947021484, + "step": 2280 + }, + { + "epoch": 0.8420469752203406, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.240354088946504e-07, + "logits/chosen": 1576628224.0, + "logits/rejected": 1814748774.4, + "logps/chosen": -298.2533365885417, + "logps/rejected": -487.824609375, + "loss": 0.1237, + "rewards/chosen": 1.0585377216339111, + "rewards/margins": 8.59126935005188, + "rewards/rejected": -7.532731628417968, + "step": 2281 + }, + { + "epoch": 0.8424161321581838, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.211912677237947e-07, + "logits/chosen": 1903299885.1764705, + "logits/rejected": 1662352042.6666667, + "logps/chosen": -267.13390395220586, + "logps/rejected": -342.44537760416665, + "loss": 0.1137, + "rewards/chosen": 1.9562582128188188, + "rewards/margins": 9.144645257089653, + "rewards/rejected": -7.188387044270834, + "step": 2282 + }, + { + "epoch": 0.8427852890960269, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.183531933368048e-07, + "logits/chosen": 1473591022.9333334, + "logits/rejected": 1926583958.5882354, + "logps/chosen": -253.29005533854166, + "logps/rejected": -407.69896024816177, + "loss": 0.1457, + "rewards/chosen": 1.2514466603597005, + "rewards/margins": 8.262529926674038, + "rewards/rejected": -7.011083266314338, + "step": 2283 + }, + { + "epoch": 0.8431544460338701, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 6.15521189665797e-07, + "logits/chosen": 1872956825.6, + "logits/rejected": 2043601640.7272727, + "logps/chosen": -323.504833984375, + "logps/rejected": -476.2122691761364, + "loss": 0.1149, + "rewards/chosen": 0.9681379318237304, + "rewards/margins": 8.187795136191628, + "rewards/rejected": -7.2196572043678975, + "step": 2284 + }, + { + "epoch": 0.8435236029717134, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 6.126952606344777e-07, + "logits/chosen": 1491493683.2, + "logits/rejected": 1743101952.0, + "logps/chosen": -263.8228515625, + "logps/rejected": -464.14088350183823, + "loss": 0.1211, + "rewards/chosen": 1.6045575459798178, + "rewards/margins": 10.627858614454082, + "rewards/rejected": -9.023301068474264, + "step": 2285 + }, + { + "epoch": 0.8438927599095566, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 6.098754101581334e-07, + "logits/chosen": 2304837222.4, + "logits/rejected": 1344945091.764706, + "logps/chosen": -261.98279622395836, + "logps/rejected": -498.8374885110294, + "loss": 0.1504, + "rewards/chosen": 1.1332115173339843, + "rewards/margins": 10.589851424273323, + "rewards/rejected": -9.456639906939339, + "step": 2286 + }, + { + "epoch": 0.8442619168473997, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 6.070616421436326e-07, + "logits/chosen": 1858174683.4285715, + "logits/rejected": 1665432007.1111112, + "logps/chosen": -205.4461669921875, + "logps/rejected": -407.842041015625, + "loss": 0.1505, + "rewards/chosen": 0.9797924586704799, + "rewards/margins": 8.157888806055462, + "rewards/rejected": -7.178096347384983, + "step": 2287 + }, + { + "epoch": 0.8446310737852429, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 6.04253960489416e-07, + "logits/chosen": 1373531075.764706, + "logits/rejected": 1860195259.7333333, + "logps/chosen": -304.42922794117646, + "logps/rejected": -395.53935546875, + "loss": 0.1674, + "rewards/chosen": 1.3410855461569393, + "rewards/margins": 8.307511123956418, + "rewards/rejected": -6.9664255777994795, + "step": 2288 + }, + { + "epoch": 0.8450002307230862, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.014523690854895e-07, + "logits/chosen": 1775720789.3333333, + "logits/rejected": 1391481651.2, + "logps/chosen": -240.6632080078125, + "logps/rejected": -559.382958984375, + "loss": 0.089, + "rewards/chosen": 1.731218973795573, + "rewards/margins": 9.666173807779948, + "rewards/rejected": -7.934954833984375, + "step": 2289 + }, + { + "epoch": 0.8453693876609294, + "grad_norm": 12.5625, + "kl": 0.06013298034667969, + "learning_rate": 5.986568718134223e-07, + "logits/chosen": 1599280670.1176472, + "logits/rejected": 2258942225.0666666, + "logps/chosen": -311.3343864889706, + "logps/rejected": -653.906640625, + "loss": 0.1631, + "rewards/chosen": 1.127859115600586, + "rewards/margins": 13.18647117614746, + "rewards/rejected": -12.058612060546874, + "step": 2290 + }, + { + "epoch": 0.8457385445987725, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.95867472546341e-07, + "logits/chosen": 1659891927.5789473, + "logits/rejected": 1531163884.3076923, + "logps/chosen": -209.2360968338816, + "logps/rejected": -454.42349008413464, + "loss": 0.1708, + "rewards/chosen": 1.4249490436754728, + "rewards/margins": 9.343274614588935, + "rewards/rejected": -7.918325570913462, + "step": 2291 + }, + { + "epoch": 0.8461077015366157, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.930841751489219e-07, + "logits/chosen": 2574348032.0, + "logits/rejected": 2528396544.0, + "logps/chosen": -263.8597412109375, + "logps/rejected": -474.6724548339844, + "loss": 0.12, + "rewards/chosen": 2.1408209800720215, + "rewards/margins": 11.25248384475708, + "rewards/rejected": -9.111662864685059, + "step": 2292 + }, + { + "epoch": 0.846476858474459, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5.903069834773883e-07, + "logits/chosen": 1555918848.0, + "logits/rejected": 2074272768.0, + "logps/chosen": -299.9290364583333, + "logps/rejected": -452.41046142578125, + "loss": 0.1577, + "rewards/chosen": 1.7492726643880208, + "rewards/margins": 8.235698541005453, + "rewards/rejected": -6.486425876617432, + "step": 2293 + }, + { + "epoch": 0.8468460154123022, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.875359013795062e-07, + "logits/chosen": 2310530951.529412, + "logits/rejected": 2896509610.6666665, + "logps/chosen": -231.77748736213235, + "logps/rejected": -581.7708333333334, + "loss": 0.1573, + "rewards/chosen": 1.804433037252987, + "rewards/margins": 10.947377373190488, + "rewards/rejected": -9.1429443359375, + "step": 2294 + }, + { + "epoch": 0.8472151723501453, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5.847709326945717e-07, + "logits/chosen": 2447931847.111111, + "logits/rejected": 2582467145.142857, + "logps/chosen": -287.3014865451389, + "logps/rejected": -504.5467006138393, + "loss": 0.1851, + "rewards/chosen": 1.276943842569987, + "rewards/margins": 9.900239217849004, + "rewards/rejected": -8.623295375279017, + "step": 2295 + }, + { + "epoch": 0.8475843292879885, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.820120812534147e-07, + "logits/chosen": 1607532953.6, + "logits/rejected": 1857161216.0, + "logps/chosen": -242.65476888020834, + "logps/rejected": -582.7936580882352, + "loss": 0.1542, + "rewards/chosen": 0.9805529276529948, + "rewards/margins": 10.346439840279375, + "rewards/rejected": -9.36588691262638, + "step": 2296 + }, + { + "epoch": 0.8479534862258318, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 5.792593508783906e-07, + "logits/chosen": 1741614473.8461537, + "logits/rejected": 1495977768.4210527, + "logps/chosen": -293.6637432391827, + "logps/rejected": -493.31954152960526, + "loss": 0.1117, + "rewards/chosen": 1.7067673022930439, + "rewards/margins": 9.505821899846497, + "rewards/rejected": -7.799054597553454, + "step": 2297 + }, + { + "epoch": 0.848322643163675, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.765127453833696e-07, + "logits/chosen": 2340868551.111111, + "logits/rejected": 1879554194.2857144, + "logps/chosen": -231.05349392361111, + "logps/rejected": -538.5406319754464, + "loss": 0.171, + "rewards/chosen": 1.0549828211466472, + "rewards/margins": 9.59242734454927, + "rewards/rejected": -8.537444523402623, + "step": 2298 + }, + { + "epoch": 0.8486918001015181, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5.737722685737401e-07, + "logits/chosen": 1377465295.2380953, + "logits/rejected": 1466989102.5454545, + "logps/chosen": -304.6020740327381, + "logps/rejected": -578.6608664772727, + "loss": 0.186, + "rewards/chosen": 1.4402516682942708, + "rewards/margins": 11.036288405909684, + "rewards/rejected": -9.596036737615412, + "step": 2299 + }, + { + "epoch": 0.8490609570393614, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.710379242463993e-07, + "logits/chosen": 1527467566.5454545, + "logits/rejected": 1947447881.142857, + "logps/chosen": -291.27889737215907, + "logps/rejected": -449.35532924107144, + "loss": 0.0947, + "rewards/chosen": 2.2840699282559482, + "rewards/margins": 9.452999255357883, + "rewards/rejected": -7.168929327101934, + "step": 2300 + }, + { + "epoch": 0.8494301139772046, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5.683097161897433e-07, + "logits/chosen": 2296000238.9333334, + "logits/rejected": 2740644924.2352943, + "logps/chosen": -333.78899739583335, + "logps/rejected": -541.6165556066177, + "loss": 0.1163, + "rewards/chosen": 1.686168416341146, + "rewards/margins": 10.538665292777267, + "rewards/rejected": -8.85249687643612, + "step": 2301 + }, + { + "epoch": 0.8497992709150478, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5.655876481836719e-07, + "logits/chosen": 2385288192.0, + "logits/rejected": 1749430656.0, + "logps/chosen": -338.3543701171875, + "logps/rejected": -619.4234619140625, + "loss": 0.1889, + "rewards/chosen": 0.7456439733505249, + "rewards/margins": 10.948906302452087, + "rewards/rejected": -10.203262329101562, + "step": 2302 + }, + { + "epoch": 0.8501684278528909, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.628717239995762e-07, + "logits/chosen": 1660326369.8823528, + "logits/rejected": 2217484561.0666666, + "logps/chosen": -266.24816176470586, + "logps/rejected": -541.00517578125, + "loss": 0.1148, + "rewards/chosen": 1.9191396376665901, + "rewards/margins": 10.545937123018152, + "rewards/rejected": -8.626797485351563, + "step": 2303 + }, + { + "epoch": 0.8505375847907342, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.601619474003328e-07, + "logits/chosen": 1966330819.764706, + "logits/rejected": 1297409911.4666667, + "logps/chosen": -294.90481387867646, + "logps/rejected": -370.75615234375, + "loss": 0.1214, + "rewards/chosen": 1.9499848309685202, + "rewards/margins": 9.952950122309666, + "rewards/rejected": -8.002965291341146, + "step": 2304 + }, + { + "epoch": 0.8509067417285774, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.574583221403041e-07, + "logits/chosen": 1591196288.0, + "logits/rejected": 1730212352.0, + "logps/chosen": -230.8409423828125, + "logps/rejected": -598.4368286132812, + "loss": 0.1437, + "rewards/chosen": 1.1640516519546509, + "rewards/margins": 10.596859574317932, + "rewards/rejected": -9.432807922363281, + "step": 2305 + }, + { + "epoch": 0.8512758986664206, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5.547608519653286e-07, + "logits/chosen": 2099442005.3333333, + "logits/rejected": 1851737907.2, + "logps/chosen": -348.0118001302083, + "logps/rejected": -484.563916015625, + "loss": 0.118, + "rewards/chosen": 0.9622591336568197, + "rewards/margins": 9.511355050404868, + "rewards/rejected": -8.549095916748048, + "step": 2306 + }, + { + "epoch": 0.8516450556042637, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5.520695406127163e-07, + "logits/chosen": 1698502813.5384614, + "logits/rejected": 1605081734.7368422, + "logps/chosen": -310.5261793870192, + "logps/rejected": -463.9476254111842, + "loss": 0.1146, + "rewards/chosen": 1.1606479057898889, + "rewards/margins": 9.261102938941615, + "rewards/rejected": -8.100455033151727, + "step": 2307 + }, + { + "epoch": 0.852014212542107, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5.493843918112445e-07, + "logits/chosen": 1827763268.2666667, + "logits/rejected": 2172268544.0, + "logps/chosen": -249.5435546875, + "logps/rejected": -453.55445772058823, + "loss": 0.112, + "rewards/chosen": 1.8461499532063803, + "rewards/margins": 10.266908368877337, + "rewards/rejected": -8.420758415670957, + "step": 2308 + }, + { + "epoch": 0.8523833694799502, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.467054092811536e-07, + "logits/chosen": 2760278601.142857, + "logits/rejected": 2366668344.888889, + "logps/chosen": -243.30824497767858, + "logps/rejected": -450.5385470920139, + "loss": 0.1444, + "rewards/chosen": 1.0053012030465263, + "rewards/margins": 7.8312590311444, + "rewards/rejected": -6.825957828097874, + "step": 2309 + }, + { + "epoch": 0.8527525264177933, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.440325967341404e-07, + "logits/chosen": 1375666312.5333333, + "logits/rejected": 1733929803.2941177, + "logps/chosen": -273.6230143229167, + "logps/rejected": -470.9559110753676, + "loss": 0.1747, + "rewards/chosen": 0.7554531097412109, + "rewards/margins": 8.581206714405733, + "rewards/rejected": -7.825753604664522, + "step": 2310 + }, + { + "epoch": 0.8531216833556365, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5.413659578733505e-07, + "logits/chosen": 1751759488.0, + "logits/rejected": 1637091712.0, + "logps/chosen": -273.3004150390625, + "logps/rejected": -523.2642822265625, + "loss": 0.1541, + "rewards/chosen": 1.2179498672485352, + "rewards/margins": 10.916155815124512, + "rewards/rejected": -9.698205947875977, + "step": 2311 + }, + { + "epoch": 0.8534908402934798, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 5.387054963933803e-07, + "logits/chosen": 1548229933.1764705, + "logits/rejected": 1667047014.4, + "logps/chosen": -302.86865234375, + "logps/rejected": -471.2328776041667, + "loss": 0.1959, + "rewards/chosen": 0.8671729143928079, + "rewards/margins": 8.816877911137599, + "rewards/rejected": -7.949704996744791, + "step": 2312 + }, + { + "epoch": 0.853859997231323, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5.36051215980265e-07, + "logits/chosen": 1936395520.0, + "logits/rejected": 2224669184.0, + "logps/chosen": -296.26287841796875, + "logps/rejected": -601.26904296875, + "loss": 0.1791, + "rewards/chosen": 0.9614465832710266, + "rewards/margins": 10.7477405667305, + "rewards/rejected": -9.786293983459473, + "step": 2313 + }, + { + "epoch": 0.8542291541691661, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5.334031203114753e-07, + "logits/chosen": 1379549184.0, + "logits/rejected": 2063081618.2857144, + "logps/chosen": -249.69596354166666, + "logps/rejected": -424.37241908482144, + "loss": 0.0917, + "rewards/chosen": 2.782420900132921, + "rewards/margins": 11.101804793827117, + "rewards/rejected": -8.319383893694196, + "step": 2314 + }, + { + "epoch": 0.8545983111070093, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.307612130559154e-07, + "logits/chosen": 1342089984.0, + "logits/rejected": 1348802560.0, + "logps/chosen": -249.6090850830078, + "logps/rejected": -577.0531005859375, + "loss": 0.1265, + "rewards/chosen": 1.9345970153808594, + "rewards/margins": 11.457327842712402, + "rewards/rejected": -9.522730827331543, + "step": 2315 + }, + { + "epoch": 0.8549674680448526, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5.281254978739142e-07, + "logits/chosen": 1459239384.6153846, + "logits/rejected": 1792979914.1052632, + "logps/chosen": -337.30652794471155, + "logps/rejected": -542.4445929276316, + "loss": 0.121, + "rewards/chosen": 1.582151119525616, + "rewards/margins": 11.586077446879646, + "rewards/rejected": -10.00392632735403, + "step": 2316 + }, + { + "epoch": 0.8553366249826958, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.254959784172197e-07, + "logits/chosen": 2026288790.5882354, + "logits/rejected": 1895854216.5333333, + "logps/chosen": -358.3353056066176, + "logps/rejected": -519.4071614583333, + "loss": 0.1455, + "rewards/chosen": 1.348184024586397, + "rewards/margins": 9.41542849073223, + "rewards/rejected": -8.067244466145834, + "step": 2317 + }, + { + "epoch": 0.8557057819205389, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.22872658329e-07, + "logits/chosen": 2021686823.3846154, + "logits/rejected": 1733009192.4210527, + "logps/chosen": -324.37124399038464, + "logps/rejected": -559.9268092105264, + "loss": 0.1077, + "rewards/chosen": 1.4056660578801081, + "rewards/margins": 10.401283573042525, + "rewards/rejected": -8.995617515162417, + "step": 2318 + }, + { + "epoch": 0.8560749388583822, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.202555412438309e-07, + "logits/chosen": 1522419280.8421052, + "logits/rejected": 3090651923.6923075, + "logps/chosen": -341.7558850740132, + "logps/rejected": -509.4961688701923, + "loss": 0.1513, + "rewards/chosen": 1.6696918889095909, + "rewards/margins": 9.671065179925217, + "rewards/rejected": -8.001373291015625, + "step": 2319 + }, + { + "epoch": 0.8564440957962254, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5.176446307876948e-07, + "logits/chosen": 1534795264.0, + "logits/rejected": 1440511872.0, + "logps/chosen": -281.05352783203125, + "logps/rejected": -465.5245666503906, + "loss": 0.1983, + "rewards/chosen": 0.6463942527770996, + "rewards/margins": 8.688412189483643, + "rewards/rejected": -8.042017936706543, + "step": 2320 + }, + { + "epoch": 0.8568132527340686, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5.150399305779747e-07, + "logits/chosen": 1542839242.1052632, + "logits/rejected": 2087547037.5384614, + "logps/chosen": -349.68626644736844, + "logps/rejected": -438.02388822115387, + "loss": 0.0838, + "rewards/chosen": 2.741179014507093, + "rewards/margins": 10.576606518826505, + "rewards/rejected": -7.835427504319411, + "step": 2321 + }, + { + "epoch": 0.8571824096719117, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.124414442234504e-07, + "logits/chosen": 1578153691.4285715, + "logits/rejected": 1519617251.5555556, + "logps/chosen": -253.53029087611608, + "logps/rejected": -449.60302734375, + "loss": 0.127, + "rewards/chosen": 1.3097378867013114, + "rewards/margins": 8.113034293765113, + "rewards/rejected": -6.803296407063802, + "step": 2322 + }, + { + "epoch": 0.857551566609755, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5.098491753242918e-07, + "logits/chosen": 1791754854.4, + "logits/rejected": 1290028544.0, + "logps/chosen": -259.93671875, + "logps/rejected": -458.6730143229167, + "loss": 0.1534, + "rewards/chosen": 1.5577438354492188, + "rewards/margins": 8.884622065226237, + "rewards/rejected": -7.3268782297770185, + "step": 2323 + }, + { + "epoch": 0.8579207235475982, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5.07263127472053e-07, + "logits/chosen": 1743802026.6666667, + "logits/rejected": 1428909670.4, + "logps/chosen": -277.85992431640625, + "logps/rejected": -490.970947265625, + "loss": 0.0858, + "rewards/chosen": 1.863835334777832, + "rewards/margins": 10.331033515930176, + "rewards/rejected": -8.467198181152344, + "step": 2324 + }, + { + "epoch": 0.8582898804854414, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.046833042496719e-07, + "logits/chosen": 2133457498.3529413, + "logits/rejected": 1875998310.4, + "logps/chosen": -305.3090245863971, + "logps/rejected": -628.2580078125, + "loss": 0.1653, + "rewards/chosen": 1.112167807186351, + "rewards/margins": 9.91424503700406, + "rewards/rejected": -8.802077229817709, + "step": 2325 + }, + { + "epoch": 0.8586590374232845, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.021097092314598e-07, + "logits/chosen": 1447373710.2222223, + "logits/rejected": 1177032777.142857, + "logps/chosen": -275.44232855902777, + "logps/rejected": -412.5445033482143, + "loss": 0.1477, + "rewards/chosen": 1.7312371995713975, + "rewards/margins": 9.091570233541821, + "rewards/rejected": -7.3603330339704245, + "step": 2326 + }, + { + "epoch": 0.8590281943611278, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.995423459831006e-07, + "logits/chosen": 1458470115.5555556, + "logits/rejected": 1469624758.857143, + "logps/chosen": -303.78350151909723, + "logps/rejected": -438.0963657924107, + "loss": 0.1606, + "rewards/chosen": 1.2467754152086046, + "rewards/margins": 11.439206183902801, + "rewards/rejected": -10.192430768694196, + "step": 2327 + }, + { + "epoch": 0.859397351298971, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 4.96981218061644e-07, + "logits/chosen": 1863567360.0, + "logits/rejected": 1902350464.0, + "logps/chosen": -250.20001220703125, + "logps/rejected": -453.74542236328125, + "loss": 0.1415, + "rewards/chosen": 1.3504202365875244, + "rewards/margins": 8.694480657577515, + "rewards/rejected": -7.34406042098999, + "step": 2328 + }, + { + "epoch": 0.8597665082368142, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 4.944263290154983e-07, + "logits/chosen": 1835426350.5454545, + "logits/rejected": 2214315739.428571, + "logps/chosen": -301.17036576704544, + "logps/rejected": -479.94210379464283, + "loss": 0.11, + "rewards/chosen": 1.3391880555586382, + "rewards/margins": 9.301938445021063, + "rewards/rejected": -7.962750389462426, + "step": 2329 + }, + { + "epoch": 0.8601356651746573, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.918776823844312e-07, + "logits/chosen": 1627449088.0, + "logits/rejected": 2012762880.0, + "logps/chosen": -291.75677490234375, + "logps/rejected": -380.6504211425781, + "loss": 0.1486, + "rewards/chosen": 1.5307914018630981, + "rewards/margins": 7.770965218544006, + "rewards/rejected": -6.240173816680908, + "step": 2330 + }, + { + "epoch": 0.8605048221125006, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 4.893352816995611e-07, + "logits/chosen": 1493470822.4, + "logits/rejected": 1761987162.3529413, + "logps/chosen": -260.33779296875, + "logps/rejected": -623.2793543198529, + "loss": 0.1327, + "rewards/chosen": 1.5681107838948567, + "rewards/margins": 11.243410933251475, + "rewards/rejected": -9.675300149356618, + "step": 2331 + }, + { + "epoch": 0.8608739790503438, + "grad_norm": 13.25, + "kl": 0.13375091552734375, + "learning_rate": 4.867991304833502e-07, + "logits/chosen": 2489063014.4, + "logits/rejected": 1619106304.0, + "logps/chosen": -293.2303955078125, + "logps/rejected": -495.009033203125, + "loss": 0.1858, + "rewards/chosen": 1.552775764465332, + "rewards/margins": 9.480842018127442, + "rewards/rejected": -7.928066253662109, + "step": 2332 + }, + { + "epoch": 0.861243135988187, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 4.842692322496039e-07, + "logits/chosen": 1443701760.0, + "logits/rejected": 1372725504.0, + "logps/chosen": -273.023681640625, + "logps/rejected": -507.7358093261719, + "loss": 0.1608, + "rewards/chosen": 1.0464468002319336, + "rewards/margins": 9.101176261901855, + "rewards/rejected": -8.054729461669922, + "step": 2333 + }, + { + "epoch": 0.8616122929260301, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 4.817455905034657e-07, + "logits/chosen": 1500031249.0666666, + "logits/rejected": 2101561705.4117646, + "logps/chosen": -275.97154947916664, + "logps/rejected": -496.2178308823529, + "loss": 0.156, + "rewards/chosen": 0.9612754185994467, + "rewards/margins": 9.481652185028675, + "rewards/rejected": -8.520376766429228, + "step": 2334 + }, + { + "epoch": 0.8619814498638734, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 4.792282087414068e-07, + "logits/chosen": 2228411211.2941175, + "logits/rejected": 1355134293.3333333, + "logps/chosen": -312.1830193014706, + "logps/rejected": -492.6607421875, + "loss": 0.193, + "rewards/chosen": 0.7428256203146542, + "rewards/margins": 10.164025164585487, + "rewards/rejected": -9.421199544270833, + "step": 2335 + }, + { + "epoch": 0.8623506068017166, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 4.7671709045122914e-07, + "logits/chosen": 1528698112.0, + "logits/rejected": 1544669056.0, + "logps/chosen": -290.7152404785156, + "logps/rejected": -475.8163146972656, + "loss": 0.1978, + "rewards/chosen": 0.7245683670043945, + "rewards/margins": 7.230090618133545, + "rewards/rejected": -6.50552225112915, + "step": 2336 + }, + { + "epoch": 0.8627197637395598, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 4.742122391120557e-07, + "logits/chosen": 1907927654.4, + "logits/rejected": 1780444330.6666667, + "logps/chosen": -310.7647705078125, + "logps/rejected": -443.7956136067708, + "loss": 0.1939, + "rewards/chosen": 1.2759862899780274, + "rewards/margins": 8.86619815826416, + "rewards/rejected": -7.590211868286133, + "step": 2337 + }, + { + "epoch": 0.863088920677403, + "grad_norm": 12.375, + "kl": 0.5158934593200684, + "learning_rate": 4.7171365819432435e-07, + "logits/chosen": 3031715072.0, + "logits/rejected": 2504349184.0, + "logps/chosen": -281.22021484375, + "logps/rejected": -319.5928955078125, + "loss": 0.1793, + "rewards/chosen": 1.1229828596115112, + "rewards/margins": 7.54531991481781, + "rewards/rejected": -6.422337055206299, + "step": 2338 + }, + { + "epoch": 0.8634580776152462, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 4.6922135115978873e-07, + "logits/chosen": 1987557148.4444444, + "logits/rejected": 2507136731.428571, + "logps/chosen": -334.09844292534723, + "logps/rejected": -617.0585239955357, + "loss": 0.1639, + "rewards/chosen": 1.290191438462999, + "rewards/margins": 11.990702825879294, + "rewards/rejected": -10.700511387416295, + "step": 2339 + }, + { + "epoch": 0.8638272345530894, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 4.667353214615089e-07, + "logits/chosen": 1870004497.0666666, + "logits/rejected": 1949512643.764706, + "logps/chosen": -311.301171875, + "logps/rejected": -500.0497472426471, + "loss": 0.1346, + "rewards/chosen": 1.6245638529459636, + "rewards/margins": 10.391581890629787, + "rewards/rejected": -8.767018037683824, + "step": 2340 + }, + { + "epoch": 0.8641963914909326, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.642555725438463e-07, + "logits/chosen": 2439054887.3846154, + "logits/rejected": 1833997150.3157895, + "logps/chosen": -325.06546724759613, + "logps/rejected": -363.60731907894734, + "loss": 0.117, + "rewards/chosen": 1.3524084824782152, + "rewards/margins": 7.2684008177475405, + "rewards/rejected": -5.915992335269325, + "step": 2341 + }, + { + "epoch": 0.8645655484287758, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 4.6178210784246116e-07, + "logits/chosen": 2259600640.0, + "logits/rejected": 2286348544.0, + "logps/chosen": -305.0442810058594, + "logps/rejected": -434.0245361328125, + "loss": 0.1437, + "rewards/chosen": 1.6728795766830444, + "rewards/margins": 10.080390334129333, + "rewards/rejected": -8.407510757446289, + "step": 2342 + }, + { + "epoch": 0.864934705366619, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.593149307843081e-07, + "logits/chosen": 1491469458.2857144, + "logits/rejected": 2415929116.4444447, + "logps/chosen": -275.54396275111606, + "logps/rejected": -440.1169162326389, + "loss": 0.1348, + "rewards/chosen": 1.2928850991385323, + "rewards/margins": 9.243924731299991, + "rewards/rejected": -7.951039632161458, + "step": 2343 + }, + { + "epoch": 0.8653038623044622, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 4.568540447876307e-07, + "logits/chosen": 1618862867.6923077, + "logits/rejected": 1589335093.8947368, + "logps/chosen": -223.028076171875, + "logps/rejected": -462.9628392269737, + "loss": 0.1334, + "rewards/chosen": 1.2458515167236328, + "rewards/margins": 8.979430650409899, + "rewards/rejected": -7.733579133686266, + "step": 2344 + }, + { + "epoch": 0.8656730192423053, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 4.543994532619533e-07, + "logits/chosen": 2298638149.818182, + "logits/rejected": 1628111628.1904762, + "logps/chosen": -297.87118252840907, + "logps/rejected": -518.500744047619, + "loss": 0.0847, + "rewards/chosen": 1.8045000596479936, + "rewards/margins": 10.114004250728723, + "rewards/rejected": -8.309504191080729, + "step": 2345 + }, + { + "epoch": 0.8660421761801486, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 4.5195115960808166e-07, + "logits/chosen": 1695652522.6666667, + "logits/rejected": 1804980705.8823528, + "logps/chosen": -307.89674479166666, + "logps/rejected": -434.64016544117646, + "loss": 0.1336, + "rewards/chosen": 1.4199632008870442, + "rewards/margins": 9.314796933940812, + "rewards/rejected": -7.894833733053768, + "step": 2346 + }, + { + "epoch": 0.8664113331179918, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 4.4950916721809733e-07, + "logits/chosen": 2004393344.0, + "logits/rejected": 1674242944.0, + "logps/chosen": -340.75628662109375, + "logps/rejected": -404.522216796875, + "loss": 0.1185, + "rewards/chosen": 1.7038301229476929, + "rewards/margins": 8.872053503990173, + "rewards/rejected": -7.1682233810424805, + "step": 2347 + }, + { + "epoch": 0.866780490055835, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 4.470734794753473e-07, + "logits/chosen": 1741012992.0, + "logits/rejected": 1763022392.8888888, + "logps/chosen": -271.5132359095982, + "logps/rejected": -586.5592990451389, + "loss": 0.1144, + "rewards/chosen": 1.9898602621895927, + "rewards/margins": 11.081897039262076, + "rewards/rejected": -9.092036777072483, + "step": 2348 + }, + { + "epoch": 0.8671496469936781, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 4.446440997544471e-07, + "logits/chosen": 2268715885.714286, + "logits/rejected": 1790493354.6666667, + "logps/chosen": -269.98216029575894, + "logps/rejected": -439.42735460069446, + "loss": 0.1636, + "rewards/chosen": 0.9072234289986747, + "rewards/margins": 8.66834284767272, + "rewards/rejected": -7.7611194186740455, + "step": 2349 + }, + { + "epoch": 0.8675188039315214, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 4.422210314212727e-07, + "logits/chosen": 1631840571.0769231, + "logits/rejected": 1722224640.0, + "logps/chosen": -231.54533503605768, + "logps/rejected": -544.6674547697369, + "loss": 0.1046, + "rewards/chosen": 1.6640391716590295, + "rewards/margins": 8.752816412612978, + "rewards/rejected": -7.088777240953948, + "step": 2350 + }, + { + "epoch": 0.8678879608693646, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.39804277832952e-07, + "logits/chosen": 1809266057.8461537, + "logits/rejected": 1727844352.0, + "logps/chosen": -315.31234975961536, + "logps/rejected": -566.2394634046053, + "loss": 0.1031, + "rewards/chosen": 1.9785911853496845, + "rewards/margins": 11.535932108458237, + "rewards/rejected": -9.557340923108553, + "step": 2351 + }, + { + "epoch": 0.8682571178072078, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 4.373938423378682e-07, + "logits/chosen": 1923624704.0, + "logits/rejected": 1644592640.0, + "logps/chosen": -199.8976287841797, + "logps/rejected": -445.9142761230469, + "loss": 0.1707, + "rewards/chosen": 0.9787515997886658, + "rewards/margins": 8.827819764614105, + "rewards/rejected": -7.8490681648254395, + "step": 2352 + }, + { + "epoch": 0.8686262747450509, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 4.349897282756488e-07, + "logits/chosen": 1467019504.9411764, + "logits/rejected": 1031739665.0666667, + "logps/chosen": -193.03762637867646, + "logps/rejected": -493.72552083333335, + "loss": 0.1453, + "rewards/chosen": 1.540726381189683, + "rewards/margins": 13.588421286788641, + "rewards/rejected": -12.047694905598958, + "step": 2353 + }, + { + "epoch": 0.8689954316828942, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 4.325919389771627e-07, + "logits/chosen": 2003167074.4615386, + "logits/rejected": 1388898950.7368422, + "logps/chosen": -378.91030649038464, + "logps/rejected": -386.09560032894734, + "loss": 0.0988, + "rewards/chosen": 2.0616637009840746, + "rewards/margins": 9.276030413052332, + "rewards/rejected": -7.214366712068257, + "step": 2354 + }, + { + "epoch": 0.8693645886207374, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 4.3020047776451633e-07, + "logits/chosen": 1713400960.0, + "logits/rejected": 1114259840.0, + "logps/chosen": -297.56207275390625, + "logps/rejected": -417.4426574707031, + "loss": 0.1343, + "rewards/chosen": 1.5767661333084106, + "rewards/margins": 10.474696278572083, + "rewards/rejected": -8.897930145263672, + "step": 2355 + }, + { + "epoch": 0.8697337455585806, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 4.2781534795104995e-07, + "logits/chosen": 2330307662.769231, + "logits/rejected": 1719131513.2631578, + "logps/chosen": -320.5189678485577, + "logps/rejected": -476.1494654605263, + "loss": 0.1406, + "rewards/chosen": 1.1662475145780122, + "rewards/margins": 8.528418888447256, + "rewards/rejected": -7.362171373869243, + "step": 2356 + }, + { + "epoch": 0.8701029024964237, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 4.2543655284132957e-07, + "logits/chosen": 1573459365.6470587, + "logits/rejected": 1445427473.0666666, + "logps/chosen": -249.7428481158088, + "logps/rejected": -511.3021484375, + "loss": 0.123, + "rewards/chosen": 1.8909025753245634, + "rewards/margins": 9.790504829556333, + "rewards/rejected": -7.8996022542317705, + "step": 2357 + }, + { + "epoch": 0.870472059434267, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 4.2306409573114715e-07, + "logits/chosen": 2095367509.3333333, + "logits/rejected": 2126784512.0, + "logps/chosen": -260.9277750651042, + "logps/rejected": -659.310498046875, + "loss": 0.1023, + "rewards/chosen": 1.6052943865458171, + "rewards/margins": 11.192625013987223, + "rewards/rejected": -9.587330627441407, + "step": 2358 + }, + { + "epoch": 0.8708412163721102, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 4.2069797990751007e-07, + "logits/chosen": 1908380765.090909, + "logits/rejected": 1578093811.8095238, + "logps/chosen": -213.78870738636363, + "logps/rejected": -565.416248139881, + "loss": 0.0535, + "rewards/chosen": 2.6214623884721235, + "rewards/margins": 12.10166156653202, + "rewards/rejected": -9.480199178059896, + "step": 2359 + }, + { + "epoch": 0.8712103733099534, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 4.183382086486415e-07, + "logits/chosen": 1621753514.6666667, + "logits/rejected": 1695897892.5714285, + "logps/chosen": -255.9013671875, + "logps/rejected": -416.37451171875, + "loss": 0.1409, + "rewards/chosen": 2.0612496270073786, + "rewards/margins": 9.951455555264912, + "rewards/rejected": -7.890205928257534, + "step": 2360 + }, + { + "epoch": 0.8715795302477966, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 4.1598478522397567e-07, + "logits/chosen": 1723701970.8235295, + "logits/rejected": 1623886506.6666667, + "logps/chosen": -391.2572380514706, + "logps/rejected": -399.03955078125, + "loss": 0.1513, + "rewards/chosen": 1.1866464053883272, + "rewards/margins": 8.701553225049786, + "rewards/rejected": -7.514906819661459, + "step": 2361 + }, + { + "epoch": 0.8719486871856398, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 4.1363771289415154e-07, + "logits/chosen": 1595630592.0, + "logits/rejected": 2079060423.1111112, + "logps/chosen": -248.78351702008928, + "logps/rejected": -454.31434461805554, + "loss": 0.13, + "rewards/chosen": 1.4523277282714844, + "rewards/margins": 8.979418012830946, + "rewards/rejected": -7.527090284559462, + "step": 2362 + }, + { + "epoch": 0.872317844123483, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 4.1129699491100626e-07, + "logits/chosen": 1409431444.2105262, + "logits/rejected": 1592091884.3076923, + "logps/chosen": -257.97080592105266, + "logps/rejected": -359.51062950721155, + "loss": 0.183, + "rewards/chosen": 1.32951525637978, + "rewards/margins": 8.970438976519503, + "rewards/rejected": -7.640923720139724, + "step": 2363 + }, + { + "epoch": 0.8726870010613262, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 4.089626345175757e-07, + "logits/chosen": 1392890071.5789473, + "logits/rejected": 1516170633.8461537, + "logps/chosen": -200.86551706414474, + "logps/rejected": -435.1195537860577, + "loss": 0.1809, + "rewards/chosen": 1.0879708340293484, + "rewards/margins": 9.783116661102666, + "rewards/rejected": -8.695145827073317, + "step": 2364 + }, + { + "epoch": 0.8730561579991694, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 4.0663463494808706e-07, + "logits/chosen": 1461819632.9411764, + "logits/rejected": 1070102186.6666666, + "logps/chosen": -254.30526194852942, + "logps/rejected": -381.31634114583335, + "loss": 0.1242, + "rewards/chosen": 2.6554711285759423, + "rewards/margins": 9.961160622391047, + "rewards/rejected": -7.3056894938151045, + "step": 2365 + }, + { + "epoch": 0.8734253149370126, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 4.043129994279527e-07, + "logits/chosen": 2889606758.4, + "logits/rejected": 1548764501.3333333, + "logps/chosen": -359.000830078125, + "logps/rejected": -431.6427815755208, + "loss": 0.2156, + "rewards/chosen": 0.7354964256286621, + "rewards/margins": 8.002913697560627, + "rewards/rejected": -7.267417271931966, + "step": 2366 + }, + { + "epoch": 0.8737944718748558, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 4.019977311737699e-07, + "logits/chosen": 1712532480.0, + "logits/rejected": 1901728914.2857144, + "logps/chosen": -277.93071831597223, + "logps/rejected": -474.97324916294644, + "loss": 0.1609, + "rewards/chosen": 1.4020648532443576, + "rewards/margins": 8.87588622078063, + "rewards/rejected": -7.473821367536273, + "step": 2367 + }, + { + "epoch": 0.874163628812699, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 3.9968883339331467e-07, + "logits/chosen": 1700610168.4705882, + "logits/rejected": 1775067682.1333334, + "logps/chosen": -306.21179917279414, + "logps/rejected": -339.68444010416664, + "loss": 0.1384, + "rewards/chosen": 1.6907446244183708, + "rewards/margins": 7.838699946683996, + "rewards/rejected": -6.147955322265625, + "step": 2368 + }, + { + "epoch": 0.8745327857505422, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 3.973863092855335e-07, + "logits/chosen": 1507270656.0, + "logits/rejected": 1504746700.8, + "logps/chosen": -263.3938993566176, + "logps/rejected": -442.4373046875, + "loss": 0.1527, + "rewards/chosen": 1.2257837407729204, + "rewards/margins": 9.038414457732555, + "rewards/rejected": -7.812630716959635, + "step": 2369 + }, + { + "epoch": 0.8749019426883854, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 3.9509016204054506e-07, + "logits/chosen": 1542911397.6470587, + "logits/rejected": 1779979059.2, + "logps/chosen": -309.18376608455884, + "logps/rejected": -492.8422526041667, + "loss": 0.1518, + "rewards/chosen": 1.3053505841423483, + "rewards/margins": 9.319153684728285, + "rewards/rejected": -8.013803100585937, + "step": 2370 + }, + { + "epoch": 0.8752710996262286, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.928003948396336e-07, + "logits/chosen": 1274698410.6666667, + "logits/rejected": 2211756634.352941, + "logps/chosen": -235.24296875, + "logps/rejected": -519.3604664522059, + "loss": 0.1819, + "rewards/chosen": 0.69678160349528, + "rewards/margins": 9.390801594304104, + "rewards/rejected": -8.694019990808824, + "step": 2371 + }, + { + "epoch": 0.8756402565640719, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 3.9051701085523973e-07, + "logits/chosen": 1589159936.0, + "logits/rejected": 1721019661.4736843, + "logps/chosen": -240.45793269230768, + "logps/rejected": -578.5712890625, + "loss": 0.119, + "rewards/chosen": 1.1905070818387544, + "rewards/margins": 9.677637497905778, + "rewards/rejected": -8.487130416067023, + "step": 2372 + }, + { + "epoch": 0.876009413501915, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.8824001325096504e-07, + "logits/chosen": 1694997959.1111112, + "logits/rejected": 1584891026.2857144, + "logps/chosen": -304.432861328125, + "logps/rejected": -532.6552734375, + "loss": 0.1807, + "rewards/chosen": 1.2479754553900824, + "rewards/margins": 7.718447518727136, + "rewards/rejected": -6.470472063337054, + "step": 2373 + }, + { + "epoch": 0.876009413501915, + "eval_kl": 0.0, + "eval_logits/chosen": 3481469868.708134, + "eval_logits/rejected": 3505006565.4025974, + "eval_logps/chosen": -291.5509307715311, + "eval_logps/rejected": -480.4856939935065, + "eval_loss": 0.12881676852703094, + "eval_rewards/chosen": 1.5463871568013607, + "eval_rewards/margins": 9.821909364086263, + "eval_rewards/rejected": -8.275522207284903, + "eval_runtime": 110.768, + "eval_samples_per_second": 7.908, + "eval_steps_per_second": 0.497, + "step": 2373 + }, + { + "epoch": 0.8763785704397582, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 3.8596940518156047e-07, + "logits/chosen": 1648718165.3333333, + "logits/rejected": 2447054262.857143, + "logps/chosen": -331.10926649305554, + "logps/rejected": -441.43007114955356, + "loss": 0.1201, + "rewards/chosen": 2.0128396352132163, + "rewards/margins": 9.559073221115838, + "rewards/rejected": -7.546233585902622, + "step": 2374 + }, + { + "epoch": 0.8767477273776014, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 3.83705189792925e-07, + "logits/chosen": 2022722901.3333333, + "logits/rejected": 1892051148.8, + "logps/chosen": -215.1799112955729, + "logps/rejected": -542.203076171875, + "loss": 0.1414, + "rewards/chosen": 1.3275129795074463, + "rewards/margins": 10.148264169692993, + "rewards/rejected": -8.820751190185547, + "step": 2375 + }, + { + "epoch": 0.8771168843154447, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 3.8144737022209835e-07, + "logits/chosen": 1416501609.4117646, + "logits/rejected": 1685099997.8666666, + "logps/chosen": -244.06979549632354, + "logps/rejected": -520.6416666666667, + "loss": 0.141, + "rewards/chosen": 1.9018054288976334, + "rewards/margins": 10.04710012697706, + "rewards/rejected": -8.145294698079427, + "step": 2376 + }, + { + "epoch": 0.8774860412532878, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 3.791959495972619e-07, + "logits/chosen": 2022917059.764706, + "logits/rejected": 2982748706.133333, + "logps/chosen": -343.36204618566177, + "logps/rejected": -533.6280598958333, + "loss": 0.1852, + "rewards/chosen": 0.8928734835456399, + "rewards/margins": 8.503206735498765, + "rewards/rejected": -7.610333251953125, + "step": 2377 + }, + { + "epoch": 0.877855198191131, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.769509310377317e-07, + "logits/chosen": 2362040320.0, + "logits/rejected": 2307799160.470588, + "logps/chosen": -273.2662109375, + "logps/rejected": -474.26266659007354, + "loss": 0.127, + "rewards/chosen": 1.3398053487141928, + "rewards/margins": 9.076470064649396, + "rewards/rejected": -7.736664715935202, + "step": 2378 + }, + { + "epoch": 0.8782243551289742, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.7471231765395077e-07, + "logits/chosen": 1563550242.1333334, + "logits/rejected": 1558956272.9411764, + "logps/chosen": -337.43645833333335, + "logps/rejected": -435.1807502297794, + "loss": 0.1604, + "rewards/chosen": 1.1948347727457682, + "rewards/margins": 10.133646131029316, + "rewards/rejected": -8.938811358283548, + "step": 2379 + }, + { + "epoch": 0.8785935120668174, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 3.7248011254748974e-07, + "logits/chosen": 1571805047.4666667, + "logits/rejected": 1484086814.1176472, + "logps/chosen": -278.3046875, + "logps/rejected": -545.7537913602941, + "loss": 0.1159, + "rewards/chosen": 1.7586629231770834, + "rewards/margins": 11.189168114755667, + "rewards/rejected": -9.430505191578584, + "step": 2380 + }, + { + "epoch": 0.8789626690046606, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 3.7025431881104137e-07, + "logits/chosen": 1563438925.9130435, + "logits/rejected": 1481053297.7777777, + "logps/chosen": -279.06118376358694, + "logps/rejected": -556.3423394097222, + "loss": 0.1488, + "rewards/chosen": 2.0257349428923233, + "rewards/margins": 11.650996037727392, + "rewards/rejected": -9.62526109483507, + "step": 2381 + }, + { + "epoch": 0.8793318259425038, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 3.680349395284133e-07, + "logits/chosen": 1904163840.0, + "logits/rejected": 2313790873.6, + "logps/chosen": -219.79052734375, + "logps/rejected": -620.878662109375, + "loss": 0.0959, + "rewards/chosen": 1.515072186787923, + "rewards/margins": 13.79141190846761, + "rewards/rejected": -12.276339721679687, + "step": 2382 + }, + { + "epoch": 0.879700982880347, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 3.658219777745281e-07, + "logits/chosen": 1569499818.6666667, + "logits/rejected": 1764822220.8, + "logps/chosen": -219.35247802734375, + "logps/rejected": -466.608984375, + "loss": 0.0719, + "rewards/chosen": 2.0737975438435874, + "rewards/margins": 10.117838223775228, + "rewards/rejected": -8.04404067993164, + "step": 2383 + }, + { + "epoch": 0.8800701398181902, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 3.6361543661541654e-07, + "logits/chosen": 1794417810.2857144, + "logits/rejected": 2285807988.3636365, + "logps/chosen": -261.2432338169643, + "logps/rejected": -673.0806107954545, + "loss": 0.1756, + "rewards/chosen": 1.3642409188406808, + "rewards/margins": 9.846161334545581, + "rewards/rejected": -8.4819204157049, + "step": 2384 + }, + { + "epoch": 0.8804392967560334, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 3.614153191082126e-07, + "logits/chosen": 2076746932.7058823, + "logits/rejected": 1879126835.2, + "logps/chosen": -355.85989200367646, + "logps/rejected": -416.23919270833335, + "loss": 0.175, + "rewards/chosen": 1.1687452652875114, + "rewards/margins": 8.107068222644283, + "rewards/rejected": -6.938322957356771, + "step": 2385 + }, + { + "epoch": 0.8808084536938766, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 3.592216283011513e-07, + "logits/chosen": 1498066512.8421052, + "logits/rejected": 1744643623.3846154, + "logps/chosen": -298.47787314967104, + "logps/rejected": -574.296875, + "loss": 0.1788, + "rewards/chosen": 1.2245858844957853, + "rewards/margins": 8.996639529703117, + "rewards/rejected": -7.772053645207332, + "step": 2386 + }, + { + "epoch": 0.8811776106317198, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 3.570343672335641e-07, + "logits/chosen": 1628690560.0, + "logits/rejected": 2274489856.0, + "logps/chosen": -235.30181884765625, + "logps/rejected": -421.9909973144531, + "loss": 0.1343, + "rewards/chosen": 1.4506711959838867, + "rewards/margins": 10.026546478271484, + "rewards/rejected": -8.575875282287598, + "step": 2387 + }, + { + "epoch": 0.881546767569563, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 3.5485353893587204e-07, + "logits/chosen": 1538738585.6, + "logits/rejected": 2169461820.2352943, + "logps/chosen": -257.18059895833335, + "logps/rejected": -527.3236443014706, + "loss": 0.1364, + "rewards/chosen": 1.2028329213460287, + "rewards/margins": 11.125317375332703, + "rewards/rejected": -9.922484453986673, + "step": 2388 + }, + { + "epoch": 0.8819159245074062, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 3.5267914642958534e-07, + "logits/chosen": 1437285990.4, + "logits/rejected": 1763652065.8823528, + "logps/chosen": -226.3341796875, + "logps/rejected": -559.4385914522059, + "loss": 0.1506, + "rewards/chosen": 1.2195711771647135, + "rewards/margins": 9.0038163877001, + "rewards/rejected": -7.784245210535386, + "step": 2389 + }, + { + "epoch": 0.8822850814452494, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 3.505111927272992e-07, + "logits/chosen": 1542692717.7142856, + "logits/rejected": 1803011527.1111112, + "logps/chosen": -289.70127650669644, + "logps/rejected": -500.3254123263889, + "loss": 0.171, + "rewards/chosen": 0.6538628169468471, + "rewards/margins": 10.149046125866118, + "rewards/rejected": -9.495183308919271, + "step": 2390 + }, + { + "epoch": 0.8826542383830926, + "grad_norm": 8.6875, + "kl": 0.6465013027191162, + "learning_rate": 3.4834968083268307e-07, + "logits/chosen": 1598732726.857143, + "logits/rejected": 1357007872.0, + "logps/chosen": -190.70668247767858, + "logps/rejected": -483.9323459201389, + "loss": 0.1078, + "rewards/chosen": 2.420595714024135, + "rewards/margins": 10.884809615120055, + "rewards/rejected": -8.46421390109592, + "step": 2391 + }, + { + "epoch": 0.8830233953209358, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.461946137404865e-07, + "logits/chosen": 1758668800.0, + "logits/rejected": 1481446701.1764705, + "logps/chosen": -334.4805013020833, + "logps/rejected": -437.46593520220586, + "loss": 0.1494, + "rewards/chosen": 1.082920455932617, + "rewards/margins": 7.600692457311293, + "rewards/rejected": -6.517772001378677, + "step": 2392 + }, + { + "epoch": 0.883392552258779, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 3.440459944365271e-07, + "logits/chosen": 1112365163.7894738, + "logits/rejected": 1431921900.3076923, + "logps/chosen": -181.07930715460526, + "logps/rejected": -564.6420522836538, + "loss": 0.1444, + "rewards/chosen": 1.8616168373509456, + "rewards/margins": 10.183123631033338, + "rewards/rejected": -8.321506793682392, + "step": 2393 + }, + { + "epoch": 0.8837617091966222, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 3.4190382589768755e-07, + "logits/chosen": 1783432493.1764705, + "logits/rejected": 1206909610.6666667, + "logps/chosen": -218.75955020680146, + "logps/rejected": -318.4763671875, + "loss": 0.1066, + "rewards/chosen": 2.502301608814913, + "rewards/margins": 7.887000095143037, + "rewards/rejected": -5.384698486328125, + "step": 2394 + }, + { + "epoch": 0.8841308661344655, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.397681110919171e-07, + "logits/chosen": 1649513728.0, + "logits/rejected": 1573665920.0, + "logps/chosen": -289.9728088378906, + "logps/rejected": -532.0128784179688, + "loss": 0.1574, + "rewards/chosen": 1.1430635452270508, + "rewards/margins": 15.631179809570312, + "rewards/rejected": -14.488116264343262, + "step": 2395 + }, + { + "epoch": 0.8845000230723086, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 3.3763885297822153e-07, + "logits/chosen": 2075573686.857143, + "logits/rejected": 1808228807.1111112, + "logps/chosen": -281.23263113839283, + "logps/rejected": -449.9213595920139, + "loss": 0.1384, + "rewards/chosen": 1.6203512464250838, + "rewards/margins": 8.879464982047914, + "rewards/rejected": -7.2591137356228295, + "step": 2396 + }, + { + "epoch": 0.8848691800101518, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 3.355160545066599e-07, + "logits/chosen": 1783288081.0666666, + "logits/rejected": 1681667855.0588236, + "logps/chosen": -289.5434244791667, + "logps/rejected": -505.43669577205884, + "loss": 0.1218, + "rewards/chosen": 1.598502477010091, + "rewards/margins": 9.777357026642445, + "rewards/rejected": -8.178854549632353, + "step": 2397 + }, + { + "epoch": 0.885238336947995, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 3.333997186183435e-07, + "logits/chosen": 1820651760.9411764, + "logits/rejected": 1745485141.3333333, + "logps/chosen": -256.28972311580884, + "logps/rejected": -534.05791015625, + "loss": 0.1397, + "rewards/chosen": 1.4957084655761719, + "rewards/margins": 10.526041920979818, + "rewards/rejected": -9.030333455403646, + "step": 2398 + }, + { + "epoch": 0.8856074938858383, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 3.312898482454285e-07, + "logits/chosen": 2446041519.1578946, + "logits/rejected": 2539479512.6153846, + "logps/chosen": -292.9473941200658, + "logps/rejected": -609.93701171875, + "loss": 0.2226, + "rewards/chosen": 0.7768659089740954, + "rewards/margins": 8.013567287429625, + "rewards/rejected": -7.236701378455529, + "step": 2399 + }, + { + "epoch": 0.8859766508236814, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 3.2918644631111274e-07, + "logits/chosen": 1954617705.4117646, + "logits/rejected": 1772770645.3333333, + "logps/chosen": -300.86178768382354, + "logps/rejected": -581.820703125, + "loss": 0.1784, + "rewards/chosen": 1.279665329877068, + "rewards/margins": 12.096712449017692, + "rewards/rejected": -10.817047119140625, + "step": 2400 + }, + { + "epoch": 0.8863458077615246, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 3.270895157296339e-07, + "logits/chosen": 1834403682.4615386, + "logits/rejected": 2130833623.5789473, + "logps/chosen": -308.63243689903845, + "logps/rejected": -433.58475534539474, + "loss": 0.111, + "rewards/chosen": 1.4987669724684496, + "rewards/margins": 8.99136886905562, + "rewards/rejected": -7.492601896587171, + "step": 2401 + }, + { + "epoch": 0.8867149646993678, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 3.24999059406263e-07, + "logits/chosen": 1990819976.5333333, + "logits/rejected": 2101379553.8823528, + "logps/chosen": -287.16028645833336, + "logps/rejected": -418.81657858455884, + "loss": 0.1437, + "rewards/chosen": 1.3372042338053385, + "rewards/margins": 9.263565317789713, + "rewards/rejected": -7.926361083984375, + "step": 2402 + }, + { + "epoch": 0.8870841216372111, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 3.229150802372988e-07, + "logits/chosen": 1483133838.2222223, + "logits/rejected": 1355784484.5714285, + "logps/chosen": -258.7846950954861, + "logps/rejected": -554.6003766741071, + "loss": 0.1969, + "rewards/chosen": 0.8799067073398166, + "rewards/margins": 9.82111132334149, + "rewards/rejected": -8.941204616001674, + "step": 2403 + }, + { + "epoch": 0.8874532785750542, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 3.2083758111006946e-07, + "logits/chosen": 1460034861.1764705, + "logits/rejected": 1520666624.0, + "logps/chosen": -195.36019358915442, + "logps/rejected": -488.72685546875, + "loss": 0.1457, + "rewards/chosen": 1.3135634029612822, + "rewards/margins": 9.072482165168314, + "rewards/rejected": -7.758918762207031, + "step": 2404 + }, + { + "epoch": 0.8878224355128974, + "grad_norm": 12.8125, + "kl": 0.14441561698913574, + "learning_rate": 3.187665649029242e-07, + "logits/chosen": 1403021942.1538463, + "logits/rejected": 1580793514.6666667, + "logps/chosen": -236.26491135817307, + "logps/rejected": -326.57529703776044, + "loss": 0.174, + "rewards/chosen": 2.070187055147611, + "rewards/margins": 9.56916026580028, + "rewards/rejected": -7.498973210652669, + "step": 2405 + }, + { + "epoch": 0.8881915924507406, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 3.1670203448522784e-07, + "logits/chosen": 2009383152.9411764, + "logits/rejected": 1454517998.9333334, + "logps/chosen": -385.0765165441176, + "logps/rejected": -491.8958984375, + "loss": 0.1496, + "rewards/chosen": 1.2964347390567554, + "rewards/margins": 9.394766444785922, + "rewards/rejected": -8.098331705729167, + "step": 2406 + }, + { + "epoch": 0.8885607493885839, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 3.1464399271736225e-07, + "logits/chosen": 1358887497.142857, + "logits/rejected": 1817008469.3333333, + "logps/chosen": -232.59926060267858, + "logps/rejected": -453.00379774305554, + "loss": 0.1691, + "rewards/chosen": 1.6784251076834542, + "rewards/margins": 8.474131114899166, + "rewards/rejected": -6.795706007215712, + "step": 2407 + }, + { + "epoch": 0.888929906326427, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 3.125924424507182e-07, + "logits/chosen": 2357175552.0, + "logits/rejected": 2633788160.0, + "logps/chosen": -244.996337890625, + "logps/rejected": -502.7215881347656, + "loss": 0.1121, + "rewards/chosen": 1.8588197231292725, + "rewards/margins": 10.082250356674194, + "rewards/rejected": -8.223430633544922, + "step": 2408 + }, + { + "epoch": 0.8892990632642702, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 3.1054738652769256e-07, + "logits/chosen": 1317879567.0588236, + "logits/rejected": 1755564578.1333334, + "logps/chosen": -273.10377412683823, + "logps/rejected": -411.03623046875, + "loss": 0.1538, + "rewards/chosen": 1.4238368763643152, + "rewards/margins": 9.242520246318742, + "rewards/rejected": -7.818683369954427, + "step": 2409 + }, + { + "epoch": 0.8896682202021134, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 3.0850882778168333e-07, + "logits/chosen": 2557356218.181818, + "logits/rejected": 1544267093.3333333, + "logps/chosen": -240.57832475142047, + "logps/rejected": -449.38006882440476, + "loss": 0.1255, + "rewards/chosen": 1.4032977711070667, + "rewards/margins": 8.724813535615995, + "rewards/rejected": -7.321515764508929, + "step": 2410 + }, + { + "epoch": 0.8900373771399567, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 3.0647676903708846e-07, + "logits/chosen": 1878793088.0, + "logits/rejected": 2268934144.0, + "logps/chosen": -289.04931640625, + "logps/rejected": -450.2090148925781, + "loss": 0.1699, + "rewards/chosen": 1.0630583763122559, + "rewards/margins": 8.07008695602417, + "rewards/rejected": -7.007028579711914, + "step": 2411 + }, + { + "epoch": 0.8904065340777998, + "grad_norm": 11.8125, + "kl": 0.0027685165405273438, + "learning_rate": 3.044512131092997e-07, + "logits/chosen": 1988423248.8421052, + "logits/rejected": 1853563825.2307692, + "logps/chosen": -278.4112613075658, + "logps/rejected": -585.0461989182693, + "loss": 0.1431, + "rewards/chosen": 1.4456566258480674, + "rewards/margins": 11.096698683765736, + "rewards/rejected": -9.651042057917667, + "step": 2412 + }, + { + "epoch": 0.890775691015643, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 3.0243216280469834e-07, + "logits/chosen": 2318195873.6842103, + "logits/rejected": 2058958690.4615386, + "logps/chosen": -320.1441200657895, + "logps/rejected": -434.5381610576923, + "loss": 0.211, + "rewards/chosen": 0.9797908381411904, + "rewards/margins": 8.900473305088305, + "rewards/rejected": -7.920682466947115, + "step": 2413 + }, + { + "epoch": 0.8911448479534863, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 3.004196209206539e-07, + "logits/chosen": 1625776512.0, + "logits/rejected": 1436812032.0, + "logps/chosen": -251.1259765625, + "logps/rejected": -453.0745544433594, + "loss": 0.163, + "rewards/chosen": 1.0825763940811157, + "rewards/margins": 8.050191044807434, + "rewards/rejected": -6.967614650726318, + "step": 2414 + }, + { + "epoch": 0.8915140048913295, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.984135902455171e-07, + "logits/chosen": 2189321054.3157897, + "logits/rejected": 2618992167.3846154, + "logps/chosen": -316.30751439144734, + "logps/rejected": -539.7448542668269, + "loss": 0.1362, + "rewards/chosen": 1.646677719919305, + "rewards/margins": 9.293852848562635, + "rewards/rejected": -7.64717512864333, + "step": 2415 + }, + { + "epoch": 0.8918831618291726, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 2.9641407355861796e-07, + "logits/chosen": 1354237269.3333333, + "logits/rejected": 1338761928.347826, + "logps/chosen": -247.88129340277777, + "logps/rejected": -338.7960045855978, + "loss": 0.1259, + "rewards/chosen": 0.709000375535753, + "rewards/margins": 7.348879021722913, + "rewards/rejected": -6.63987864618716, + "step": 2416 + }, + { + "epoch": 0.8922523187670158, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 2.9442107363026106e-07, + "logits/chosen": 2083327044.2666667, + "logits/rejected": 1948074465.8823528, + "logps/chosen": -288.9009114583333, + "logps/rejected": -439.94025735294116, + "loss": 0.1599, + "rewards/chosen": 1.2510449727376303, + "rewards/margins": 7.774328089695351, + "rewards/rejected": -6.523283116957721, + "step": 2417 + }, + { + "epoch": 0.8926214757048591, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 2.9243459322172317e-07, + "logits/chosen": 1628617113.6, + "logits/rejected": 1442201479.5294118, + "logps/chosen": -261.437109375, + "logps/rejected": -450.48316865808823, + "loss": 0.1289, + "rewards/chosen": 1.5159439086914062, + "rewards/margins": 9.481002179314109, + "rewards/rejected": -7.965058270622702, + "step": 2418 + }, + { + "epoch": 0.8929906326427022, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.904546350852472e-07, + "logits/chosen": 2352416256.0, + "logits/rejected": 2081082624.0, + "logps/chosen": -271.9472351074219, + "logps/rejected": -507.3582763671875, + "loss": 0.1332, + "rewards/chosen": 1.6886239051818848, + "rewards/margins": 10.054550647735596, + "rewards/rejected": -8.365926742553711, + "step": 2419 + }, + { + "epoch": 0.8933597895805454, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 2.884812019640404e-07, + "logits/chosen": 1238003712.0, + "logits/rejected": 1769259008.0, + "logps/chosen": -268.67684500558033, + "logps/rejected": -463.57655164930554, + "loss": 0.1475, + "rewards/chosen": 1.0226571219308036, + "rewards/margins": 8.794436500186011, + "rewards/rejected": -7.771779378255208, + "step": 2420 + }, + { + "epoch": 0.8937289465183886, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 2.8651429659226906e-07, + "logits/chosen": 1442927001.6, + "logits/rejected": 1594588461.1764705, + "logps/chosen": -317.57578125, + "logps/rejected": -397.64990234375, + "loss": 0.1421, + "rewards/chosen": 1.389474105834961, + "rewards/margins": 7.608068376428941, + "rewards/rejected": -6.21859427059398, + "step": 2421 + }, + { + "epoch": 0.8940981034562319, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.8455392169505546e-07, + "logits/chosen": 2407253895.529412, + "logits/rejected": 1569752132.2666667, + "logps/chosen": -284.6984432444853, + "logps/rejected": -438.2962239583333, + "loss": 0.1802, + "rewards/chosen": 1.0110668855554916, + "rewards/margins": 9.23271961586148, + "rewards/rejected": -8.221652730305989, + "step": 2422 + }, + { + "epoch": 0.894467260394075, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 2.826000799884737e-07, + "logits/chosen": 1579644196.5714285, + "logits/rejected": 1484120064.0, + "logps/chosen": -304.26210239955356, + "logps/rejected": -438.62830946180554, + "loss": 0.1368, + "rewards/chosen": 1.5689071927751814, + "rewards/margins": 8.159998348781041, + "rewards/rejected": -6.591091156005859, + "step": 2423 + }, + { + "epoch": 0.8948364173319182, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 2.806527741795478e-07, + "logits/chosen": 1761523143.1111112, + "logits/rejected": 2580986441.142857, + "logps/chosen": -272.2057291666667, + "logps/rejected": -458.7041015625, + "loss": 0.1233, + "rewards/chosen": 1.8270015716552734, + "rewards/margins": 9.836800983973912, + "rewards/rejected": -8.009799412318639, + "step": 2424 + }, + { + "epoch": 0.8952055742697614, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 2.787120069662441e-07, + "logits/chosen": 2005606400.0, + "logits/rejected": 1904080896.0, + "logps/chosen": -234.320068359375, + "logps/rejected": -451.3134765625, + "loss": 0.1968, + "rewards/chosen": 1.0836851331922743, + "rewards/margins": 7.890010228232732, + "rewards/rejected": -6.8063250950404575, + "step": 2425 + }, + { + "epoch": 0.8955747312076047, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 2.767777810374722e-07, + "logits/chosen": 1512564931.047619, + "logits/rejected": 1491951429.8181818, + "logps/chosen": -319.5868210565476, + "logps/rejected": -432.8352716619318, + "loss": 0.1904, + "rewards/chosen": 1.4832215082077753, + "rewards/margins": 8.41211971266445, + "rewards/rejected": -6.928898204456676, + "step": 2426 + }, + { + "epoch": 0.8959438881454478, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 2.748500990730768e-07, + "logits/chosen": 1446496392.5333333, + "logits/rejected": 1228435817.4117646, + "logps/chosen": -298.74033203125, + "logps/rejected": -379.4883386948529, + "loss": 0.1408, + "rewards/chosen": 1.2336742401123046, + "rewards/margins": 8.677345073924345, + "rewards/rejected": -7.443670833812041, + "step": 2427 + }, + { + "epoch": 0.896313045083291, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 2.7292896374383595e-07, + "logits/chosen": 1693396081.7777777, + "logits/rejected": 2274400402.285714, + "logps/chosen": -270.5152994791667, + "logps/rejected": -501.08768136160717, + "loss": 0.2107, + "rewards/chosen": 0.975378672281901, + "rewards/margins": 9.286158788771855, + "rewards/rejected": -8.310780116489955, + "step": 2428 + }, + { + "epoch": 0.8966822020211342, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 2.710143777114588e-07, + "logits/chosen": 2222628317.866667, + "logits/rejected": 1947080583.5294118, + "logps/chosen": -262.7497233072917, + "logps/rejected": -440.43781594669116, + "loss": 0.1599, + "rewards/chosen": 1.1009751637776692, + "rewards/margins": 8.040750518499635, + "rewards/rejected": -6.9397753547219665, + "step": 2429 + }, + { + "epoch": 0.8970513589589775, + "grad_norm": 10.4375, + "kl": 1.6933422088623047, + "learning_rate": 2.691063436285812e-07, + "logits/chosen": 2380392038.4, + "logits/rejected": 1447307946.6666667, + "logps/chosen": -244.466357421875, + "logps/rejected": -452.521484375, + "loss": 0.1758, + "rewards/chosen": 2.300667953491211, + "rewards/margins": 10.624505869547527, + "rewards/rejected": -8.323837916056315, + "step": 2430 + }, + { + "epoch": 0.8974205158968206, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 2.672048641387581e-07, + "logits/chosen": 1397822733.4736843, + "logits/rejected": 2138261031.3846154, + "logps/chosen": -297.6440172697368, + "logps/rejected": -478.55288461538464, + "loss": 0.1687, + "rewards/chosen": 1.9016047025981702, + "rewards/margins": 9.181402770131223, + "rewards/rejected": -7.2797980675330525, + "step": 2431 + }, + { + "epoch": 0.8977896728346638, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 2.6530994187646653e-07, + "logits/chosen": 1431866026.6666667, + "logits/rejected": 1530204904.7272727, + "logps/chosen": -264.12867373511904, + "logps/rejected": -462.2943004261364, + "loss": 0.2359, + "rewards/chosen": 0.8601154145740327, + "rewards/margins": 9.226322190586107, + "rewards/rejected": -8.366206776012074, + "step": 2432 + }, + { + "epoch": 0.898158829772507, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 2.6342157946709745e-07, + "logits/chosen": 1892260107.1304348, + "logits/rejected": 1638468494.2222223, + "logps/chosen": -247.4190090013587, + "logps/rejected": -582.3811306423611, + "loss": 0.2013, + "rewards/chosen": 1.6722674162491509, + "rewards/margins": 12.723286025190122, + "rewards/rejected": -11.051018608940971, + "step": 2433 + }, + { + "epoch": 0.8985279867103503, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 2.615397795269514e-07, + "logits/chosen": 1884289316.5714285, + "logits/rejected": 1898857927.1111112, + "logps/chosen": -365.3566196986607, + "logps/rejected": -485.38178168402777, + "loss": 0.1371, + "rewards/chosen": 1.9506340026855469, + "rewards/margins": 10.798213958740234, + "rewards/rejected": -8.847579956054688, + "step": 2434 + }, + { + "epoch": 0.8988971436481934, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 2.5966454466323956e-07, + "logits/chosen": 2075704173.7142856, + "logits/rejected": 1706942008.8888888, + "logps/chosen": -236.86296735491072, + "logps/rejected": -454.2117513020833, + "loss": 0.1022, + "rewards/chosen": 1.8704306738717216, + "rewards/margins": 10.652814910525368, + "rewards/rejected": -8.782384236653646, + "step": 2435 + }, + { + "epoch": 0.8992663005860366, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 2.577958774740763e-07, + "logits/chosen": 1899878400.0, + "logits/rejected": 1591474312.5333333, + "logps/chosen": -234.92885454963235, + "logps/rejected": -436.33294270833335, + "loss": 0.1499, + "rewards/chosen": 1.3571586608886719, + "rewards/margins": 7.729491933186849, + "rewards/rejected": -6.372333272298177, + "step": 2436 + }, + { + "epoch": 0.8996354575238799, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.5593378054847516e-07, + "logits/chosen": 1945353294.7692308, + "logits/rejected": 1924675045.0526316, + "logps/chosen": -304.52779447115387, + "logps/rejected": -458.66591282894734, + "loss": 0.1295, + "rewards/chosen": 1.162974210885855, + "rewards/margins": 9.877641280170394, + "rewards/rejected": -8.71466706928454, + "step": 2437 + }, + { + "epoch": 0.9000046144617231, + "grad_norm": 11.5625, + "kl": 0.6711900234222412, + "learning_rate": 2.54078256466348e-07, + "logits/chosen": 1600265011.2, + "logits/rejected": 2116811113.4117646, + "logps/chosen": -316.5838216145833, + "logps/rejected": -490.2836052389706, + "loss": 0.1386, + "rewards/chosen": 1.8469988505045574, + "rewards/margins": 10.259621638877718, + "rewards/rejected": -8.412622788373161, + "step": 2438 + }, + { + "epoch": 0.9003737713995662, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 2.522293077985011e-07, + "logits/chosen": 1314000256.0, + "logits/rejected": 1533681280.0, + "logps/chosen": -239.50013732910156, + "logps/rejected": -499.9006042480469, + "loss": 0.1297, + "rewards/chosen": 1.9939954280853271, + "rewards/margins": 9.697562456130981, + "rewards/rejected": -7.703567028045654, + "step": 2439 + }, + { + "epoch": 0.9007429283374094, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 2.5038693710662754e-07, + "logits/chosen": 2069923020.8, + "logits/rejected": 2120760621.1764705, + "logps/chosen": -287.98365885416666, + "logps/rejected": -505.8759191176471, + "loss": 0.1581, + "rewards/chosen": 1.1110677083333333, + "rewards/margins": 8.83211191214767, + "rewards/rejected": -7.721044203814338, + "step": 2440 + }, + { + "epoch": 0.9011120852752527, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 2.4855114694330995e-07, + "logits/chosen": 1713692398.9333334, + "logits/rejected": 1604152139.2941177, + "logps/chosen": -256.806591796875, + "logps/rejected": -462.39797794117646, + "loss": 0.1519, + "rewards/chosen": 1.1048998514811197, + "rewards/margins": 8.282156716141046, + "rewards/rejected": -7.177256864659927, + "step": 2441 + }, + { + "epoch": 0.9014812422130959, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 2.467219398520121e-07, + "logits/chosen": 1790987605.3333333, + "logits/rejected": 2226129519.304348, + "logps/chosen": -215.99979654947916, + "logps/rejected": -462.1782863451087, + "loss": 0.1138, + "rewards/chosen": 1.2264383104112413, + "rewards/margins": 8.334421443478497, + "rewards/rejected": -7.107983133067256, + "step": 2442 + }, + { + "epoch": 0.901850399150939, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 2.448993183670756e-07, + "logits/chosen": 1392677774.2222223, + "logits/rejected": 1405760219.4285715, + "logps/chosen": -287.883544921875, + "logps/rejected": -549.7405133928571, + "loss": 0.157, + "rewards/chosen": 1.700225830078125, + "rewards/margins": 11.421756199428014, + "rewards/rejected": -9.721530369349889, + "step": 2443 + }, + { + "epoch": 0.9022195560887822, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.4308328501372213e-07, + "logits/chosen": 1643112555.7894738, + "logits/rejected": 1463200689.2307692, + "logps/chosen": -247.60127981085526, + "logps/rejected": -462.02944711538464, + "loss": 0.1555, + "rewards/chosen": 1.8118442736173932, + "rewards/margins": 9.254632617780555, + "rewards/rejected": -7.442788344163161, + "step": 2444 + }, + { + "epoch": 0.9025887130266255, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.4127384230803963e-07, + "logits/chosen": 1888837248.0, + "logits/rejected": 1435241472.0, + "logps/chosen": -237.69915771484375, + "logps/rejected": -532.619140625, + "loss": 0.1535, + "rewards/chosen": 1.1428520679473877, + "rewards/margins": 9.465543985366821, + "rewards/rejected": -8.322691917419434, + "step": 2445 + }, + { + "epoch": 0.9029578699644687, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 2.3947099275699103e-07, + "logits/chosen": 1792095573.3333333, + "logits/rejected": 1796104192.0, + "logps/chosen": -233.29597981770834, + "logps/rejected": -459.2489372702206, + "loss": 0.1666, + "rewards/chosen": 0.8635486602783203, + "rewards/margins": 7.75775229510139, + "rewards/rejected": -6.89420363482307, + "step": 2446 + }, + { + "epoch": 0.9033270269023118, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 2.3767473885839943e-07, + "logits/chosen": 1661485397.3333333, + "logits/rejected": 1485020160.0, + "logps/chosen": -365.9834798177083, + "logps/rejected": -514.316357421875, + "loss": 0.0995, + "rewards/chosen": 1.6763308842976887, + "rewards/margins": 8.943765576680502, + "rewards/rejected": -7.267434692382812, + "step": 2447 + }, + { + "epoch": 0.903696183840155, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 2.3588508310095183e-07, + "logits/chosen": 1683683072.0, + "logits/rejected": 1317178368.0, + "logps/chosen": -224.4346923828125, + "logps/rejected": -374.24822998046875, + "loss": 0.1431, + "rewards/chosen": 1.5019068717956543, + "rewards/margins": 7.55769681930542, + "rewards/rejected": -6.055789947509766, + "step": 2448 + }, + { + "epoch": 0.9040653407779983, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 2.3410202796419534e-07, + "logits/chosen": 1671166244.5714285, + "logits/rejected": 1513986161.7777777, + "logps/chosen": -319.64871651785717, + "logps/rejected": -459.67588975694446, + "loss": 0.143, + "rewards/chosen": 1.543597902570452, + "rewards/margins": 7.916916044931563, + "rewards/rejected": -6.373318142361111, + "step": 2449 + }, + { + "epoch": 0.9044344977158415, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 2.3232557591852777e-07, + "logits/chosen": 2264540774.4, + "logits/rejected": 2309130922.6666665, + "logps/chosen": -369.38525390625, + "logps/rejected": -513.3659261067709, + "loss": 0.1399, + "rewards/chosen": 1.563953971862793, + "rewards/margins": 10.493244743347168, + "rewards/rejected": -8.929290771484375, + "step": 2450 + }, + { + "epoch": 0.9048036546536846, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 2.3055572942520256e-07, + "logits/chosen": 1854236876.8, + "logits/rejected": 1527129941.3333333, + "logps/chosen": -260.2765625, + "logps/rejected": -405.5936686197917, + "loss": 0.2154, + "rewards/chosen": 0.8807446479797363, + "rewards/margins": 9.147856744130452, + "rewards/rejected": -8.267112096150717, + "step": 2451 + }, + { + "epoch": 0.9051728115915278, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 2.2879249093631928e-07, + "logits/chosen": 1647724228.9230769, + "logits/rejected": 1499533312.0, + "logps/chosen": -266.2942645733173, + "logps/rejected": -375.90072471217104, + "loss": 0.126, + "rewards/chosen": 1.3770973499004657, + "rewards/margins": 8.24566376257522, + "rewards/rejected": -6.868566412674753, + "step": 2452 + }, + { + "epoch": 0.9055419685293711, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 2.2703586289482215e-07, + "logits/chosen": 1863582924.8, + "logits/rejected": 1793068333.1764705, + "logps/chosen": -252.85608723958333, + "logps/rejected": -524.2092715992648, + "loss": 0.1203, + "rewards/chosen": 2.0181713104248047, + "rewards/margins": 11.756655412561754, + "rewards/rejected": -9.73848410213695, + "step": 2453 + }, + { + "epoch": 0.9059111254672142, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 2.2528584773449657e-07, + "logits/chosen": 1925358933.3333333, + "logits/rejected": 1746871340.5217392, + "logps/chosen": -372.52143012152777, + "logps/rejected": -488.4709154211956, + "loss": 0.0969, + "rewards/chosen": 1.1510911517673068, + "rewards/margins": 8.716429949958544, + "rewards/rejected": -7.565338798191236, + "step": 2454 + }, + { + "epoch": 0.9062802824050574, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 2.2354244787996748e-07, + "logits/chosen": 1633416923.4285715, + "logits/rejected": 2051785159.1111112, + "logps/chosen": -278.32400948660717, + "logps/rejected": -456.7923177083333, + "loss": 0.1364, + "rewards/chosen": 1.1793252399989538, + "rewards/margins": 8.261780769105942, + "rewards/rejected": -7.082455529106988, + "step": 2455 + }, + { + "epoch": 0.9066494393429007, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 2.2180566574669215e-07, + "logits/chosen": 1908952177.7777777, + "logits/rejected": 2220778788.571429, + "logps/chosen": -317.8469509548611, + "logps/rejected": -498.87991768973217, + "loss": 0.1964, + "rewards/chosen": 0.9111866421169705, + "rewards/margins": 8.211839827280196, + "rewards/rejected": -7.300653185163226, + "step": 2456 + }, + { + "epoch": 0.9070185962807439, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 2.2007550374096077e-07, + "logits/chosen": 1661510724.2666667, + "logits/rejected": 1678921246.1176472, + "logps/chosen": -284.16442057291664, + "logps/rejected": -414.00695082720586, + "loss": 0.1654, + "rewards/chosen": 1.2073599497477214, + "rewards/margins": 9.547140869439817, + "rewards/rejected": -8.339780919692096, + "step": 2457 + }, + { + "epoch": 0.907387753218587, + "grad_norm": 10.5, + "kl": 0.08645820617675781, + "learning_rate": 2.183519642598908e-07, + "logits/chosen": 1387377920.0, + "logits/rejected": 2176486656.0, + "logps/chosen": -240.45587158203125, + "logps/rejected": -401.5126953125, + "loss": 0.1646, + "rewards/chosen": 1.4679962396621704, + "rewards/margins": 8.526912331581116, + "rewards/rejected": -7.058916091918945, + "step": 2458 + }, + { + "epoch": 0.9077569101564302, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.1663504969142378e-07, + "logits/chosen": 1496554496.0, + "logits/rejected": 1762009315.5555556, + "logps/chosen": -290.9143763950893, + "logps/rejected": -433.26226128472223, + "loss": 0.105, + "rewards/chosen": 1.6283118384225028, + "rewards/margins": 9.998132327246287, + "rewards/rejected": -8.369820488823784, + "step": 2459 + }, + { + "epoch": 0.9081260670942735, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.1492476241432303e-07, + "logits/chosen": 1441296091.4285715, + "logits/rejected": 1914068081.7777777, + "logps/chosen": -286.5906982421875, + "logps/rejected": -464.94276258680554, + "loss": 0.1325, + "rewards/chosen": 1.0794450896126884, + "rewards/margins": 8.79949717294602, + "rewards/rejected": -7.720052083333333, + "step": 2460 + }, + { + "epoch": 0.9084952240321167, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 2.1322110479817138e-07, + "logits/chosen": 1767157418.6666667, + "logits/rejected": 1848959561.142857, + "logps/chosen": -253.30154079861111, + "logps/rejected": -470.2975376674107, + "loss": 0.1347, + "rewards/chosen": 1.8138071695963542, + "rewards/margins": 9.216707320440383, + "rewards/rejected": -7.402900150844029, + "step": 2461 + }, + { + "epoch": 0.9088643809699598, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 2.1152407920336348e-07, + "logits/chosen": 2127126300.4444444, + "logits/rejected": 1307760493.7142856, + "logps/chosen": -315.0992838541667, + "logps/rejected": -489.08265904017856, + "loss": 0.1494, + "rewards/chosen": 1.37702390882704, + "rewards/margins": 10.122246817936972, + "rewards/rejected": -8.745222909109932, + "step": 2462 + }, + { + "epoch": 0.909233537907803, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 2.0983368798110582e-07, + "logits/chosen": 1841129344.0, + "logits/rejected": 2492907008.0, + "logps/chosen": -248.83563232421875, + "logps/rejected": -495.1354675292969, + "loss": 0.117, + "rewards/chosen": 1.7059502601623535, + "rewards/margins": 10.26383352279663, + "rewards/rejected": -8.557883262634277, + "step": 2463 + }, + { + "epoch": 0.9096026948456463, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 2.081499334734155e-07, + "logits/chosen": 2291434917.647059, + "logits/rejected": 1782837384.5333333, + "logps/chosen": -209.06870404411765, + "logps/rejected": -561.8498046875, + "loss": 0.1449, + "rewards/chosen": 1.5576727250043083, + "rewards/margins": 10.97796894896264, + "rewards/rejected": -9.420296223958333, + "step": 2464 + }, + { + "epoch": 0.9099718517834895, + "grad_norm": 11.375, + "kl": 0.3632926940917969, + "learning_rate": 2.0647281801311257e-07, + "logits/chosen": 1201353426.8235295, + "logits/rejected": 1441794594.1333334, + "logps/chosen": -286.09015969669116, + "logps/rejected": -578.8830078125, + "loss": 0.1567, + "rewards/chosen": 1.4555932213278377, + "rewards/margins": 10.224601603489296, + "rewards/rejected": -8.769008382161458, + "step": 2465 + }, + { + "epoch": 0.9103410087213326, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.0480234392381893e-07, + "logits/chosen": 1275560277.3333333, + "logits/rejected": 1606855972.5714285, + "logps/chosen": -307.92217339409723, + "logps/rejected": -388.3887416294643, + "loss": 0.1412, + "rewards/chosen": 1.4570189581976996, + "rewards/margins": 9.078269110785591, + "rewards/rejected": -7.621250152587891, + "step": 2466 + }, + { + "epoch": 0.9107101656591758, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 2.031385135199554e-07, + "logits/chosen": 2001925820.631579, + "logits/rejected": 1740885858.4615386, + "logps/chosen": -360.5844469572368, + "logps/rejected": -543.7181490384615, + "loss": 0.2092, + "rewards/chosen": 0.947381571719521, + "rewards/margins": 8.34881448938779, + "rewards/rejected": -7.401432917668269, + "step": 2467 + }, + { + "epoch": 0.9110793225970191, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 2.0148132910673857e-07, + "logits/chosen": 1532331520.0, + "logits/rejected": 1812700928.0, + "logps/chosen": -262.7412109375, + "logps/rejected": -422.3827819824219, + "loss": 0.1336, + "rewards/chosen": 1.6047356128692627, + "rewards/margins": 9.359591722488403, + "rewards/rejected": -7.754856109619141, + "step": 2468 + }, + { + "epoch": 0.9114484795348623, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.9983079298017517e-07, + "logits/chosen": 1683844827.4285715, + "logits/rejected": 1259961571.5555556, + "logps/chosen": -280.4531947544643, + "logps/rejected": -409.26942274305554, + "loss": 0.1339, + "rewards/chosen": 1.3443912778581892, + "rewards/margins": 8.912581110757495, + "rewards/rejected": -7.568189832899305, + "step": 2469 + }, + { + "epoch": 0.9118176364727054, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.9818690742706258e-07, + "logits/chosen": 1476813312.0, + "logits/rejected": 1807882496.0, + "logps/chosen": -258.25531005859375, + "logps/rejected": -533.0360107421875, + "loss": 0.1814, + "rewards/chosen": 0.8315480947494507, + "rewards/margins": 8.956957221031189, + "rewards/rejected": -8.125409126281738, + "step": 2470 + }, + { + "epoch": 0.9121867934105486, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 1.9654967472498342e-07, + "logits/chosen": 1164423383.5789473, + "logits/rejected": 2158041560.6153846, + "logps/chosen": -287.85166529605266, + "logps/rejected": -676.4672475961538, + "loss": 0.1616, + "rewards/chosen": 1.6681529597232216, + "rewards/margins": 9.971518813839808, + "rewards/rejected": -8.303365854116587, + "step": 2471 + }, + { + "epoch": 0.9125559503483919, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 1.9491909714230207e-07, + "logits/chosen": 1581313536.0, + "logits/rejected": 1897307818.6666667, + "logps/chosen": -248.675830078125, + "logps/rejected": -520.8940022786459, + "loss": 0.1789, + "rewards/chosen": 1.4349849700927735, + "rewards/margins": 9.664424387613932, + "rewards/rejected": -8.229439417521158, + "step": 2472 + }, + { + "epoch": 0.9129251072862351, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 1.9329517693816468e-07, + "logits/chosen": 2105907609.6, + "logits/rejected": 2176154684.2352943, + "logps/chosen": -286.38365885416664, + "logps/rejected": -600.4931640625, + "loss": 0.1128, + "rewards/chosen": 2.0446568806966146, + "rewards/margins": 10.442415813371246, + "rewards/rejected": -8.397758932674632, + "step": 2473 + }, + { + "epoch": 0.9132942642240782, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.9167791636249044e-07, + "logits/chosen": 1716825788.631579, + "logits/rejected": 1807465078.1538463, + "logps/chosen": -253.7922491776316, + "logps/rejected": -520.0433443509615, + "loss": 0.1336, + "rewards/chosen": 1.934727718955592, + "rewards/margins": 10.58750810507338, + "rewards/rejected": -8.652780386117788, + "step": 2474 + }, + { + "epoch": 0.9136634211619215, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 1.900673176559742e-07, + "logits/chosen": 1830785570.1333334, + "logits/rejected": 2136510704.9411764, + "logps/chosen": -367.49560546875, + "logps/rejected": -435.7543083639706, + "loss": 0.133, + "rewards/chosen": 1.3351568857828775, + "rewards/margins": 8.29999973820705, + "rewards/rejected": -6.964842852424173, + "step": 2475 + }, + { + "epoch": 0.9140325780997647, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 1.8846338305007984e-07, + "logits/chosen": 1801498496.0, + "logits/rejected": 1836694016.0, + "logps/chosen": -246.25527954101562, + "logps/rejected": -406.3639221191406, + "loss": 0.0919, + "rewards/chosen": 2.1307334899902344, + "rewards/margins": 10.13292407989502, + "rewards/rejected": -8.002190589904785, + "step": 2476 + }, + { + "epoch": 0.9144017350376079, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 1.868661147670381e-07, + "logits/chosen": 1532950528.0, + "logits/rejected": 1950822058.6666667, + "logps/chosen": -257.955859375, + "logps/rejected": -437.9686686197917, + "loss": 0.1952, + "rewards/chosen": 1.1670004844665527, + "rewards/margins": 9.135901165008544, + "rewards/rejected": -7.968900680541992, + "step": 2477 + }, + { + "epoch": 0.914770891975451, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 1.852755150198443e-07, + "logits/chosen": 1775914646.5882354, + "logits/rejected": 1937744691.2, + "logps/chosen": -296.97150735294116, + "logps/rejected": -469.86334635416665, + "loss": 0.1743, + "rewards/chosen": 1.0617474948658663, + "rewards/margins": 7.737485003003887, + "rewards/rejected": -6.675737508138021, + "step": 2478 + }, + { + "epoch": 0.9151400489132943, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 1.83691586012254e-07, + "logits/chosen": 1888723626.6666667, + "logits/rejected": 1766717732.5714285, + "logps/chosen": -258.3428005642361, + "logps/rejected": -431.85121372767856, + "loss": 0.1648, + "rewards/chosen": 1.0903806686401367, + "rewards/margins": 9.570606367928642, + "rewards/rejected": -8.480225699288505, + "step": 2479 + }, + { + "epoch": 0.9155092058511375, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.8211432993878063e-07, + "logits/chosen": 2312955136.0, + "logits/rejected": 1852360704.0, + "logps/chosen": -239.77610778808594, + "logps/rejected": -465.4001770019531, + "loss": 0.132, + "rewards/chosen": 1.6288495063781738, + "rewards/margins": 9.86661958694458, + "rewards/rejected": -8.237770080566406, + "step": 2480 + }, + { + "epoch": 0.9158783627889807, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.8054374898469228e-07, + "logits/chosen": 1422038926.2222223, + "logits/rejected": 1663815972.5714285, + "logps/chosen": -254.03114149305554, + "logps/rejected": -422.11021205357144, + "loss": 0.1429, + "rewards/chosen": 1.8526333702935114, + "rewards/margins": 9.440686392405677, + "rewards/rejected": -7.588053022112165, + "step": 2481 + }, + { + "epoch": 0.9162475197268238, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.7897984532600943e-07, + "logits/chosen": 1554092672.0, + "logits/rejected": 2687910400.0, + "logps/chosen": -280.4490051269531, + "logps/rejected": -549.9448852539062, + "loss": 0.1382, + "rewards/chosen": 1.7622506618499756, + "rewards/margins": 9.657455682754517, + "rewards/rejected": -7.895205020904541, + "step": 2482 + }, + { + "epoch": 0.9166166766646671, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.7742262112950047e-07, + "logits/chosen": 2161891913.142857, + "logits/rejected": 1949268650.6666667, + "logps/chosen": -341.1492396763393, + "logps/rejected": -497.86143663194446, + "loss": 0.0809, + "rewards/chosen": 2.1776255198887418, + "rewards/margins": 10.661237580435618, + "rewards/rejected": -8.483612060546875, + "step": 2483 + }, + { + "epoch": 0.9169858336025103, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 1.7587207855267962e-07, + "logits/chosen": 1737520049.2307692, + "logits/rejected": 1793788550.7368422, + "logps/chosen": -274.2716533954327, + "logps/rejected": -435.927734375, + "loss": 0.0923, + "rewards/chosen": 2.13893068753756, + "rewards/margins": 8.439144149965603, + "rewards/rejected": -6.300213462428043, + "step": 2484 + }, + { + "epoch": 0.9173549905403535, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 1.7432821974380343e-07, + "logits/chosen": 1555024817.2307692, + "logits/rejected": 1615904121.2631578, + "logps/chosen": -212.91892653245193, + "logps/rejected": -407.2915553042763, + "loss": 0.0941, + "rewards/chosen": 1.72129880464994, + "rewards/margins": 9.399316671888839, + "rewards/rejected": -7.678017867238898, + "step": 2485 + }, + { + "epoch": 0.9177241474781966, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 1.7279104684187032e-07, + "logits/chosen": 1900024832.0, + "logits/rejected": 1475833344.0, + "logps/chosen": -304.79033203125, + "logps/rejected": -530.4561360677084, + "loss": 0.1614, + "rewards/chosen": 2.0264732360839846, + "rewards/margins": 11.066488138834636, + "rewards/rejected": -9.04001490275065, + "step": 2486 + }, + { + "epoch": 0.9180933044160399, + "grad_norm": 12.5625, + "kl": 0.06975507736206055, + "learning_rate": 1.7126056197661222e-07, + "logits/chosen": 1825036760.6153846, + "logits/rejected": 2965846339.368421, + "logps/chosen": -270.32861328125, + "logps/rejected": -524.2323190789474, + "loss": 0.1498, + "rewards/chosen": 1.2684915982759917, + "rewards/margins": 8.969131824941288, + "rewards/rejected": -7.700640226665296, + "step": 2487 + }, + { + "epoch": 0.9184624613538831, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 1.697367672684963e-07, + "logits/chosen": 2152467617.6842103, + "logits/rejected": 1869842274.4615386, + "logps/chosen": -340.01392886513156, + "logps/rejected": -514.3432241586538, + "loss": 0.1576, + "rewards/chosen": 1.4255049856085527, + "rewards/margins": 8.703423874580908, + "rewards/rejected": -7.277918888972356, + "step": 2488 + }, + { + "epoch": 0.9188316182917262, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 1.6821966482872264e-07, + "logits/chosen": 2042234060.8, + "logits/rejected": 2067105450.6666667, + "logps/chosen": -217.11181640625, + "logps/rejected": -463.0387369791667, + "loss": 0.1791, + "rewards/chosen": 1.2375211715698242, + "rewards/margins": 8.018508593241375, + "rewards/rejected": -6.78098742167155, + "step": 2489 + }, + { + "epoch": 0.9192007752295694, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 1.6670925675921545e-07, + "logits/chosen": 1447599887.0588236, + "logits/rejected": 1187774600.5333333, + "logps/chosen": -278.1966911764706, + "logps/rejected": -385.23193359375, + "loss": 0.1688, + "rewards/chosen": 1.5595718832576977, + "rewards/margins": 9.699043827430875, + "rewards/rejected": -8.139471944173177, + "step": 2490 + }, + { + "epoch": 0.9195699321674127, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 1.652055451526269e-07, + "logits/chosen": 2295229719.2727275, + "logits/rejected": 1434496146.2857144, + "logps/chosen": -189.2320223721591, + "logps/rejected": -555.2404203869048, + "loss": 0.1122, + "rewards/chosen": 1.209001367742365, + "rewards/margins": 13.898112986510966, + "rewards/rejected": -12.689111618768601, + "step": 2491 + }, + { + "epoch": 0.9199390891052559, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 1.637085320923304e-07, + "logits/chosen": 1795625187.5555556, + "logits/rejected": 2246221677.714286, + "logps/chosen": -330.7797037760417, + "logps/rejected": -447.63804408482144, + "loss": 0.1817, + "rewards/chosen": 1.0811634063720703, + "rewards/margins": 8.136516843523298, + "rewards/rejected": -7.055353437151227, + "step": 2492 + }, + { + "epoch": 0.920308246043099, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 1.6221821965241747e-07, + "logits/chosen": 2398104007.111111, + "logits/rejected": 1679352539.4285715, + "logps/chosen": -318.6738009982639, + "logps/rejected": -361.28857421875, + "loss": 0.141, + "rewards/chosen": 1.750884797837999, + "rewards/margins": 8.962543820577954, + "rewards/rejected": -7.211659022739956, + "step": 2493 + }, + { + "epoch": 0.9206774029809422, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 1.6073460989769806e-07, + "logits/chosen": 2155802308.923077, + "logits/rejected": 1581868085.8947368, + "logps/chosen": -269.55838716947113, + "logps/rejected": -452.7993935032895, + "loss": 0.0905, + "rewards/chosen": 2.3155574798583984, + "rewards/margins": 9.233397734792609, + "rewards/rejected": -6.917840254934211, + "step": 2494 + }, + { + "epoch": 0.9210465599187855, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 1.5925770488369517e-07, + "logits/chosen": 1718840621.1764705, + "logits/rejected": 1911485371.7333333, + "logps/chosen": -227.53415096507354, + "logps/rejected": -620.2972005208334, + "loss": 0.0863, + "rewards/chosen": 2.1539679134593293, + "rewards/margins": 10.413572812547871, + "rewards/rejected": -8.259604899088542, + "step": 2495 + }, + { + "epoch": 0.9214157168566287, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 1.577875066566409e-07, + "logits/chosen": 2037497075.8095238, + "logits/rejected": 2244022830.5454545, + "logps/chosen": -344.64678664434524, + "logps/rejected": -425.62704190340907, + "loss": 0.2245, + "rewards/chosen": 0.9767544156029111, + "rewards/margins": 7.497334360560298, + "rewards/rejected": -6.520579944957387, + "step": 2496 + }, + { + "epoch": 0.9217848737944718, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 1.563240172534758e-07, + "logits/chosen": 2736045202.285714, + "logits/rejected": 3012804152.888889, + "logps/chosen": -235.52924455915178, + "logps/rejected": -488.8466796875, + "loss": 0.1602, + "rewards/chosen": 0.7102781704493931, + "rewards/margins": 8.715984957558769, + "rewards/rejected": -8.005706787109375, + "step": 2497 + }, + { + "epoch": 0.922154030732315, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.5486723870184684e-07, + "logits/chosen": 1684408162.4615386, + "logits/rejected": 1658736208.8421052, + "logps/chosen": -265.5588566706731, + "logps/rejected": -466.0112561677632, + "loss": 0.108, + "rewards/chosen": 1.4830005352313702, + "rewards/margins": 10.090328448214512, + "rewards/rejected": -8.607327912983141, + "step": 2498 + }, + { + "epoch": 0.9225231876701583, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 1.5341717302010228e-07, + "logits/chosen": 2093067342.7692308, + "logits/rejected": 1394314617.2631578, + "logps/chosen": -235.0142540564904, + "logps/rejected": -455.36641652960526, + "loss": 0.078, + "rewards/chosen": 1.876913070678711, + "rewards/margins": 9.345277585481343, + "rewards/rejected": -7.468364514802632, + "step": 2499 + }, + { + "epoch": 0.9228923446080015, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 1.5197382221728896e-07, + "logits/chosen": 1399066717.090909, + "logits/rejected": 1719979212.8, + "logps/chosen": -265.5631658380682, + "logps/rejected": -573.389501953125, + "loss": 0.1748, + "rewards/chosen": 1.5402679443359375, + "rewards/margins": 8.868135070800781, + "rewards/rejected": -7.327867126464843, + "step": 2500 + }, + { + "epoch": 0.9232615015458446, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 1.505371882931511e-07, + "logits/chosen": 1563369179.4285715, + "logits/rejected": 1999577088.0, + "logps/chosen": -237.63544573102678, + "logps/rejected": -581.2690972222222, + "loss": 0.0999, + "rewards/chosen": 2.014077731541225, + "rewards/margins": 10.331600416274297, + "rewards/rejected": -8.317522684733072, + "step": 2501 + }, + { + "epoch": 0.9236306584836879, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 1.491072732381277e-07, + "logits/chosen": 1483516227.368421, + "logits/rejected": 1935184817.2307692, + "logps/chosen": -286.67300575657896, + "logps/rejected": -558.9635667067307, + "loss": 0.1522, + "rewards/chosen": 1.5843334197998047, + "rewards/margins": 11.75483512878418, + "rewards/rejected": -10.170501708984375, + "step": 2502 + }, + { + "epoch": 0.9239998154215311, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 1.476840790333467e-07, + "logits/chosen": 1723878272.0, + "logits/rejected": 1817583872.0, + "logps/chosen": -324.659423828125, + "logps/rejected": -534.4641723632812, + "loss": 0.138, + "rewards/chosen": 1.5856302976608276, + "rewards/margins": 9.756397128105164, + "rewards/rejected": -8.170766830444336, + "step": 2503 + }, + { + "epoch": 0.9243689723593743, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 1.4626760765062586e-07, + "logits/chosen": 1972003462.7368422, + "logits/rejected": 1393599251.6923077, + "logps/chosen": -292.0731907894737, + "logps/rejected": -489.08443509615387, + "loss": 0.2112, + "rewards/chosen": 0.9112708443089536, + "rewards/margins": 8.979826822937259, + "rewards/rejected": -8.068555978628305, + "step": 2504 + }, + { + "epoch": 0.9247381292972174, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 1.4485786105246923e-07, + "logits/chosen": 1762523136.0, + "logits/rejected": 2024422741.3333333, + "logps/chosen": -291.783203125, + "logps/rejected": -437.14117838541665, + "loss": 0.1507, + "rewards/chosen": 1.2987305136287914, + "rewards/margins": 8.761147607541552, + "rewards/rejected": -7.4624170939127605, + "step": 2505 + }, + { + "epoch": 0.9251072862350607, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.4345484119206222e-07, + "logits/chosen": 1701437921.8823528, + "logits/rejected": 2116950425.6, + "logps/chosen": -308.24865004595586, + "logps/rejected": -497.48352864583336, + "loss": 0.1528, + "rewards/chosen": 1.277144712560317, + "rewards/margins": 9.37134738996917, + "rewards/rejected": -8.094202677408854, + "step": 2506 + }, + { + "epoch": 0.9254764431729039, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 1.420585500132704e-07, + "logits/chosen": 1180129152.0, + "logits/rejected": 1284517376.0, + "logps/chosen": -191.07518005371094, + "logps/rejected": -425.614501953125, + "loss": 0.1437, + "rewards/chosen": 1.8553723096847534, + "rewards/margins": 9.564806342124939, + "rewards/rejected": -7.7094340324401855, + "step": 2507 + }, + { + "epoch": 0.9258456001107471, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 1.4066898945063856e-07, + "logits/chosen": 2607113947.428571, + "logits/rejected": 2637610780.4444447, + "logps/chosen": -369.8120814732143, + "logps/rejected": -549.5181749131945, + "loss": 0.1055, + "rewards/chosen": 1.5731003625052316, + "rewards/margins": 10.726795363047765, + "rewards/rejected": -9.153695000542534, + "step": 2508 + }, + { + "epoch": 0.9262147570485902, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 1.3928616142938445e-07, + "logits/chosen": 1406605627.0769231, + "logits/rejected": 2082255710.3157895, + "logps/chosen": -258.0966233473558, + "logps/rejected": -412.4898745888158, + "loss": 0.1523, + "rewards/chosen": 1.143973130446214, + "rewards/margins": 8.573230171975819, + "rewards/rejected": -7.429257041529605, + "step": 2509 + }, + { + "epoch": 0.9265839139864335, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 1.379100678653983e-07, + "logits/chosen": 2559587406.769231, + "logits/rejected": 2437588453.0526314, + "logps/chosen": -254.39963942307693, + "logps/rejected": -506.03536184210526, + "loss": 0.1083, + "rewards/chosen": 1.4899855393629808, + "rewards/margins": 9.825948738376137, + "rewards/rejected": -8.335963199013158, + "step": 2510 + }, + { + "epoch": 0.9269530709242767, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 1.3654071066524222e-07, + "logits/chosen": 1355319409.7777777, + "logits/rejected": 1447847312.6956522, + "logps/chosen": -216.51529947916666, + "logps/rejected": -439.45809273097825, + "loss": 0.1054, + "rewards/chosen": 0.891185548570421, + "rewards/margins": 8.375767537361181, + "rewards/rejected": -7.484581988790761, + "step": 2511 + }, + { + "epoch": 0.9273222278621199, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 1.3517809172614137e-07, + "logits/chosen": 1642498205.5384614, + "logits/rejected": 2078102366.3157895, + "logps/chosen": -216.64982722355768, + "logps/rejected": -470.69068667763156, + "loss": 0.1246, + "rewards/chosen": 1.5876045227050781, + "rewards/margins": 9.065030348928351, + "rewards/rejected": -7.477425826223273, + "step": 2512 + }, + { + "epoch": 0.927691384799963, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 1.3382221293598728e-07, + "logits/chosen": 1552554130.2857144, + "logits/rejected": 1414453020.4444444, + "logps/chosen": -270.29673549107144, + "logps/rejected": -359.9291178385417, + "loss": 0.136, + "rewards/chosen": 1.3484920774187361, + "rewards/margins": 8.46254813481891, + "rewards/rejected": -7.114056057400173, + "step": 2513 + }, + { + "epoch": 0.9280605417378063, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.3247307617333283e-07, + "logits/chosen": 2488664064.0, + "logits/rejected": 1562189004.8, + "logps/chosen": -243.47061695772058, + "logps/rejected": -471.09778645833336, + "loss": 0.1595, + "rewards/chosen": 1.847094591926126, + "rewards/margins": 11.052055732876648, + "rewards/rejected": -9.204961140950521, + "step": 2514 + }, + { + "epoch": 0.9284296986756495, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.3113068330739053e-07, + "logits/chosen": 1563340928.0, + "logits/rejected": 1579787136.0, + "logps/chosen": -314.41473388671875, + "logps/rejected": -559.5940551757812, + "loss": 0.1572, + "rewards/chosen": 1.2059489488601685, + "rewards/margins": 10.284816861152649, + "rewards/rejected": -9.07886791229248, + "step": 2515 + }, + { + "epoch": 0.9287988556134927, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 1.2979503619802715e-07, + "logits/chosen": 2044057892.5714285, + "logits/rejected": 2148248007.111111, + "logps/chosen": -238.96405901227678, + "logps/rejected": -415.7163899739583, + "loss": 0.1059, + "rewards/chosen": 1.4982728958129883, + "rewards/margins": 10.12384425269233, + "rewards/rejected": -8.625571356879341, + "step": 2516 + }, + { + "epoch": 0.9291680125513359, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.2846613669576678e-07, + "logits/chosen": 1999046070.857143, + "logits/rejected": 1939684920.8888888, + "logps/chosen": -297.84737723214283, + "logps/rejected": -555.3013780381945, + "loss": 0.1835, + "rewards/chosen": 0.5589901038578579, + "rewards/margins": 8.994154661420792, + "rewards/rejected": -8.435164557562935, + "step": 2517 + }, + { + "epoch": 0.9295371694891791, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.2714398664178174e-07, + "logits/chosen": 1456985565.8666666, + "logits/rejected": 1701836920.4705882, + "logps/chosen": -258.07216796875, + "logps/rejected": -429.65148207720586, + "loss": 0.1474, + "rewards/chosen": 1.470602035522461, + "rewards/margins": 8.807218102847829, + "rewards/rejected": -7.336616067325368, + "step": 2518 + }, + { + "epoch": 0.9299063264270223, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 1.2582858786789388e-07, + "logits/chosen": 1798400512.0, + "logits/rejected": 1807833907.2, + "logps/chosen": -315.8483479817708, + "logps/rejected": -484.545068359375, + "loss": 0.116, + "rewards/chosen": 1.0710015296936035, + "rewards/margins": 9.126564884185791, + "rewards/rejected": -8.055563354492188, + "step": 2519 + }, + { + "epoch": 0.9302754833648655, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.2451994219657203e-07, + "logits/chosen": 2366505233.0666666, + "logits/rejected": 1340504425.4117646, + "logps/chosen": -201.77483723958332, + "logps/rejected": -487.70398667279414, + "loss": 0.1653, + "rewards/chosen": 0.9505270640055339, + "rewards/margins": 9.149810439465092, + "rewards/rejected": -8.199283375459558, + "step": 2520 + }, + { + "epoch": 0.9306446403027087, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 1.2321805144092757e-07, + "logits/chosen": 1770084509.5384614, + "logits/rejected": 1847489805.4736843, + "logps/chosen": -353.6854717548077, + "logps/rejected": -482.71201685855266, + "loss": 0.1171, + "rewards/chosen": 1.2811222076416016, + "rewards/margins": 8.749951713963558, + "rewards/rejected": -7.468829506321957, + "step": 2521 + }, + { + "epoch": 0.9310137972405519, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.2192291740471373e-07, + "logits/chosen": 1636909192.5333333, + "logits/rejected": 1804170059.2941177, + "logps/chosen": -277.9048177083333, + "logps/rejected": -489.5487706801471, + "loss": 0.1201, + "rewards/chosen": 1.5350860595703124, + "rewards/margins": 9.652523085650277, + "rewards/rejected": -8.117437026079964, + "step": 2522 + }, + { + "epoch": 0.9313829541783951, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 1.206345418823235e-07, + "logits/chosen": 2090845184.0, + "logits/rejected": 1572433203.2, + "logps/chosen": -334.7008056640625, + "logps/rejected": -451.971728515625, + "loss": 0.0871, + "rewards/chosen": 1.763327916463216, + "rewards/margins": 8.984890111287434, + "rewards/rejected": -7.221562194824219, + "step": 2523 + }, + { + "epoch": 0.9317521111162383, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.1935292665878283e-07, + "logits/chosen": 1361155364.5714285, + "logits/rejected": 2022898574.2222223, + "logps/chosen": -288.69301060267856, + "logps/rejected": -503.23247612847223, + "loss": 0.1462, + "rewards/chosen": 1.199124881199428, + "rewards/margins": 9.56345898764474, + "rewards/rejected": -8.364334106445312, + "step": 2524 + }, + { + "epoch": 0.9321212680540815, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 1.1807807350975476e-07, + "logits/chosen": 2098331648.0, + "logits/rejected": 1543024469.3333333, + "logps/chosen": -287.42138671875, + "logps/rejected": -479.8856201171875, + "loss": 0.1546, + "rewards/chosen": 1.9276544570922851, + "rewards/margins": 9.545364570617675, + "rewards/rejected": -7.617710113525391, + "step": 2525 + }, + { + "epoch": 0.9324904249919247, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 1.1680998420153134e-07, + "logits/chosen": 1924691558.4, + "logits/rejected": 2112658492.235294, + "logps/chosen": -252.12130533854167, + "logps/rejected": -510.171875, + "loss": 0.1169, + "rewards/chosen": 1.5634759267171223, + "rewards/margins": 9.94785738926308, + "rewards/rejected": -8.384381462545957, + "step": 2526 + }, + { + "epoch": 0.9328595819297679, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 1.1554866049103497e-07, + "logits/chosen": 2393290605.714286, + "logits/rejected": 1742923889.7777777, + "logps/chosen": -280.51553780691967, + "logps/rejected": -437.5578884548611, + "loss": 0.1152, + "rewards/chosen": 1.9189919063023158, + "rewards/margins": 9.777308963593983, + "rewards/rejected": -7.858317057291667, + "step": 2527 + }, + { + "epoch": 0.933228738867611, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 1.1429410412581277e-07, + "logits/chosen": 1655471405.1764705, + "logits/rejected": 1756249019.7333333, + "logps/chosen": -256.87169692095586, + "logps/rejected": -490.3489583333333, + "loss": 0.1806, + "rewards/chosen": 1.1053916706758387, + "rewards/margins": 8.526234667908911, + "rewards/rejected": -7.4208429972330725, + "step": 2528 + }, + { + "epoch": 0.9335978958054543, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 1.1304631684403711e-07, + "logits/chosen": 2130591880.5333333, + "logits/rejected": 2208428995.7647057, + "logps/chosen": -348.8228515625, + "logps/rejected": -474.736328125, + "loss": 0.1532, + "rewards/chosen": 0.9391624450683593, + "rewards/margins": 9.745888339771943, + "rewards/rejected": -8.806725894703584, + "step": 2529 + }, + { + "epoch": 0.9339670527432975, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 1.1180530037450176e-07, + "logits/chosen": 1766804480.0, + "logits/rejected": 1969404757.3333333, + "logps/chosen": -254.828173828125, + "logps/rejected": -506.6650390625, + "loss": 0.1459, + "rewards/chosen": 1.6948097229003907, + "rewards/margins": 11.070079930623374, + "rewards/rejected": -9.375270207722982, + "step": 2530 + }, + { + "epoch": 0.9343362096811407, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 1.1057105643661803e-07, + "logits/chosen": 2055575079.3846154, + "logits/rejected": 1723575242.1052632, + "logps/chosen": -331.39073768028845, + "logps/rejected": -401.50765830592104, + "loss": 0.1285, + "rewards/chosen": 1.076309057382437, + "rewards/margins": 7.831121197596253, + "rewards/rejected": -6.754812140213816, + "step": 2531 + }, + { + "epoch": 0.9347053666189838, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.0934358674041634e-07, + "logits/chosen": 1905816576.0, + "logits/rejected": 1991768704.0, + "logps/chosen": -257.8365478515625, + "logps/rejected": -561.6373901367188, + "loss": 0.1856, + "rewards/chosen": 1.0542854070663452, + "rewards/margins": 10.408268332481384, + "rewards/rejected": -9.353982925415039, + "step": 2532 + }, + { + "epoch": 0.9350745235568271, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 1.0812289298654077e-07, + "logits/chosen": 1487080652.8, + "logits/rejected": 1641920658.2857144, + "logps/chosen": -318.2828515625, + "logps/rejected": -433.3341587611607, + "loss": 0.2497, + "rewards/chosen": 0.9791152954101563, + "rewards/margins": 8.380748421805245, + "rewards/rejected": -7.401633126395089, + "step": 2533 + }, + { + "epoch": 0.9354436804946703, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 1.0690897686624568e-07, + "logits/chosen": 1275749034.6666667, + "logits/rejected": 1612553035.2941177, + "logps/chosen": -239.42327473958332, + "logps/rejected": -480.9499942555147, + "loss": 0.1215, + "rewards/chosen": 2.0194554646809895, + "rewards/margins": 9.187910251991422, + "rewards/rejected": -7.168454787310432, + "step": 2534 + }, + { + "epoch": 0.9358128374325135, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 1.0570184006139683e-07, + "logits/chosen": 1392902997.3333333, + "logits/rejected": 1335155916.8, + "logps/chosen": -218.5135294596354, + "logps/rejected": -477.434765625, + "loss": 0.0951, + "rewards/chosen": 1.7266942660013835, + "rewards/margins": 10.997198263804117, + "rewards/rejected": -9.270503997802734, + "step": 2535 + }, + { + "epoch": 0.9361819943703567, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 1.0450148424446749e-07, + "logits/chosen": 1923279701.3333333, + "logits/rejected": 2095693184.0, + "logps/chosen": -310.11130777994794, + "logps/rejected": -438.5496826171875, + "loss": 0.1792, + "rewards/chosen": 1.5007623036702473, + "rewards/margins": 8.992669900258383, + "rewards/rejected": -7.491907596588135, + "step": 2536 + }, + { + "epoch": 0.9365511513081999, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 1.033079110785351e-07, + "logits/chosen": 2142950912.0, + "logits/rejected": 2574241792.0, + "logps/chosen": -225.27034505208334, + "logps/rejected": -549.134375, + "loss": 0.092, + "rewards/chosen": 1.4848049481709797, + "rewards/margins": 10.828808244069418, + "rewards/rejected": -9.344003295898437, + "step": 2537 + }, + { + "epoch": 0.9369203082460431, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 1.0212112221727966e-07, + "logits/chosen": 1654151955.6923077, + "logits/rejected": 1947492352.0, + "logps/chosen": -387.56798377403845, + "logps/rejected": -420.6376696134868, + "loss": 0.1145, + "rewards/chosen": 1.5096981342022235, + "rewards/margins": 8.482083741469904, + "rewards/rejected": -6.972385607267681, + "step": 2538 + }, + { + "epoch": 0.9372894651838863, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.0094111930498307e-07, + "logits/chosen": 2065852416.0, + "logits/rejected": 2753478656.0, + "logps/chosen": -261.72198893229165, + "logps/rejected": -461.05402688419116, + "loss": 0.1303, + "rewards/chosen": 1.5203502655029297, + "rewards/margins": 8.997437847361846, + "rewards/rejected": -7.477087581858916, + "step": 2539 + }, + { + "epoch": 0.9376586221217295, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.976790397652314e-08, + "logits/chosen": 1826333318.7368422, + "logits/rejected": 2151441171.6923075, + "logps/chosen": -291.96175986842104, + "logps/rejected": -479.61245492788464, + "loss": 0.1476, + "rewards/chosen": 1.547782998335989, + "rewards/margins": 10.400035665102816, + "rewards/rejected": -8.852252666766827, + "step": 2540 + }, + { + "epoch": 0.9380277790595727, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.860147785737573e-08, + "logits/chosen": 1511469875.2, + "logits/rejected": 1883208523.2941177, + "logps/chosen": -300.91575520833334, + "logps/rejected": -514.2014590992648, + "loss": 0.146, + "rewards/chosen": 1.210156504313151, + "rewards/margins": 10.154429611505247, + "rewards/rejected": -8.944273107192096, + "step": 2541 + }, + { + "epoch": 0.9383969359974159, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.744184256360923e-08, + "logits/chosen": 1369762019.5555556, + "logits/rejected": 1565086573.7142856, + "logps/chosen": -213.75836859809027, + "logps/rejected": -489.6382533482143, + "loss": 0.189, + "rewards/chosen": 0.966751840379503, + "rewards/margins": 8.0918853547838, + "rewards/rejected": -7.125133514404297, + "step": 2542 + }, + { + "epoch": 0.9387660929352591, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.628899970188343e-08, + "logits/chosen": 2131869900.8, + "logits/rejected": 2488811690.6666665, + "logps/chosen": -412.830712890625, + "logps/rejected": -508.7998046875, + "loss": 0.172, + "rewards/chosen": 1.2819436073303223, + "rewards/margins": 10.400216388702393, + "rewards/rejected": -9.11827278137207, + "step": 2543 + }, + { + "epoch": 0.9391352498731023, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.514295086944736e-08, + "logits/chosen": 1413098291.2, + "logits/rejected": 2124595882.6666667, + "logps/chosen": -247.0150390625, + "logps/rejected": -426.4667561848958, + "loss": 0.1779, + "rewards/chosen": 1.3144948959350586, + "rewards/margins": 8.740929094950358, + "rewards/rejected": -7.4264341990153, + "step": 2544 + }, + { + "epoch": 0.9395044068109455, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.400369765413752e-08, + "logits/chosen": 1910990848.0, + "logits/rejected": 1913262957.7142856, + "logps/chosen": -328.13330078125, + "logps/rejected": -532.3317173549107, + "loss": 0.1309, + "rewards/chosen": 1.5792726940578885, + "rewards/margins": 9.491488335624574, + "rewards/rejected": -7.912215641566685, + "step": 2545 + }, + { + "epoch": 0.9398735637487887, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.287124163437411e-08, + "logits/chosen": 2059308646.4, + "logits/rejected": 1940575171.764706, + "logps/chosen": -256.6640950520833, + "logps/rejected": -567.3671875, + "loss": 0.1397, + "rewards/chosen": 1.256694539388021, + "rewards/margins": 9.270928296855853, + "rewards/rejected": -8.014233757467832, + "step": 2546 + }, + { + "epoch": 0.940242720686632, + "grad_norm": 12.875, + "kl": 1.3931360244750977, + "learning_rate": 9.174558437916148e-08, + "logits/chosen": 1541696625.7777777, + "logits/rejected": 1533211648.0, + "logps/chosen": -311.4453396267361, + "logps/rejected": -377.94754464285717, + "loss": 0.1657, + "rewards/chosen": 1.6108074188232422, + "rewards/margins": 9.138286318097796, + "rewards/rejected": -7.527478899274554, + "step": 2547 + }, + { + "epoch": 0.9406118776244751, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.06267274480832e-08, + "logits/chosen": 1432841784.8888888, + "logits/rejected": 2087697554.2857144, + "logps/chosen": -338.2185872395833, + "logps/rejected": -645.5369698660714, + "loss": 0.1465, + "rewards/chosen": 1.6366589864095051, + "rewards/margins": 12.305591219947452, + "rewards/rejected": -10.668932233537946, + "step": 2548 + }, + { + "epoch": 0.9409810345623183, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 8.95146723913004e-08, + "logits/chosen": 1824053850.3529413, + "logits/rejected": 1836472320.0, + "logps/chosen": -228.89234834558823, + "logps/rejected": -435.22272135416665, + "loss": 0.112, + "rewards/chosen": 1.9370231628417969, + "rewards/margins": 9.02910130818685, + "rewards/rejected": -7.092078145345052, + "step": 2549 + }, + { + "epoch": 0.9413501915001615, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.840942074955172e-08, + "logits/chosen": 2111114808.8888888, + "logits/rejected": 1749867081.142857, + "logps/chosen": -352.1830783420139, + "logps/rejected": -406.04331752232144, + "loss": 0.2021, + "rewards/chosen": 1.0123392740885417, + "rewards/margins": 7.1670110793340776, + "rewards/rejected": -6.154671805245536, + "step": 2550 + }, + { + "epoch": 0.9417193484380048, + "grad_norm": 14.75, + "kl": 0.11972904205322266, + "learning_rate": 8.731097405415057e-08, + "logits/chosen": 1699465011.2, + "logits/rejected": 1523107157.3333333, + "logps/chosen": -353.474609375, + "logps/rejected": -559.5340576171875, + "loss": 0.2033, + "rewards/chosen": 1.1384614944458007, + "rewards/margins": 9.821895027160645, + "rewards/rejected": -8.683433532714844, + "step": 2551 + }, + { + "epoch": 0.9420885053758479, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 8.6219333826979e-08, + "logits/chosen": 1527176533.3333333, + "logits/rejected": 2202246927.0588236, + "logps/chosen": -220.58645833333333, + "logps/rejected": -578.8791934742648, + "loss": 0.129, + "rewards/chosen": 1.6816619873046874, + "rewards/margins": 12.399236162971048, + "rewards/rejected": -10.71757417566636, + "step": 2552 + }, + { + "epoch": 0.9424576623136911, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 8.513450158049109e-08, + "logits/chosen": 1810264495.1578948, + "logits/rejected": 2177487163.076923, + "logps/chosen": -309.8414884868421, + "logps/rejected": -533.9446364182693, + "loss": 0.1843, + "rewards/chosen": 1.3231558548776727, + "rewards/margins": 10.443307668091315, + "rewards/rejected": -9.120151813213642, + "step": 2553 + }, + { + "epoch": 0.9428268192515343, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.405647881770951e-08, + "logits/chosen": 2938827025.0666666, + "logits/rejected": 2381061662.117647, + "logps/chosen": -320.17565104166664, + "logps/rejected": -452.91041475183823, + "loss": 0.1634, + "rewards/chosen": 0.9656777699788411, + "rewards/margins": 7.556196945788813, + "rewards/rejected": -6.590519175809972, + "step": 2554 + }, + { + "epoch": 0.9431959761893776, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 8.298526703221899e-08, + "logits/chosen": 1634713127.3846154, + "logits/rejected": 1642049751.5789473, + "logps/chosen": -259.08266977163464, + "logps/rejected": -587.9998972039474, + "loss": 0.0862, + "rewards/chosen": 1.8026448763333833, + "rewards/margins": 12.643405759865455, + "rewards/rejected": -10.840760883532072, + "step": 2555 + }, + { + "epoch": 0.9435651331272207, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.192086770817176e-08, + "logits/chosen": 1771069878.857143, + "logits/rejected": 2153890929.7777777, + "logps/chosen": -227.49482073102678, + "logps/rejected": -355.4600423177083, + "loss": 0.1521, + "rewards/chosen": 0.7700471878051758, + "rewards/margins": 7.5454301834106445, + "rewards/rejected": -6.775382995605469, + "step": 2556 + }, + { + "epoch": 0.9439342900650639, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.086328232027874e-08, + "logits/chosen": 2023663488.0, + "logits/rejected": 2053683328.0, + "logps/chosen": -307.9173583984375, + "logps/rejected": -643.0717163085938, + "loss": 0.1196, + "rewards/chosen": 1.9159668684005737, + "rewards/margins": 11.8800448179245, + "rewards/rejected": -9.964077949523926, + "step": 2557 + }, + { + "epoch": 0.9443034470029071, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.98125123338106e-08, + "logits/chosen": 2018795304.4210527, + "logits/rejected": 2795018712.6153846, + "logps/chosen": -235.97350431743422, + "logps/rejected": -706.9846003605769, + "loss": 0.1346, + "rewards/chosen": 1.9335194637900905, + "rewards/margins": 13.597800281848986, + "rewards/rejected": -11.664280818058895, + "step": 2558 + }, + { + "epoch": 0.9446726039407504, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 7.876855920459613e-08, + "logits/chosen": 1629185462.857143, + "logits/rejected": 1502696334.2222223, + "logps/chosen": -277.49349539620533, + "logps/rejected": -442.98985460069446, + "loss": 0.0931, + "rewards/chosen": 1.735396248953683, + "rewards/margins": 9.443091256277901, + "rewards/rejected": -7.707695007324219, + "step": 2559 + }, + { + "epoch": 0.9450417608785935, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.773142437902003e-08, + "logits/chosen": 1579034916.5714285, + "logits/rejected": 1680590734.2222223, + "logps/chosen": -395.30433872767856, + "logps/rejected": -414.03184678819446, + "loss": 0.1076, + "rewards/chosen": 1.9498372759137834, + "rewards/margins": 8.216306595575242, + "rewards/rejected": -6.266469319661458, + "step": 2560 + }, + { + "epoch": 0.9454109178164367, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 7.670110929401786e-08, + "logits/chosen": 1693079825.0666666, + "logits/rejected": 1464196879.0588236, + "logps/chosen": -335.7431966145833, + "logps/rejected": -486.3245634191176, + "loss": 0.1523, + "rewards/chosen": 1.054626210530599, + "rewards/margins": 9.884863206452014, + "rewards/rejected": -8.830236995921416, + "step": 2561 + }, + { + "epoch": 0.9457800747542799, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 7.56776153770794e-08, + "logits/chosen": 1542693726.3157895, + "logits/rejected": 1509062183.3846154, + "logps/chosen": -288.7039730674342, + "logps/rejected": -539.7354642427885, + "loss": 0.1533, + "rewards/chosen": 1.6252046886243319, + "rewards/margins": 9.407065040186833, + "rewards/rejected": -7.7818603515625, + "step": 2562 + }, + { + "epoch": 0.9461492316921231, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 7.466094404624202e-08, + "logits/chosen": 1788852331.7894738, + "logits/rejected": 1179501016.6153846, + "logps/chosen": -329.45101768092104, + "logps/rejected": -469.28064903846155, + "loss": 0.19, + "rewards/chosen": 1.3554346184981496, + "rewards/margins": 9.35238990320368, + "rewards/rejected": -7.996955284705529, + "step": 2563 + }, + { + "epoch": 0.9465183886299663, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.365109671009119e-08, + "logits/chosen": 2063574766.9333334, + "logits/rejected": 1688554797.1764705, + "logps/chosen": -302.00572916666664, + "logps/rejected": -400.10857077205884, + "loss": 0.1116, + "rewards/chosen": 1.6736312866210938, + "rewards/margins": 9.767049183565026, + "rewards/rejected": -8.093417896943933, + "step": 2564 + }, + { + "epoch": 0.9468875455678095, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 7.26480747677566e-08, + "logits/chosen": 1892067328.0, + "logits/rejected": 3224146830.2222223, + "logps/chosen": -264.5581752232143, + "logps/rejected": -602.4556749131945, + "loss": 0.1116, + "rewards/chosen": 1.4038448333740234, + "rewards/margins": 12.375555886162651, + "rewards/rejected": -10.971711052788628, + "step": 2565 + }, + { + "epoch": 0.9472567025056527, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 7.165187960891274e-08, + "logits/chosen": 1727879753.142857, + "logits/rejected": 2068222634.6666667, + "logps/chosen": -260.98685128348217, + "logps/rejected": -409.5375162760417, + "loss": 0.0972, + "rewards/chosen": 1.975005831037249, + "rewards/margins": 10.104598726545062, + "rewards/rejected": -8.129592895507812, + "step": 2566 + }, + { + "epoch": 0.9476258594434959, + "grad_norm": 13.25, + "kl": 0.15419673919677734, + "learning_rate": 7.066251261377666e-08, + "logits/chosen": 2020072561.7777777, + "logits/rejected": 1695321673.142857, + "logps/chosen": -311.14100477430554, + "logps/rejected": -479.711669921875, + "loss": 0.165, + "rewards/chosen": 1.1587982177734375, + "rewards/margins": 9.725939614432198, + "rewards/rejected": -8.56714139665876, + "step": 2567 + }, + { + "epoch": 0.9479950163813391, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 6.967997515310188e-08, + "logits/chosen": 1568970240.0, + "logits/rejected": 1840730521.6, + "logps/chosen": -208.1700642903646, + "logps/rejected": -397.8926025390625, + "loss": 0.0662, + "rewards/chosen": 2.2880730628967285, + "rewards/margins": 9.469616794586182, + "rewards/rejected": -7.1815437316894535, + "step": 2568 + }, + { + "epoch": 0.9483641733191823, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 6.87042685881828e-08, + "logits/chosen": 2040003470.2222223, + "logits/rejected": 1584034084.5714285, + "logps/chosen": -247.56985134548611, + "logps/rejected": -405.7607421875, + "loss": 0.1704, + "rewards/chosen": 1.2864073647393122, + "rewards/margins": 7.523657556564089, + "rewards/rejected": -6.237250191824777, + "step": 2569 + }, + { + "epoch": 0.9487333302570256, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.773539427084808e-08, + "logits/chosen": 1597774370.1333334, + "logits/rejected": 1606479028.7058823, + "logps/chosen": -312.70807291666665, + "logps/rejected": -519.3124425551471, + "loss": 0.1507, + "rewards/chosen": 1.180313237508138, + "rewards/margins": 8.649679872101428, + "rewards/rejected": -7.469366634593291, + "step": 2570 + }, + { + "epoch": 0.9491024871948687, + "grad_norm": 12.1875, + "kl": 0.32794761657714844, + "learning_rate": 6.67733535434606e-08, + "logits/chosen": 1777683456.0, + "logits/rejected": 1931102663.1111112, + "logps/chosen": -260.6914585658482, + "logps/rejected": -429.93153211805554, + "loss": 0.1625, + "rewards/chosen": 0.8643599918910435, + "rewards/margins": 7.912648549155583, + "rewards/rejected": -7.04828855726454, + "step": 2571 + }, + { + "epoch": 0.9494716441327119, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 6.581814773891581e-08, + "logits/chosen": 1683326520.8888888, + "logits/rejected": 2262879378.285714, + "logps/chosen": -258.9841037326389, + "logps/rejected": -629.56396484375, + "loss": 0.1352, + "rewards/chosen": 1.5690246158175998, + "rewards/margins": 11.605118191431439, + "rewards/rejected": -10.036093575613839, + "step": 2572 + }, + { + "epoch": 0.9498408010705551, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 6.486977818063956e-08, + "logits/chosen": 1834793803.2941177, + "logits/rejected": 1417370419.2, + "logps/chosen": -241.0198471966912, + "logps/rejected": -456.2931315104167, + "loss": 0.1414, + "rewards/chosen": 1.6895588145536535, + "rewards/margins": 10.378303931741153, + "rewards/rejected": -8.6887451171875, + "step": 2573 + }, + { + "epoch": 0.9502099580083984, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 6.39282461825852e-08, + "logits/chosen": 2181310873.6, + "logits/rejected": 2294867365.647059, + "logps/chosen": -343.15856119791664, + "logps/rejected": -541.9382467830883, + "loss": 0.1465, + "rewards/chosen": 1.2811360677083334, + "rewards/margins": 9.834235756070006, + "rewards/rejected": -8.553099688361673, + "step": 2574 + }, + { + "epoch": 0.9505791149462415, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 6.299355304923372e-08, + "logits/chosen": 1258010770.2857144, + "logits/rejected": 1441509376.0, + "logps/chosen": -192.58065359933036, + "logps/rejected": -380.73046875, + "loss": 0.1583, + "rewards/chosen": 1.116128785269601, + "rewards/margins": 7.613388137211875, + "rewards/rejected": -6.497259351942274, + "step": 2575 + }, + { + "epoch": 0.9509482718840847, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.206570007559032e-08, + "logits/chosen": 1938304421.6470587, + "logits/rejected": 1630936541.8666666, + "logps/chosen": -285.4405158547794, + "logps/rejected": -446.9533203125, + "loss": 0.1243, + "rewards/chosen": 1.9617789773380054, + "rewards/margins": 10.399874070111442, + "rewards/rejected": -8.438095092773438, + "step": 2576 + }, + { + "epoch": 0.9513174288219279, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.114468854718337e-08, + "logits/chosen": 1450272426.6666667, + "logits/rejected": 1650161152.0, + "logps/chosen": -225.83329264322916, + "logps/rejected": -495.2220703125, + "loss": 0.0907, + "rewards/chosen": 1.7967476844787598, + "rewards/margins": 10.577870273590088, + "rewards/rejected": -8.781122589111328, + "step": 2577 + }, + { + "epoch": 0.9516865857597712, + "grad_norm": 13.3125, + "kl": 0.06491565704345703, + "learning_rate": 6.023051974006322e-08, + "logits/chosen": 2277463740.631579, + "logits/rejected": 2352675918.769231, + "logps/chosen": -296.1142578125, + "logps/rejected": -463.9853515625, + "loss": 0.1721, + "rewards/chosen": 1.1480492039730674, + "rewards/margins": 8.56243213854338, + "rewards/rejected": -7.4143829345703125, + "step": 2578 + }, + { + "epoch": 0.9520557426976143, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5.9323194920798966e-08, + "logits/chosen": 1930035370.6666667, + "logits/rejected": 1229006848.0, + "logps/chosen": -299.37624104817706, + "logps/rejected": -374.75457763671875, + "loss": 0.1917, + "rewards/chosen": 1.483151912689209, + "rewards/margins": 10.944077968597412, + "rewards/rejected": -9.460926055908203, + "step": 2579 + }, + { + "epoch": 0.9524248996354575, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.842271534647726e-08, + "logits/chosen": 1644508288.0, + "logits/rejected": 1730659840.0, + "logps/chosen": -254.65335083007812, + "logps/rejected": -541.283935546875, + "loss": 0.1476, + "rewards/chosen": 1.5185198783874512, + "rewards/margins": 11.41326093673706, + "rewards/rejected": -9.89474105834961, + "step": 2580 + }, + { + "epoch": 0.9527940565733007, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5.752908226470177e-08, + "logits/chosen": 1505097404.631579, + "logits/rejected": 1875300036.9230769, + "logps/chosen": -269.8198499177632, + "logps/rejected": -442.1379957932692, + "loss": 0.2168, + "rewards/chosen": 0.7332319962350946, + "rewards/margins": 7.435674458862799, + "rewards/rejected": -6.702442462627705, + "step": 2581 + }, + { + "epoch": 0.953163213511144, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5.6642296913589355e-08, + "logits/chosen": 1649214941.8666666, + "logits/rejected": 1442255088.9411764, + "logps/chosen": -255.12526041666666, + "logps/rejected": -438.6337890625, + "loss": 0.1258, + "rewards/chosen": 1.4881348927815756, + "rewards/margins": 10.29732822343415, + "rewards/rejected": -8.809193330652574, + "step": 2582 + }, + { + "epoch": 0.9535323704489871, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.576236052176942e-08, + "logits/chosen": 2521900800.0, + "logits/rejected": 2134689536.0, + "logps/chosen": -271.831787109375, + "logps/rejected": -457.86138916015625, + "loss": 0.1209, + "rewards/chosen": 1.9776852130889893, + "rewards/margins": 9.534847497940063, + "rewards/rejected": -7.557162284851074, + "step": 2583 + }, + { + "epoch": 0.9539015273868303, + "grad_norm": 13.1875, + "kl": 0.18362808227539062, + "learning_rate": 5.488927430838287e-08, + "logits/chosen": 2226562194.285714, + "logits/rejected": 1766394786.909091, + "logps/chosen": -301.29413132440476, + "logps/rejected": -401.3702503551136, + "loss": 0.1735, + "rewards/chosen": 1.667205083937872, + "rewards/margins": 8.526780528939648, + "rewards/rejected": -6.859575445001775, + "step": 2584 + }, + { + "epoch": 0.9542706843246735, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5.402303948308041e-08, + "logits/chosen": 1909885952.0, + "logits/rejected": 1465260416.0, + "logps/chosen": -193.8738250732422, + "logps/rejected": -517.5897827148438, + "loss": 0.1292, + "rewards/chosen": 1.6973270177841187, + "rewards/margins": 10.857688307762146, + "rewards/rejected": -9.160361289978027, + "step": 2585 + }, + { + "epoch": 0.9546398412625168, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.316365724601813e-08, + "logits/chosen": 1938788059.4285715, + "logits/rejected": 2022957966.2222223, + "logps/chosen": -270.40419224330356, + "logps/rejected": -703.94677734375, + "loss": 0.1062, + "rewards/chosen": 2.056065559387207, + "rewards/margins": 10.749602953592936, + "rewards/rejected": -8.693537394205729, + "step": 2586 + }, + { + "epoch": 0.9550089982003599, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.231112878785971e-08, + "logits/chosen": 1978594725.6470587, + "logits/rejected": 2907173410.133333, + "logps/chosen": -300.86833639705884, + "logps/rejected": -617.555859375, + "loss": 0.1658, + "rewards/chosen": 1.2594906302059399, + "rewards/margins": 11.003405425127815, + "rewards/rejected": -9.743914794921874, + "step": 2587 + }, + { + "epoch": 0.9553781551382031, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5.146545528977309e-08, + "logits/chosen": 1820717056.0, + "logits/rejected": 1914250069.3333333, + "logps/chosen": -324.4548095703125, + "logps/rejected": -463.0132649739583, + "loss": 0.2014, + "rewards/chosen": 1.0782430648803711, + "rewards/margins": 8.831969388326009, + "rewards/rejected": -7.753726323445638, + "step": 2588 + }, + { + "epoch": 0.9557473120760464, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 5.06266379234277e-08, + "logits/chosen": 1284500952.6153846, + "logits/rejected": 1449050327.5789473, + "logps/chosen": -334.4509089543269, + "logps/rejected": -418.44667454769734, + "loss": 0.1291, + "rewards/chosen": 1.2984820145827074, + "rewards/margins": 8.559037081143153, + "rewards/rejected": -7.260555066560444, + "step": 2589 + }, + { + "epoch": 0.9561164690138896, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 4.97946778509939e-08, + "logits/chosen": 1876324352.0, + "logits/rejected": 1540395349.3333333, + "logps/chosen": -346.4471958705357, + "logps/rejected": -495.4529079861111, + "loss": 0.1553, + "rewards/chosen": 1.019313062940325, + "rewards/margins": 9.15577156581576, + "rewards/rejected": -8.136458502875435, + "step": 2590 + }, + { + "epoch": 0.9564856259517327, + "grad_norm": 10.375, + "kl": 0.056313514709472656, + "learning_rate": 4.896957622514298e-08, + "logits/chosen": 2239911574.5882354, + "logits/rejected": 1793419264.0, + "logps/chosen": -192.75287224264707, + "logps/rejected": -609.1451822916666, + "loss": 0.1407, + "rewards/chosen": 1.866818820728975, + "rewards/margins": 11.065378391041476, + "rewards/rejected": -9.1985595703125, + "step": 2591 + }, + { + "epoch": 0.9568547828895759, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 4.815133418904106e-08, + "logits/chosen": 1958239914.6666667, + "logits/rejected": 1657248358.4, + "logps/chosen": -314.28369140625, + "logps/rejected": -545.77060546875, + "loss": 0.1091, + "rewards/chosen": 1.3566190401713054, + "rewards/margins": 11.5414075533549, + "rewards/rejected": -10.184788513183594, + "step": 2592 + }, + { + "epoch": 0.9572239398274192, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 4.733995287635351e-08, + "logits/chosen": 2221984714.105263, + "logits/rejected": 1729175867.0769231, + "logps/chosen": -268.23167660361844, + "logps/rejected": -409.8818359375, + "loss": 0.1958, + "rewards/chosen": 1.4883244163111637, + "rewards/margins": 7.784557373417534, + "rewards/rejected": -6.29623295710637, + "step": 2593 + }, + { + "epoch": 0.9575930967652624, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 4.653543341123834e-08, + "logits/chosen": 2364610373.818182, + "logits/rejected": 2493603840.0, + "logps/chosen": -234.4482421875, + "logps/rejected": -457.6305338541667, + "loss": 0.1369, + "rewards/chosen": 1.2747208855368874, + "rewards/margins": 8.305347455012333, + "rewards/rejected": -7.030626569475446, + "step": 2594 + }, + { + "epoch": 0.9579622537031055, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 4.573777690834669e-08, + "logits/chosen": 1302723242.6666667, + "logits/rejected": 1511790110.1176472, + "logps/chosen": -261.54938151041665, + "logps/rejected": -397.1268669577206, + "loss": 0.1406, + "rewards/chosen": 1.3242529551188151, + "rewards/margins": 8.170537260466931, + "rewards/rejected": -6.8462843053481155, + "step": 2595 + }, + { + "epoch": 0.9583314106409487, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 4.494698447282231e-08, + "logits/chosen": 1886937770.6666667, + "logits/rejected": 1643133907.4782608, + "logps/chosen": -252.64203559027777, + "logps/rejected": -457.51596467391306, + "loss": 0.0812, + "rewards/chosen": 1.3005761040581598, + "rewards/margins": 9.520416775763323, + "rewards/rejected": -8.219840671705162, + "step": 2596 + }, + { + "epoch": 0.958700567578792, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.4163057200297674e-08, + "logits/chosen": 1308979200.0, + "logits/rejected": 1264703247.0588236, + "logps/chosen": -250.27477213541667, + "logps/rejected": -386.17888327205884, + "loss": 0.1753, + "rewards/chosen": 1.322675069173177, + "rewards/margins": 6.609098696241191, + "rewards/rejected": -5.2864236270680145, + "step": 2597 + }, + { + "epoch": 0.9590697245166351, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.338599617689343e-08, + "logits/chosen": 1981166478.2222223, + "logits/rejected": 1733108589.7142856, + "logps/chosen": -272.36382378472223, + "logps/rejected": -398.13204520089283, + "loss": 0.1624, + "rewards/chosen": 1.1220000584920247, + "rewards/margins": 8.132591837928409, + "rewards/rejected": -7.010591779436384, + "step": 2598 + }, + { + "epoch": 0.9594388814544783, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 4.261580247921893e-08, + "logits/chosen": 2164381223.3846154, + "logits/rejected": 1260814443.7894738, + "logps/chosen": -247.85345928485577, + "logps/rejected": -467.578125, + "loss": 0.1237, + "rewards/chosen": 1.28230593754695, + "rewards/margins": 9.780947102226225, + "rewards/rejected": -8.498641164679276, + "step": 2599 + }, + { + "epoch": 0.9598080383923215, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.1852477174367244e-08, + "logits/chosen": 3258916317.866667, + "logits/rejected": 2417200911.0588236, + "logps/chosen": -319.1265625, + "logps/rejected": -423.6343347886029, + "loss": 0.1539, + "rewards/chosen": 1.0525835673014323, + "rewards/margins": 9.187650777779375, + "rewards/rejected": -8.135067210477942, + "step": 2600 + }, + { + "epoch": 0.9601771953301648, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 4.109602131991519e-08, + "logits/chosen": 1821630464.0, + "logits/rejected": 2233323520.0, + "logps/chosen": -307.34498355263156, + "logps/rejected": -507.43960336538464, + "loss": 0.1835, + "rewards/chosen": 1.2244832892166941, + "rewards/margins": 9.524171775169219, + "rewards/rejected": -8.299688485952524, + "step": 2601 + }, + { + "epoch": 0.9605463522680079, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 4.0346435963923844e-08, + "logits/chosen": 1923448645.8181818, + "logits/rejected": 1946979532.8, + "logps/chosen": -310.74454012784093, + "logps/rejected": -391.535986328125, + "loss": 0.1914, + "rewards/chosen": 1.3787077990445225, + "rewards/margins": 8.494410618868741, + "rewards/rejected": -7.115702819824219, + "step": 2602 + }, + { + "epoch": 0.9609155092058511, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 3.960372214493358e-08, + "logits/chosen": 2562951955.6923075, + "logits/rejected": 1948021167.1578948, + "logps/chosen": -287.1371882512019, + "logps/rejected": -469.7184416118421, + "loss": 0.1129, + "rewards/chosen": 1.6267418494591346, + "rewards/margins": 9.073593077871964, + "rewards/rejected": -7.446851228412829, + "step": 2603 + }, + { + "epoch": 0.9612846661436943, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 3.8867880891965136e-08, + "logits/chosen": 1535475029.3333333, + "logits/rejected": 1582386176.0, + "logps/chosen": -242.54315863715277, + "logps/rejected": -480.90108816964283, + "loss": 0.216, + "rewards/chosen": 0.8573832511901855, + "rewards/margins": 8.362820829663958, + "rewards/rejected": -7.505437578473773, + "step": 2604 + }, + { + "epoch": 0.9616538230815376, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 3.8138913224516906e-08, + "logits/chosen": 1675268437.3333333, + "logits/rejected": 1573161837.7142856, + "logps/chosen": -309.10923936631946, + "logps/rejected": -491.19949776785717, + "loss": 0.1713, + "rewards/chosen": 0.9838287565443251, + "rewards/margins": 9.206944405086457, + "rewards/rejected": -8.223115648542132, + "step": 2605 + }, + { + "epoch": 0.9620229800193807, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 3.741682015256487e-08, + "logits/chosen": 2436583716.571429, + "logits/rejected": 1994525809.7777777, + "logps/chosen": -216.73922293526786, + "logps/rejected": -693.9239366319445, + "loss": 0.114, + "rewards/chosen": 1.665088108607701, + "rewards/margins": 11.177080669100324, + "rewards/rejected": -9.511992560492622, + "step": 2606 + }, + { + "epoch": 0.9623921369572239, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 3.6701602676559314e-08, + "logits/chosen": 1413752234.6666667, + "logits/rejected": 1463424614.4, + "logps/chosen": -286.9320882161458, + "logps/rejected": -445.619189453125, + "loss": 0.0907, + "rewards/chosen": 2.0658241907755532, + "rewards/margins": 10.26053320566813, + "rewards/rejected": -8.194709014892577, + "step": 2607 + }, + { + "epoch": 0.9627612938950671, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 3.599326178742535e-08, + "logits/chosen": 1395727837.8666666, + "logits/rejected": 1342961784.4705882, + "logps/chosen": -304.5376953125, + "logps/rejected": -353.1338752297794, + "loss": 0.1371, + "rewards/chosen": 1.5590035756429037, + "rewards/margins": 9.277758961097867, + "rewards/rejected": -7.718755385454963, + "step": 2608 + }, + { + "epoch": 0.9631304508329104, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 3.5291798466560165e-08, + "logits/chosen": 1472152017.4545455, + "logits/rejected": 1646706492.952381, + "logps/chosen": -317.85855379971593, + "logps/rejected": -403.8603980654762, + "loss": 0.1025, + "rewards/chosen": 1.421070012179288, + "rewards/margins": 8.802701045940449, + "rewards/rejected": -7.381631033761161, + "step": 2609 + }, + { + "epoch": 0.9634996077707535, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 3.459721368583191e-08, + "logits/chosen": 1819350220.8, + "logits/rejected": 1550549504.0, + "logps/chosen": -226.1052001953125, + "logps/rejected": -362.9988606770833, + "loss": 0.1828, + "rewards/chosen": 1.4939940452575684, + "rewards/margins": 8.190814622243245, + "rewards/rejected": -6.696820576985677, + "step": 2610 + }, + { + "epoch": 0.9638687647085967, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 3.3909508407579674e-08, + "logits/chosen": 1390258416.9411764, + "logits/rejected": 1670854519.4666667, + "logps/chosen": -257.0101102941176, + "logps/rejected": -437.2649739583333, + "loss": 0.1598, + "rewards/chosen": 1.2128321703742533, + "rewards/margins": 8.797477759566961, + "rewards/rejected": -7.584645589192708, + "step": 2611 + }, + { + "epoch": 0.96423792164644, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 3.322868358460962e-08, + "logits/chosen": 1535952164.5714285, + "logits/rejected": 1337678279.1111112, + "logps/chosen": -286.85567801339283, + "logps/rejected": -475.0125325520833, + "loss": 0.1179, + "rewards/chosen": 1.5023144313267298, + "rewards/margins": 9.987517644488621, + "rewards/rejected": -8.485203213161892, + "step": 2612 + }, + { + "epoch": 0.9646070785842832, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 3.255474016019666e-08, + "logits/chosen": 1368710348.8, + "logits/rejected": 2472140559.0588236, + "logps/chosen": -275.36201171875, + "logps/rejected": -384.10788143382354, + "loss": 0.1229, + "rewards/chosen": 2.2545748392740887, + "rewards/margins": 8.438255654129328, + "rewards/rejected": -6.183680814855239, + "step": 2613 + }, + { + "epoch": 0.9649762355221263, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 3.188767906807999e-08, + "logits/chosen": 1508425113.6, + "logits/rejected": 1791599616.0, + "logps/chosen": -300.82607421875, + "logps/rejected": -504.9453938802083, + "loss": 0.2012, + "rewards/chosen": 0.9330973625183105, + "rewards/margins": 9.01188866297404, + "rewards/rejected": -8.078791300455729, + "step": 2614 + }, + { + "epoch": 0.9653453924599695, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 3.122750123246532e-08, + "logits/chosen": 1842151936.0, + "logits/rejected": 1365436672.0, + "logps/chosen": -306.0777282714844, + "logps/rejected": -482.5491943359375, + "loss": 0.1159, + "rewards/chosen": 1.801967978477478, + "rewards/margins": 9.728594183921814, + "rewards/rejected": -7.926626205444336, + "step": 2615 + }, + { + "epoch": 0.9657145493978128, + "grad_norm": 11.375, + "kl": 0.039913177490234375, + "learning_rate": 3.0574207568019874e-08, + "logits/chosen": 1825193053.090909, + "logits/rejected": 2528812470.857143, + "logps/chosen": -390.6629083806818, + "logps/rejected": -547.9645647321429, + "loss": 0.0832, + "rewards/chosen": 2.1370889490300957, + "rewards/margins": 9.757097871788652, + "rewards/rejected": -7.620008922758556, + "step": 2616 + }, + { + "epoch": 0.966083706335656, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 2.992779897987408e-08, + "logits/chosen": 1734413372.235294, + "logits/rejected": 2016901256.5333333, + "logps/chosen": -360.52424172794116, + "logps/rejected": -500.03082682291665, + "loss": 0.1572, + "rewards/chosen": 1.2897745020249312, + "rewards/margins": 9.893991014069202, + "rewards/rejected": -8.604216512044271, + "step": 2617 + }, + { + "epoch": 0.9664528632734991, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 2.9288276363618194e-08, + "logits/chosen": 2458727424.0, + "logits/rejected": 2377813760.0, + "logps/chosen": -224.68431091308594, + "logps/rejected": -550.4782104492188, + "loss": 0.1219, + "rewards/chosen": 1.8457499742507935, + "rewards/margins": 10.597391247749329, + "rewards/rejected": -8.751641273498535, + "step": 2618 + }, + { + "epoch": 0.9668220202113423, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 2.865564060530346e-08, + "logits/chosen": 1939987516.235294, + "logits/rejected": 2430704571.733333, + "logps/chosen": -347.18230124080884, + "logps/rejected": -517.886328125, + "loss": 0.1548, + "rewards/chosen": 1.53491771922392, + "rewards/margins": 8.545411697088504, + "rewards/rejected": -7.010493977864583, + "step": 2619 + }, + { + "epoch": 0.9671911771491856, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 2.802989258143818e-08, + "logits/chosen": 1644992000.0, + "logits/rejected": 1253656576.0, + "logps/chosen": -244.96232096354166, + "logps/rejected": -394.7832763671875, + "loss": 0.1048, + "rewards/chosen": 1.8006820678710938, + "rewards/margins": 8.554643249511718, + "rewards/rejected": -6.753961181640625, + "step": 2620 + }, + { + "epoch": 0.9675603340870288, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 2.74110331589883e-08, + "logits/chosen": 1867367046.7368422, + "logits/rejected": 1995189641.8461537, + "logps/chosen": -279.59097450657896, + "logps/rejected": -500.33875450721155, + "loss": 0.1509, + "rewards/chosen": 1.5906761570980674, + "rewards/margins": 9.385554016360388, + "rewards/rejected": -7.79487785926232, + "step": 2621 + }, + { + "epoch": 0.9679294910248719, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 2.6799063195376286e-08, + "logits/chosen": 1831612175.0588236, + "logits/rejected": 1423054984.5333333, + "logps/chosen": -346.3326056985294, + "logps/rejected": -474.46848958333334, + "loss": 0.1209, + "rewards/chosen": 1.8067186018999886, + "rewards/margins": 9.832811131196863, + "rewards/rejected": -8.026092529296875, + "step": 2622 + }, + { + "epoch": 0.9682986479627151, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 2.619398353847835e-08, + "logits/chosen": 2137323008.0, + "logits/rejected": 1784953446.4, + "logps/chosen": -280.8288167317708, + "logps/rejected": -528.04443359375, + "loss": 0.1376, + "rewards/chosen": 0.8169922828674316, + "rewards/margins": 9.577966403961181, + "rewards/rejected": -8.76097412109375, + "step": 2623 + }, + { + "epoch": 0.9686678049005584, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 2.559579502662446e-08, + "logits/chosen": 1827450880.0, + "logits/rejected": 2320744675.5555553, + "logps/chosen": -284.96974400111606, + "logps/rejected": -504.5865885416667, + "loss": 0.1335, + "rewards/chosen": 1.3970819200788225, + "rewards/margins": 11.117726553054082, + "rewards/rejected": -9.72064463297526, + "step": 2624 + }, + { + "epoch": 0.9690369618384016, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 2.500449848859776e-08, + "logits/chosen": 1472199443.6923077, + "logits/rejected": 1851681522.5263157, + "logps/chosen": -278.1296198918269, + "logps/rejected": -452.86024876644734, + "loss": 0.124, + "rewards/chosen": 1.1758123544546275, + "rewards/margins": 8.97431683250767, + "rewards/rejected": -7.798504478053043, + "step": 2625 + }, + { + "epoch": 0.9694061187762447, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 2.4420094743631274e-08, + "logits/chosen": 2382054912.0, + "logits/rejected": 1958439936.0, + "logps/chosen": -364.486572265625, + "logps/rejected": -537.1291015625, + "loss": 0.1343, + "rewards/chosen": 1.3255845705668132, + "rewards/margins": 8.934990851084391, + "rewards/rejected": -7.609406280517578, + "step": 2626 + }, + { + "epoch": 0.969775275714088, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 2.3842584601409536e-08, + "logits/chosen": 2152045909.3333335, + "logits/rejected": 1958252544.0, + "logps/chosen": -301.67109375, + "logps/rejected": -533.3137063419117, + "loss": 0.1514, + "rewards/chosen": 1.1435000101725261, + "rewards/margins": 9.0719362894694, + "rewards/rejected": -7.928436279296875, + "step": 2627 + }, + { + "epoch": 0.9701444326519312, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 2.3271968862065285e-08, + "logits/chosen": 1763163022.2222223, + "logits/rejected": 2015581330.2857144, + "logps/chosen": -248.73763020833334, + "logps/rejected": -452.88316127232144, + "loss": 0.159, + "rewards/chosen": 1.3272556728786893, + "rewards/margins": 8.874496338859437, + "rewards/rejected": -7.547240665980747, + "step": 2628 + }, + { + "epoch": 0.9705135895897744, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 2.270824831617946e-08, + "logits/chosen": 2726475776.0, + "logits/rejected": 1620089856.0, + "logps/chosen": -321.0384928385417, + "logps/rejected": -414.1280517578125, + "loss": 0.2111, + "rewards/chosen": 1.558776060740153, + "rewards/margins": 8.27879540125529, + "rewards/rejected": -6.720019340515137, + "step": 2629 + }, + { + "epoch": 0.9708827465276175, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 2.2151423744780076e-08, + "logits/chosen": 1747046400.0, + "logits/rejected": 1980631355.0769231, + "logps/chosen": -237.33138877467104, + "logps/rejected": -670.0558894230769, + "loss": 0.1805, + "rewards/chosen": 1.1920832583778782, + "rewards/margins": 23.310608836803357, + "rewards/rejected": -22.11852557842548, + "step": 2630 + }, + { + "epoch": 0.9712519034654608, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 2.1601495919340022e-08, + "logits/chosen": 2243775218.5263157, + "logits/rejected": 1314905796.9230769, + "logps/chosen": -307.40506784539474, + "logps/rejected": -412.3331956129808, + "loss": 0.1896, + "rewards/chosen": 0.9832369151868319, + "rewards/margins": 8.368274264007445, + "rewards/rejected": -7.385037348820613, + "step": 2631 + }, + { + "epoch": 0.971621060403304, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.10584656017776e-08, + "logits/chosen": 1650911104.0, + "logits/rejected": 1963877504.0, + "logps/chosen": -281.9264831542969, + "logps/rejected": -525.0164794921875, + "loss": 0.1722, + "rewards/chosen": 1.0156636238098145, + "rewards/margins": 9.560764789581299, + "rewards/rejected": -8.545101165771484, + "step": 2632 + }, + { + "epoch": 0.9719902173411471, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 2.0522333544453764e-08, + "logits/chosen": 1495891797.3333333, + "logits/rejected": 1484200755.2, + "logps/chosen": -280.2703043619792, + "logps/rejected": -483.12705078125, + "loss": 0.1113, + "rewards/chosen": 1.2312882741292317, + "rewards/margins": 8.851352055867514, + "rewards/rejected": -7.620063781738281, + "step": 2633 + }, + { + "epoch": 0.9723593742789903, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 1.999310049017378e-08, + "logits/chosen": 1654931683.5555556, + "logits/rejected": 2114699410.2857144, + "logps/chosen": -269.5661892361111, + "logps/rejected": -484.56424386160717, + "loss": 0.1468, + "rewards/chosen": 1.672633171081543, + "rewards/margins": 8.15966102055141, + "rewards/rejected": -6.487027849469866, + "step": 2634 + }, + { + "epoch": 0.9727285312168336, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 1.9470767172182215e-08, + "logits/chosen": 1560071296.0, + "logits/rejected": 1367682304.0, + "logps/chosen": -274.9316101074219, + "logps/rejected": -563.2467651367188, + "loss": 0.1638, + "rewards/chosen": 1.4244953393936157, + "rewards/margins": 10.428736329078674, + "rewards/rejected": -9.004240989685059, + "step": 2635 + }, + { + "epoch": 0.9730976881546768, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.8955334314166298e-08, + "logits/chosen": 2058994145.8823528, + "logits/rejected": 1578515933.8666666, + "logps/chosen": -261.0875459558824, + "logps/rejected": -485.86279296875, + "loss": 0.1996, + "rewards/chosen": 0.7400466133566463, + "rewards/margins": 8.207517926833209, + "rewards/rejected": -7.467471313476563, + "step": 2636 + }, + { + "epoch": 0.9734668450925199, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.844680263025089e-08, + "logits/chosen": 1705008640.0, + "logits/rejected": 1734523136.0, + "logps/chosen": -226.51441955566406, + "logps/rejected": -449.78448486328125, + "loss": 0.1513, + "rewards/chosen": 1.1361011266708374, + "rewards/margins": 8.847718119621277, + "rewards/rejected": -7.7116169929504395, + "step": 2637 + }, + { + "epoch": 0.9738360020303631, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 1.794517282500019e-08, + "logits/chosen": 2088509732.5714285, + "logits/rejected": 2336252814.2222223, + "logps/chosen": -252.77054268973214, + "logps/rejected": -323.96375868055554, + "loss": 0.1346, + "rewards/chosen": 2.029412405831473, + "rewards/margins": 7.912230415949746, + "rewards/rejected": -5.882818010118273, + "step": 2638 + }, + { + "epoch": 0.9742051589682064, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 1.7450445593416576e-08, + "logits/chosen": 2185944320.0, + "logits/rejected": 2206056704.0, + "logps/chosen": -225.60382080078125, + "logps/rejected": -478.12579345703125, + "loss": 0.1554, + "rewards/chosen": 1.248208999633789, + "rewards/margins": 9.546585083007812, + "rewards/rejected": -8.298376083374023, + "step": 2639 + }, + { + "epoch": 0.9745743159060496, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.6962621620937314e-08, + "logits/chosen": 2201201900.3076925, + "logits/rejected": 1784664926.3157895, + "logps/chosen": -253.3258338341346, + "logps/rejected": -444.76778371710526, + "loss": 0.1253, + "rewards/chosen": 1.351659628061148, + "rewards/margins": 9.936325026909833, + "rewards/rejected": -8.584665398848685, + "step": 2640 + }, + { + "epoch": 0.9749434728438927, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 1.648170158343787e-08, + "logits/chosen": 2022113962.6666667, + "logits/rejected": 1523445278.1176472, + "logps/chosen": -299.5767578125, + "logps/rejected": -447.27062270220586, + "loss": 0.1232, + "rewards/chosen": 1.6004016876220704, + "rewards/margins": 8.530509701897117, + "rewards/rejected": -6.930108014275046, + "step": 2641 + }, + { + "epoch": 0.9753126297817359, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 1.6007686147225254e-08, + "logits/chosen": 2309815374.769231, + "logits/rejected": 1845917157.0526316, + "logps/chosen": -277.0148737980769, + "logps/rejected": -551.6919716282895, + "loss": 0.11, + "rewards/chosen": 1.5351239717923677, + "rewards/margins": 10.233563720456019, + "rewards/rejected": -8.69843974866365, + "step": 2642 + }, + { + "epoch": 0.9756817867195792, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 1.554057596904246e-08, + "logits/chosen": 2327484160.0, + "logits/rejected": 1965841152.0, + "logps/chosen": -307.0625, + "logps/rejected": -518.2608032226562, + "loss": 0.1231, + "rewards/chosen": 1.7959740161895752, + "rewards/margins": 10.358821153640747, + "rewards/rejected": -8.562847137451172, + "step": 2643 + }, + { + "epoch": 0.9760509436574224, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 1.5080371696065133e-08, + "logits/chosen": 2540897536.0, + "logits/rejected": 1372389888.0, + "logps/chosen": -323.7621765136719, + "logps/rejected": -560.822021484375, + "loss": 0.1519, + "rewards/chosen": 1.062551498413086, + "rewards/margins": 11.977798461914062, + "rewards/rejected": -10.915246963500977, + "step": 2644 + }, + { + "epoch": 0.9764201005952655, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 1.4627073965899907e-08, + "logits/chosen": 2110496540.4444444, + "logits/rejected": 2137640228.5714285, + "logps/chosen": -227.42333984375, + "logps/rejected": -517.8638741629464, + "loss": 0.1119, + "rewards/chosen": 2.3989196353488498, + "rewards/margins": 11.400808455452086, + "rewards/rejected": -9.001888820103236, + "step": 2645 + }, + { + "epoch": 0.9767892575331087, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 1.4180683406584961e-08, + "logits/chosen": 1587648219.4285715, + "logits/rejected": 2111734877.090909, + "logps/chosen": -266.5053245907738, + "logps/rejected": -505.23495205965907, + "loss": 0.1985, + "rewards/chosen": 1.5835584004720051, + "rewards/margins": 9.29180619210908, + "rewards/rejected": -7.708247791637074, + "step": 2646 + }, + { + "epoch": 0.977158414470952, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 1.3741200636589457e-08, + "logits/chosen": 1646489344.0, + "logits/rejected": 1341440640.0, + "logps/chosen": -261.611328125, + "logps/rejected": -442.82879638671875, + "loss": 0.1194, + "rewards/chosen": 1.6819113492965698, + "rewards/margins": 8.258711695671082, + "rewards/rejected": -6.576800346374512, + "step": 2647 + }, + { + "epoch": 0.9775275714087952, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 1.330862626481022e-08, + "logits/chosen": 1189842176.0, + "logits/rejected": 1490862848.0, + "logps/chosen": -261.2923278808594, + "logps/rejected": -459.8263854980469, + "loss": 0.1153, + "rewards/chosen": 1.9765797853469849, + "rewards/margins": 9.448445916175842, + "rewards/rejected": -7.471866130828857, + "step": 2648 + }, + { + "epoch": 0.9778967283466383, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 1.2882960890573947e-08, + "logits/chosen": 2074327160.4705882, + "logits/rejected": 2861602406.4, + "logps/chosen": -262.07028377757354, + "logps/rejected": -612.8923828125, + "loss": 0.1647, + "rewards/chosen": 1.1686758153578813, + "rewards/margins": 9.753413974537569, + "rewards/rejected": -8.584738159179688, + "step": 2649 + }, + { + "epoch": 0.9782658852844816, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 1.2464205103634996e-08, + "logits/chosen": 1314951261.090909, + "logits/rejected": 1209194593.5238094, + "logps/chosen": -303.07958984375, + "logps/rejected": -376.06031436011904, + "loss": 0.1123, + "rewards/chosen": 1.3956180052323774, + "rewards/margins": 8.326346735933642, + "rewards/rejected": -6.930728730701265, + "step": 2650 + }, + { + "epoch": 0.9786350422223248, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 1.2052359484173715e-08, + "logits/chosen": 1645075275.2941177, + "logits/rejected": 1680284194.1333334, + "logps/chosen": -319.3803136488971, + "logps/rejected": -453.1979166666667, + "loss": 0.1715, + "rewards/chosen": 1.2471453722785502, + "rewards/margins": 9.005369859583238, + "rewards/rejected": -7.7582244873046875, + "step": 2651 + }, + { + "epoch": 0.979004199160168, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 1.1647424602797553e-08, + "logits/chosen": 1937154304.0, + "logits/rejected": 1653330176.0, + "logps/chosen": -318.55755615234375, + "logps/rejected": -523.475341796875, + "loss": 0.1586, + "rewards/chosen": 0.9719578623771667, + "rewards/margins": 9.159582793712616, + "rewards/rejected": -8.18762493133545, + "step": 2652 + }, + { + "epoch": 0.9793733560980111, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 1.124940102053773e-08, + "logits/chosen": 1641037596.4444444, + "logits/rejected": 1900801462.857143, + "logps/chosen": -307.957763671875, + "logps/rejected": -364.6156529017857, + "loss": 0.1944, + "rewards/chosen": 0.9061891767713759, + "rewards/margins": 7.5683165353441995, + "rewards/rejected": -6.662127358572824, + "step": 2653 + }, + { + "epoch": 0.9797425130358544, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 1.0858289288851465e-08, + "logits/chosen": 1573117952.0, + "logits/rejected": 1573593292.8, + "logps/chosen": -259.98506303267044, + "logps/rejected": -541.394775390625, + "loss": 0.2076, + "rewards/chosen": 1.10092492537065, + "rewards/margins": 12.050303892655807, + "rewards/rejected": -10.949378967285156, + "step": 2654 + }, + { + "epoch": 0.9801116699736976, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 1.0474089949619182e-08, + "logits/chosen": 3336256625.7777777, + "logits/rejected": 1798828032.0, + "logps/chosen": -239.07161458333334, + "logps/rejected": -563.9407269021739, + "loss": 0.0787, + "rewards/chosen": 2.3256007300482855, + "rewards/margins": 10.107136003061193, + "rewards/rejected": -7.781535273012907, + "step": 2655 + }, + { + "epoch": 0.9804808269115408, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 1.0096803535143972e-08, + "logits/chosen": 1674323968.0, + "logits/rejected": 1558389321.142857, + "logps/chosen": -297.48130967881946, + "logps/rejected": -557.5966099330357, + "loss": 0.1815, + "rewards/chosen": 1.0775518417358398, + "rewards/margins": 10.538809776306152, + "rewards/rejected": -9.461257934570312, + "step": 2656 + }, + { + "epoch": 0.9808499838493839, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.726430568151036e-09, + "logits/chosen": 2335055616.0, + "logits/rejected": 1673556736.0, + "logps/chosen": -285.3436279296875, + "logps/rejected": -393.3205261230469, + "loss": 0.1497, + "rewards/chosen": 1.4604202508926392, + "rewards/margins": 7.453709244728088, + "rewards/rejected": -5.993288993835449, + "step": 2657 + }, + { + "epoch": 0.9812191407872272, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.36297156178767e-09, + "logits/chosen": 2158674566.736842, + "logits/rejected": 1284646596.9230769, + "logps/chosen": -282.7418277138158, + "logps/rejected": -364.0471379206731, + "loss": 0.1664, + "rewards/chosen": 1.438017393413343, + "rewards/margins": 8.008147961697597, + "rewards/rejected": -6.570130568284255, + "step": 2658 + }, + { + "epoch": 0.9815882977250704, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.006427019622177e-09, + "logits/chosen": 1410502053.6470587, + "logits/rejected": 1703066146.1333334, + "logps/chosen": -264.65969669117646, + "logps/rejected": -502.78404947916664, + "loss": 0.1406, + "rewards/chosen": 1.6921447305118336, + "rewards/margins": 10.31669001859777, + "rewards/rejected": -8.624545288085937, + "step": 2659 + }, + { + "epoch": 0.9819574546629136, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 8.656797435642183e-09, + "logits/chosen": 1442824564.3636363, + "logits/rejected": 1468508160.0, + "logps/chosen": -250.8143643465909, + "logps/rejected": -518.733837890625, + "loss": 0.2099, + "rewards/chosen": 1.214498433199796, + "rewards/margins": 11.321165761080657, + "rewards/rejected": -10.10666732788086, + "step": 2660 + }, + { + "epoch": 0.9823266116007567, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.31408329425465e-09, + "logits/chosen": 2246115328.0, + "logits/rejected": 2174129664.0, + "logps/chosen": -427.81298828125, + "logps/rejected": -519.18603515625, + "loss": 0.1482, + "rewards/chosen": 1.388380527496338, + "rewards/margins": 10.373939990997314, + "rewards/rejected": -8.985559463500977, + "step": 2661 + }, + { + "epoch": 0.9826957685386, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.978285070286419e-09, + "logits/chosen": 2477167872.0, + "logits/rejected": 2014711936.0, + "logps/chosen": -346.04803466796875, + "logps/rejected": -485.93994140625, + "loss": 0.1234, + "rewards/chosen": 1.4438722133636475, + "rewards/margins": 9.263353109359741, + "rewards/rejected": -7.819480895996094, + "step": 2662 + }, + { + "epoch": 0.9830649254764432, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 7.649403228980889e-09, + "logits/chosen": 1547490076.4444444, + "logits/rejected": 1779159186.2857144, + "logps/chosen": -274.3186306423611, + "logps/rejected": -424.88023158482144, + "loss": 0.1617, + "rewards/chosen": 1.2504789564344618, + "rewards/margins": 9.310117388528491, + "rewards/rejected": -8.05963843209403, + "step": 2663 + }, + { + "epoch": 0.9834340824142864, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.327438226000239e-09, + "logits/chosen": 2052104021.3333333, + "logits/rejected": 2110752768.0, + "logps/chosen": -351.1277669270833, + "logps/rejected": -468.2748046875, + "loss": 0.1148, + "rewards/chosen": 1.5170919100443523, + "rewards/margins": 9.06398827234904, + "rewards/rejected": -7.546896362304688, + "step": 2664 + }, + { + "epoch": 0.9838032393521295, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 7.01239050742264e-09, + "logits/chosen": 1769215268.5714285, + "logits/rejected": 1960470528.0, + "logps/chosen": -208.28043038504464, + "logps/rejected": -487.96875, + "loss": 0.1207, + "rewards/chosen": 1.6948896135602678, + "rewards/margins": 9.633695087735616, + "rewards/rejected": -7.938805474175347, + "step": 2665 + }, + { + "epoch": 0.9841723962899728, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 6.704260509742266e-09, + "logits/chosen": 2306128554.6666665, + "logits/rejected": 1825969356.8, + "logps/chosen": -264.05324300130206, + "logps/rejected": -447.4041015625, + "loss": 0.1659, + "rewards/chosen": 0.5831642945607504, + "rewards/margins": 7.699357684453328, + "rewards/rejected": -7.116193389892578, + "step": 2666 + }, + { + "epoch": 0.984541553227816, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 6.403048659870403e-09, + "logits/chosen": 1906909457.0666666, + "logits/rejected": 1853934893.1764705, + "logps/chosen": -257.57412109375, + "logps/rejected": -362.3627068014706, + "loss": 0.136, + "rewards/chosen": 1.4207655588785808, + "rewards/margins": 8.372214784809188, + "rewards/rejected": -6.951449225930607, + "step": 2667 + }, + { + "epoch": 0.9849107101656592, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.1087553751310036e-09, + "logits/chosen": 2020786858.6666667, + "logits/rejected": 2117012781.1764705, + "logps/chosen": -339.438671875, + "logps/rejected": -467.76232192095586, + "loss": 0.1575, + "rewards/chosen": 0.824389902750651, + "rewards/margins": 10.809441674924365, + "rewards/rejected": -9.985051772173714, + "step": 2668 + }, + { + "epoch": 0.9852798671035023, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5.821381063264575e-09, + "logits/chosen": 1496014574.9333334, + "logits/rejected": 2214832489.4117646, + "logps/chosen": -234.85865885416666, + "logps/rejected": -416.26878446691177, + "loss": 0.1046, + "rewards/chosen": 2.2083114624023437, + "rewards/margins": 8.260516716452205, + "rewards/rejected": -6.052205254049862, + "step": 2669 + }, + { + "epoch": 0.9856490240413456, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.540926122424295e-09, + "logits/chosen": 1429531794.2857144, + "logits/rejected": 2232819712.0, + "logps/chosen": -265.31566220238096, + "logps/rejected": -558.3601740056819, + "loss": 0.1613, + "rewards/chosen": 1.6129063197544642, + "rewards/margins": 10.425168421361352, + "rewards/rejected": -8.812262101606889, + "step": 2670 + }, + { + "epoch": 0.9860181809791888, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.267390941177119e-09, + "logits/chosen": 1903825618.8235295, + "logits/rejected": 2525285580.8, + "logps/chosen": -210.2591050091912, + "logps/rejected": -448.081640625, + "loss": 0.1612, + "rewards/chosen": 1.4743303411147173, + "rewards/margins": 8.060634560678519, + "rewards/rejected": -6.586304219563802, + "step": 2671 + }, + { + "epoch": 0.9863873379170319, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5.000775898502119e-09, + "logits/chosen": 2278307960.470588, + "logits/rejected": 2111305045.3333333, + "logps/chosen": -273.5592256433824, + "logps/rejected": -634.9578125, + "loss": 0.154, + "rewards/chosen": 1.3823686487534468, + "rewards/margins": 12.184576086904489, + "rewards/rejected": -10.802207438151042, + "step": 2672 + }, + { + "epoch": 0.9867564948548752, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 4.7410813637915885e-09, + "logits/chosen": 1472363081.142857, + "logits/rejected": 1809076451.5555556, + "logps/chosen": -236.62901088169642, + "logps/rejected": -491.54150390625, + "loss": 0.1005, + "rewards/chosen": 1.896531649998256, + "rewards/margins": 9.121386861044263, + "rewards/rejected": -7.224855211046007, + "step": 2673 + }, + { + "epoch": 0.9871256517927184, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 4.4883076968482705e-09, + "logits/chosen": 1409560733.5384614, + "logits/rejected": 2387798554.9473686, + "logps/chosen": -311.677490234375, + "logps/rejected": -459.84847861842104, + "loss": 0.1283, + "rewards/chosen": 1.2104788560133715, + "rewards/margins": 10.783031185628914, + "rewards/rejected": -9.572552329615542, + "step": 2674 + }, + { + "epoch": 0.9874948087305616, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 4.242455247887578e-09, + "logits/chosen": 2303702630.4, + "logits/rejected": 2204633449.4117646, + "logps/chosen": -269.08893229166665, + "logps/rejected": -494.1953125, + "loss": 0.1435, + "rewards/chosen": 1.2793304443359375, + "rewards/margins": 9.860595164579504, + "rewards/rejected": -8.581264720243567, + "step": 2675 + }, + { + "epoch": 0.9878639656684047, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 4.003524357534261e-09, + "logits/chosen": 1863556096.0, + "logits/rejected": 2119664640.0, + "logps/chosen": -276.0074462890625, + "logps/rejected": -405.3759765625, + "loss": 0.1319, + "rewards/chosen": 1.4753732681274414, + "rewards/margins": 9.106996536254883, + "rewards/rejected": -7.631623268127441, + "step": 2676 + }, + { + "epoch": 0.988233122606248, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 3.771515356825184e-09, + "logits/chosen": 1801154432.0, + "logits/rejected": 1619599488.0, + "logps/chosen": -302.5442199707031, + "logps/rejected": -442.544677734375, + "loss": 0.1416, + "rewards/chosen": 1.2289360761642456, + "rewards/margins": 9.085882067680359, + "rewards/rejected": -7.856945991516113, + "step": 2677 + }, + { + "epoch": 0.9886022795440912, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 3.54642856720433e-09, + "logits/chosen": 1617110747.4285715, + "logits/rejected": 2199558144.0, + "logps/chosen": -335.75327845982144, + "logps/rejected": -550.8023003472222, + "loss": 0.1182, + "rewards/chosen": 1.6529129573277064, + "rewards/margins": 8.609712782360258, + "rewards/rejected": -6.956799825032552, + "step": 2678 + }, + { + "epoch": 0.9889714364819344, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 3.328264300527795e-09, + "logits/chosen": 1856321706.6666667, + "logits/rejected": 2096995532.8, + "logps/chosen": -300.51804606119794, + "logps/rejected": -455.17646484375, + "loss": 0.0913, + "rewards/chosen": 2.0016349156697593, + "rewards/margins": 8.809883817036948, + "rewards/rejected": -6.808248901367188, + "step": 2679 + }, + { + "epoch": 0.9893405934197775, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 3.117022859059349e-09, + "logits/chosen": 1768772992.0, + "logits/rejected": 1533526528.0, + "logps/chosen": -279.76739501953125, + "logps/rejected": -530.0838012695312, + "loss": 0.1633, + "rewards/chosen": 1.0856561660766602, + "rewards/margins": 10.3218412399292, + "rewards/rejected": -9.236185073852539, + "step": 2680 + }, + { + "epoch": 0.9897097503576208, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 2.9127045354704343e-09, + "logits/chosen": 1353331507.2, + "logits/rejected": 1638688286.1176472, + "logps/chosen": -257.67609049479165, + "logps/rejected": -546.7764246323529, + "loss": 0.1486, + "rewards/chosen": 1.167668914794922, + "rewards/margins": 8.838933563232422, + "rewards/rejected": -7.6712646484375, + "step": 2681 + }, + { + "epoch": 0.990078907295464, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 2.7153096128423873e-09, + "logits/chosen": 2356781056.0, + "logits/rejected": 2386990742.5882354, + "logps/chosen": -314.0078125, + "logps/rejected": -580.0646254595588, + "loss": 0.159, + "rewards/chosen": 0.9780844370524089, + "rewards/margins": 10.064570355882832, + "rewards/rejected": -9.086485918830423, + "step": 2682 + }, + { + "epoch": 0.9904480642333072, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 2.524838364662552e-09, + "logits/chosen": 1125438208.0, + "logits/rejected": 1529609984.0, + "logps/chosen": -224.22727966308594, + "logps/rejected": -392.63824462890625, + "loss": 0.1154, + "rewards/chosen": 2.2879929542541504, + "rewards/margins": 9.221418857574463, + "rewards/rejected": -6.9334259033203125, + "step": 2683 + }, + { + "epoch": 0.9908172211711503, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 2.3412910548270553e-09, + "logits/chosen": 1885953911.4666667, + "logits/rejected": 1387051369.4117646, + "logps/chosen": -310.96748046875, + "logps/rejected": -414.14013671875, + "loss": 0.1231, + "rewards/chosen": 1.5069929758707683, + "rewards/margins": 9.196313259648342, + "rewards/rejected": -7.689320283777573, + "step": 2684 + }, + { + "epoch": 0.9911863781089936, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 2.164667937638587e-09, + "logits/chosen": 1661965458.2857144, + "logits/rejected": 1651673770.6666667, + "logps/chosen": -264.72024972098217, + "logps/rejected": -376.29212782118054, + "loss": 0.1614, + "rewards/chosen": 1.385552133832659, + "rewards/margins": 7.721670620025151, + "rewards/rejected": -6.3361184861924915, + "step": 2685 + }, + { + "epoch": 0.9915555350468368, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 1.9949692578058453e-09, + "logits/chosen": 1280220711.3846154, + "logits/rejected": 1190837409.6842105, + "logps/chosen": -284.3387920673077, + "logps/rejected": -422.31566097861844, + "loss": 0.1091, + "rewards/chosen": 1.5886978736290565, + "rewards/margins": 9.202871361242131, + "rewards/rejected": -7.614173487613075, + "step": 2686 + }, + { + "epoch": 0.99192469198468, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 1.8321952504435358e-09, + "logits/chosen": 1919817454.9333334, + "logits/rejected": 2792397884.2352943, + "logps/chosen": -292.30247395833334, + "logps/rejected": -628.0380859375, + "loss": 0.1558, + "rewards/chosen": 1.1058822631835938, + "rewards/margins": 12.294076089298024, + "rewards/rejected": -11.18819382611443, + "step": 2687 + }, + { + "epoch": 0.9922938489225231, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 1.6763461410740366e-09, + "logits/chosen": 2189019955.2, + "logits/rejected": 2052514996.7058823, + "logps/chosen": -232.75947265625, + "logps/rejected": -466.83777573529414, + "loss": 0.1287, + "rewards/chosen": 1.3806357065836588, + "rewards/margins": 8.761045844882142, + "rewards/rejected": -7.380410138298483, + "step": 2688 + }, + { + "epoch": 0.9926630058603664, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 1.527422145624069e-09, + "logits/chosen": 1962611049.4117646, + "logits/rejected": 1798243123.2, + "logps/chosen": -174.5126522288603, + "logps/rejected": -1177.6569010416667, + "loss": 0.1159, + "rewards/chosen": 2.176556755514706, + "rewards/margins": 69.38008865655638, + "rewards/rejected": -67.20353190104167, + "step": 2689 + }, + { + "epoch": 0.9930321627982096, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 1.3854234704252512e-09, + "logits/chosen": 1635945472.0, + "logits/rejected": 1560666258.2857144, + "logps/chosen": -288.66259765625, + "logps/rejected": -366.18697684151783, + "loss": 0.1851, + "rewards/chosen": 1.1693728764851887, + "rewards/margins": 8.363293556939988, + "rewards/rejected": -7.1939206804547995, + "step": 2690 + }, + { + "epoch": 0.9934013197360528, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 1.2503503122157644e-09, + "logits/chosen": 2278560290.133333, + "logits/rejected": 2117555380.7058823, + "logps/chosen": -312.8912109375, + "logps/rejected": -505.9911534926471, + "loss": 0.0957, + "rewards/chosen": 2.154127756754557, + "rewards/margins": 9.746973897896561, + "rewards/rejected": -7.592846141142004, + "step": 2691 + }, + { + "epoch": 0.993770476673896, + "grad_norm": 12.6875, + "kl": 0.27292728424072266, + "learning_rate": 1.1222028581375777e-09, + "logits/chosen": 1833704727.2727273, + "logits/rejected": 1995080294.4, + "logps/chosen": -250.29871715198863, + "logps/rejected": -488.13388671875, + "loss": 0.1677, + "rewards/chosen": 1.7184743014248935, + "rewards/margins": 9.825446805087005, + "rewards/rejected": -8.10697250366211, + "step": 2692 + }, + { + "epoch": 0.9941396336117392, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 1.0009812857370016e-09, + "logits/chosen": 1384891187.2, + "logits/rejected": 1300102927.0588236, + "logps/chosen": -257.19176432291664, + "logps/rejected": -534.3570772058823, + "loss": 0.1015, + "rewards/chosen": 2.0398653666178386, + "rewards/margins": 10.822980529186772, + "rewards/rejected": -8.783115162568933, + "step": 2693 + }, + { + "epoch": 0.9945087905495824, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.866857629652448e-10, + "logits/chosen": 1904536090.9473684, + "logits/rejected": 1609958006.1538463, + "logps/chosen": -222.94662314967104, + "logps/rejected": -469.6526066706731, + "loss": 0.1488, + "rewards/chosen": 1.6630820224159641, + "rewards/margins": 7.7216481892203515, + "rewards/rejected": -6.058566166804387, + "step": 2694 + }, + { + "epoch": 0.9948779474874256, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 7.79316448177303e-10, + "logits/chosen": 2026763317.8947368, + "logits/rejected": 1851649260.3076923, + "logps/chosen": -306.83213404605266, + "logps/rejected": -439.92784705528845, + "loss": 0.1815, + "rewards/chosen": 1.1572223462556537, + "rewards/margins": 9.586389286798022, + "rewards/rejected": -8.429166940542368, + "step": 2695 + }, + { + "epoch": 0.9952471044252688, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 6.788734901319594e-10, + "logits/chosen": 1437191246.7692308, + "logits/rejected": 1848799447.5789473, + "logps/chosen": -278.6091871995192, + "logps/rejected": -350.79384251644734, + "loss": 0.1104, + "rewards/chosen": 1.5873936873215895, + "rewards/margins": 8.480913841772658, + "rewards/rejected": -6.893520154451069, + "step": 2696 + }, + { + "epoch": 0.995616261363112, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.853570279917842e-10, + "logits/chosen": 1301826074.9473684, + "logits/rejected": 1555752172.3076923, + "logps/chosen": -240.24953741776315, + "logps/rejected": -434.84945913461536, + "loss": 0.1943, + "rewards/chosen": 1.018631282605623, + "rewards/margins": 7.981774851378159, + "rewards/rejected": -6.963143568772536, + "step": 2697 + }, + { + "epoch": 0.9959854183009552, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 4.9876719132258e-10, + "logits/chosen": 2016551774.3157895, + "logits/rejected": 1550716297.8461537, + "logps/chosen": -385.80299136513156, + "logps/rejected": -442.0441706730769, + "loss": 0.1724, + "rewards/chosen": 1.1733211718107526, + "rewards/margins": 8.85781753787145, + "rewards/rejected": -7.6844963660606975, + "step": 2698 + }, + { + "epoch": 0.9963545752387984, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 4.1910410009338155e-10, + "logits/chosen": 2015855360.0, + "logits/rejected": 1756510720.0, + "logps/chosen": -325.09527587890625, + "logps/rejected": -505.0140380859375, + "loss": 0.1191, + "rewards/chosen": 1.6802868843078613, + "rewards/margins": 10.370177745819092, + "rewards/rejected": -8.68989086151123, + "step": 2699 + }, + { + "epoch": 0.9967237321766416, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 3.4636786467590057e-10, + "logits/chosen": 1488480015.0588236, + "logits/rejected": 1345921979.7333333, + "logps/chosen": -270.6277860753676, + "logps/rejected": -390.25084635416664, + "loss": 0.1478, + "rewards/chosen": 1.6022502674775965, + "rewards/margins": 9.08922841689166, + "rewards/rejected": -7.486978149414062, + "step": 2700 + }, + { + "epoch": 0.9970928891144848, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 2.805585858461912e-10, + "logits/chosen": 1467404014.9333334, + "logits/rejected": 1482274816.0, + "logps/chosen": -229.68011067708332, + "logps/rejected": -442.8195369944853, + "loss": 0.1603, + "rewards/chosen": 0.9035256067911784, + "rewards/margins": 8.595459631377576, + "rewards/rejected": -7.691934024586397, + "step": 2701 + }, + { + "epoch": 0.997462046052328, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 2.2167635478187454e-10, + "logits/chosen": 1924537856.0, + "logits/rejected": 1670826752.0, + "logps/chosen": -294.6661376953125, + "logps/rejected": -515.1318359375, + "loss": 0.1166, + "rewards/chosen": 1.922693133354187, + "rewards/margins": 8.8905690908432, + "rewards/rejected": -6.967875957489014, + "step": 2702 + }, + { + "epoch": 0.9978312029901713, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 1.697212530632486e-10, + "logits/chosen": 1987359232.0, + "logits/rejected": 2110367360.0, + "logps/chosen": -296.7464294433594, + "logps/rejected": -434.89569091796875, + "loss": 0.1493, + "rewards/chosen": 1.2833212614059448, + "rewards/margins": 9.188984751701355, + "rewards/rejected": -7.90566349029541, + "step": 2703 + }, + { + "epoch": 0.9982003599280144, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 1.2469335267384364e-10, + "logits/chosen": 2496871725.1764708, + "logits/rejected": 2207329484.8, + "logps/chosen": -401.1233340992647, + "logps/rejected": -493.45514322916665, + "loss": 0.1682, + "rewards/chosen": 1.0576911253087662, + "rewards/margins": 9.304263492659027, + "rewards/rejected": -8.246572367350261, + "step": 2704 + }, + { + "epoch": 0.9985695168658576, + "grad_norm": 10.6875, + "kl": 2.807370185852051, + "learning_rate": 8.659271599875673e-11, + "logits/chosen": 1821663095.4666667, + "logits/rejected": 1756929325.1764705, + "logps/chosen": -264.81969401041664, + "logps/rejected": -585.9879940257352, + "loss": 0.1425, + "rewards/chosen": 1.8745740254720051, + "rewards/margins": 10.760795518463734, + "rewards/rejected": -8.886221492991728, + "step": 2705 + }, + { + "epoch": 0.9989386738037008, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.5419395826317166e-11, + "logits/chosen": 1793633201.2307692, + "logits/rejected": 1688122421.8947368, + "logps/chosen": -447.7878605769231, + "logps/rejected": -390.38833778782896, + "loss": 0.105, + "rewards/chosen": 1.6783084869384766, + "rewards/margins": 8.667085246035928, + "rewards/rejected": -6.98877675909745, + "step": 2706 + }, + { + "epoch": 0.999307830741544, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 3.1173435346976146e-11, + "logits/chosen": 2300794112.0, + "logits/rejected": 2281580032.0, + "logps/chosen": -291.7148132324219, + "logps/rejected": -411.2530517578125, + "loss": 0.1715, + "rewards/chosen": 1.1019021272659302, + "rewards/margins": 7.986650109291077, + "rewards/rejected": -6.8847479820251465, + "step": 2707 + }, + { + "epoch": 0.9996769876793872, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 1.385486815219661e-11, + "logits/chosen": 1962445824.0, + "logits/rejected": 1995209984.0, + "logps/chosen": -324.60113525390625, + "logps/rejected": -472.29705810546875, + "loss": 0.1697, + "rewards/chosen": 1.1132055521011353, + "rewards/margins": 8.41993534564972, + "rewards/rejected": -7.306729793548584, + "step": 2708 + }, + { + "epoch": 1.0003691569378432, + "grad_norm": 13.25, + "kl": 1.497446060180664, + "learning_rate": 3.4637182377839086e-12, + "logits/chosen": 2276843520.0, + "logits/rejected": 1520478890.6666667, + "logps/chosen": -241.78392650462962, + "logps/rejected": -495.4312220982143, + "loss": 0.2533, + "rewards/chosen": 1.4716101752387152, + "rewards/margins": 10.138540600973464, + "rewards/rejected": -8.666930425734748, + "step": 2709 + } + ], + "logging_steps": 1, + "max_steps": 2709, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1355, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}