diff --git "a/checkpoint-1355/trainer_state.json" "b/checkpoint-1355/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1355/trainer_state.json" @@ -0,0 +1,20407 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.5002076507775368, + "eval_steps": 339, + "global_step": 1355, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003691569378432006, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": 2555141026.909091, + "logits/rejected": 1722975436.8, + "logps/chosen": -251.8519620028409, + "logps/rejected": -332.2370361328125, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0007383138756864012, + "grad_norm": 27.875, + "kl": 0.0, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": 2082756500.2105262, + "logits/rejected": 2078594441.8461537, + "logps/chosen": -306.5312243009868, + "logps/rejected": -322.86951622596155, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0011074708135296017, + "grad_norm": 37.0, + "kl": 0.283663272857666, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 1958897078.857143, + "logits/rejected": 1782147299.5555556, + "logps/chosen": -274.17801339285717, + "logps/rejected": -417.18321397569446, + "loss": 0.4966, + "rewards/chosen": 0.011177281183855874, + "rewards/margins": 0.018043566554311722, + "rewards/rejected": -0.006866285370455848, + "step": 3 + }, + { + "epoch": 0.0014766277513728024, + "grad_norm": 35.25, + "kl": 0.2572214603424072, + "learning_rate": 7.5e-07, + "logits/chosen": 1230686354.2857144, + "logits/rejected": 1339229525.3333333, + "logps/chosen": -289.994384765625, + "logps/rejected": -427.45372178819446, + "loss": 0.4953, + "rewards/chosen": -0.0008347396339688982, + "rewards/margins": 0.004339981646764846, + "rewards/rejected": -0.005174721280733745, + "step": 4 + }, + { + "epoch": 0.001845784689216003, + "grad_norm": 32.0, + "kl": 0.27948784828186035, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 2612449437.5384617, + "logits/rejected": 1402594142.3157895, + "logps/chosen": -226.52452674278845, + "logps/rejected": -396.50986842105266, + "loss": 0.4904, + "rewards/chosen": -0.013464003801345825, + "rewards/margins": 0.04047517556893198, + "rewards/rejected": -0.053939179370277805, + "step": 5 + }, + { + "epoch": 0.0022149416270592034, + "grad_norm": 31.5, + "kl": 0.05804014205932617, + "learning_rate": 1.25e-06, + "logits/chosen": 2320796435.6923075, + "logits/rejected": 1748089802.1052632, + "logps/chosen": -256.34786283052887, + "logps/rejected": -338.63111636513156, + "loss": 0.4988, + "rewards/chosen": 0.011130671088512126, + "rewards/margins": 0.015404057074413608, + "rewards/rejected": -0.004273385985901481, + "step": 6 + }, + { + "epoch": 0.0025840985649024043, + "grad_norm": 27.875, + "kl": 0.17689180374145508, + "learning_rate": 1.5e-06, + "logits/chosen": 2760544451.047619, + "logits/rejected": 2041941457.4545455, + "logps/chosen": -296.3612583705357, + "logps/rejected": -362.95363547585225, + "loss": 0.5007, + "rewards/chosen": -0.021251624538784937, + "rewards/margins": 0.02772714849158283, + "rewards/rejected": -0.04897877303036777, + "step": 7 + }, + { + "epoch": 0.002953255502745605, + "grad_norm": 28.125, + "kl": 0.29871606826782227, + "learning_rate": 1.75e-06, + "logits/chosen": 1756430131.2, + "logits/rejected": 1819027626.6666667, + "logps/chosen": -257.7052734375, + "logps/rejected": -455.510498046875, + "loss": 0.4882, + "rewards/chosen": 0.020974960923194886, + "rewards/margins": 0.12168754835923512, + "rewards/rejected": -0.10071258743604024, + "step": 8 + }, + { + "epoch": 0.0033224124405888053, + "grad_norm": 32.75, + "kl": 0.10448455810546875, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 1686372522.6666667, + "logits/rejected": 1566218342.4, + "logps/chosen": -356.6879069010417, + "logps/rejected": -333.321240234375, + "loss": 0.4925, + "rewards/chosen": -0.01961027830839157, + "rewards/margins": 0.04176507145166397, + "rewards/rejected": -0.06137534976005554, + "step": 9 + }, + { + "epoch": 0.003691569378432006, + "grad_norm": 28.375, + "kl": 0.027356624603271484, + "learning_rate": 2.25e-06, + "logits/chosen": 1681367142.4, + "logits/rejected": 1744424618.6666667, + "logps/chosen": -232.4369384765625, + "logps/rejected": -382.1427408854167, + "loss": 0.4852, + "rewards/chosen": 0.022665101289749145, + "rewards/margins": 0.14383291999499004, + "rewards/rejected": -0.12116781870524089, + "step": 10 + }, + { + "epoch": 0.004060726316275206, + "grad_norm": 27.875, + "kl": 0.0, + "learning_rate": 2.5e-06, + "logits/chosen": 1780963930.3529413, + "logits/rejected": 1877864311.4666667, + "logps/chosen": -329.39720243566177, + "logps/rejected": -287.20159505208335, + "loss": 0.4781, + "rewards/chosen": 0.013715656364665312, + "rewards/margins": 0.18694175855786194, + "rewards/rejected": -0.17322610219319662, + "step": 11 + }, + { + "epoch": 0.004429883254118407, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": 1958743868.952381, + "logits/rejected": 1672948642.909091, + "logps/chosen": -238.36486235119048, + "logps/rejected": -335.138427734375, + "loss": 0.477, + "rewards/chosen": 0.003446015573683239, + "rewards/margins": 0.26674821172957813, + "rewards/rejected": -0.2633021961558949, + "step": 12 + }, + { + "epoch": 0.004799040191961607, + "grad_norm": 27.5, + "kl": 0.0, + "learning_rate": 3e-06, + "logits/chosen": 1642847501.4736843, + "logits/rejected": 1847050712.6153846, + "logps/chosen": -254.55658922697367, + "logps/rejected": -344.00338040865387, + "loss": 0.469, + "rewards/chosen": 0.006821201820122569, + "rewards/margins": 0.30787281848882375, + "rewards/rejected": -0.30105161666870117, + "step": 13 + }, + { + "epoch": 0.005168197129804809, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": 1689543378.8235295, + "logits/rejected": 2345390353.0666666, + "logps/chosen": -319.5962775735294, + "logps/rejected": -290.24296875, + "loss": 0.4407, + "rewards/chosen": 0.0822614080765668, + "rewards/margins": 0.5117390361486697, + "rewards/rejected": -0.4294776280721029, + "step": 14 + }, + { + "epoch": 0.005537354067648009, + "grad_norm": 30.125, + "kl": 0.0, + "learning_rate": 3.5e-06, + "logits/chosen": 1949523727.0588236, + "logits/rejected": 1690669875.2, + "logps/chosen": -269.87184053308823, + "logps/rejected": -450.46067708333334, + "loss": 0.4273, + "rewards/chosen": 0.012773087796042948, + "rewards/margins": 0.6631298891469544, + "rewards/rejected": -0.6503568013509115, + "step": 15 + }, + { + "epoch": 0.00590651100549121, + "grad_norm": 25.25, + "kl": 0.0, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": 1412717977.6, + "logits/rejected": 1897293653.3333333, + "logps/chosen": -212.9364013671875, + "logps/rejected": -375.4115804036458, + "loss": 0.4311, + "rewards/chosen": 0.04288797378540039, + "rewards/margins": 0.7666228294372559, + "rewards/rejected": -0.7237348556518555, + "step": 16 + }, + { + "epoch": 0.00627566794333441, + "grad_norm": 27.375, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 2409784320.0, + "logits/rejected": 2257375232.0, + "logps/chosen": -280.689453125, + "logps/rejected": -499.2878689236111, + "loss": 0.3772, + "rewards/chosen": 0.07687444346291679, + "rewards/margins": 1.128713424243624, + "rewards/rejected": -1.0518389807807074, + "step": 17 + }, + { + "epoch": 0.006644824881177611, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 4.25e-06, + "logits/chosen": 1563148288.0, + "logits/rejected": 2064093952.0, + "logps/chosen": -276.1801452636719, + "logps/rejected": -396.57135009765625, + "loss": 0.3865, + "rewards/chosen": 0.053327564150094986, + "rewards/margins": 1.0444594658911228, + "rewards/rejected": -0.9911319017410278, + "step": 18 + }, + { + "epoch": 0.007013981819020811, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 4.5e-06, + "logits/chosen": 1444451388.235294, + "logits/rejected": 2488889617.0666666, + "logps/chosen": -223.35120346966912, + "logps/rejected": -440.9142252604167, + "loss": 0.3669, + "rewards/chosen": 0.012831165510065416, + "rewards/margins": 1.4073760212636461, + "rewards/rejected": -1.3945448557535807, + "step": 19 + }, + { + "epoch": 0.007383138756864012, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 4.75e-06, + "logits/chosen": 1629098586.3529413, + "logits/rejected": 1632897706.6666667, + "logps/chosen": -239.83998736213235, + "logps/rejected": -396.71429036458335, + "loss": 0.3576, + "rewards/chosen": 0.07079794126398423, + "rewards/margins": 1.4907841574911977, + "rewards/rejected": -1.4199862162272134, + "step": 20 + }, + { + "epoch": 0.007752295694707212, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2269135667.2, + "logits/rejected": 2437404310.5882354, + "logps/chosen": -308.4866536458333, + "logps/rejected": -420.2818244485294, + "loss": 0.3174, + "rewards/chosen": 0.01794281005859375, + "rewards/margins": 1.9573586856617646, + "rewards/rejected": -1.939415875603171, + "step": 21 + }, + { + "epoch": 0.008121452632550413, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5.2500000000000006e-06, + "logits/chosen": 1493249417.8461537, + "logits/rejected": 1447019250.5263157, + "logps/chosen": -288.2017352764423, + "logps/rejected": -401.0751182154605, + "loss": 0.2948, + "rewards/chosen": 0.049938201904296875, + "rewards/margins": 2.054342169510691, + "rewards/rejected": -2.004403967606394, + "step": 22 + }, + { + "epoch": 0.008490609570393614, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": 1338189824.0, + "logits/rejected": 1443653745.7777777, + "logps/chosen": -225.77328055245536, + "logps/rejected": -396.80059136284723, + "loss": 0.2875, + "rewards/chosen": 0.1072447555405753, + "rewards/margins": 2.363371159349169, + "rewards/rejected": -2.2561264038085938, + "step": 23 + }, + { + "epoch": 0.008859766508236814, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5.75e-06, + "logits/chosen": 2125301174.857143, + "logits/rejected": 1851773610.6666667, + "logps/chosen": -331.76150948660717, + "logps/rejected": -425.1775716145833, + "loss": 0.2708, + "rewards/chosen": 0.101840112890516, + "rewards/margins": 2.6721992369682073, + "rewards/rejected": -2.570359124077691, + "step": 24 + }, + { + "epoch": 0.009228923446080015, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 6e-06, + "logits/chosen": 1630805196.8, + "logits/rejected": 1687169024.0, + "logps/chosen": -299.4375244140625, + "logps/rejected": -309.2835693359375, + "loss": 0.3601, + "rewards/chosen": 0.09011529088020324, + "rewards/margins": 2.1960324267546336, + "rewards/rejected": -2.10591713587443, + "step": 25 + }, + { + "epoch": 0.009598080383923215, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 6.25e-06, + "logits/chosen": 2366178157.714286, + "logits/rejected": 1714451456.0, + "logps/chosen": -359.56455775669644, + "logps/rejected": -432.9134928385417, + "loss": 0.262, + "rewards/chosen": 0.025659620761871338, + "rewards/margins": 3.4077053666114807, + "rewards/rejected": -3.3820457458496094, + "step": 26 + }, + { + "epoch": 0.009967237321766416, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": 2193663122.285714, + "logits/rejected": 1679756856.8888888, + "logps/chosen": -235.818115234375, + "logps/rejected": -456.0146484375, + "loss": 0.2258, + "rewards/chosen": 0.21129742690495082, + "rewards/margins": 4.282008774696834, + "rewards/rejected": -4.0707113477918835, + "step": 27 + }, + { + "epoch": 0.010336394259609617, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 6.750000000000001e-06, + "logits/chosen": 1532497042.2857144, + "logits/rejected": 1563654371.5555556, + "logps/chosen": -345.4580078125, + "logps/rejected": -528.5141059027778, + "loss": 0.2312, + "rewards/chosen": 0.08127937146595546, + "rewards/margins": 4.695724527987222, + "rewards/rejected": -4.614445156521267, + "step": 28 + }, + { + "epoch": 0.010705551197452817, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 7e-06, + "logits/chosen": 1362604513.8823528, + "logits/rejected": 1938225561.6, + "logps/chosen": -291.52001953125, + "logps/rejected": -444.8460286458333, + "loss": 0.2666, + "rewards/chosen": 0.24524837381699505, + "rewards/margins": 4.950824443966735, + "rewards/rejected": -4.70557607014974, + "step": 29 + }, + { + "epoch": 0.011074708135296018, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 7.25e-06, + "logits/chosen": 1780472320.0, + "logits/rejected": 2500088832.0, + "logps/chosen": -293.9621175130208, + "logps/rejected": -485.8729248046875, + "loss": 0.3438, + "rewards/chosen": 0.3926080067952474, + "rewards/margins": 4.688485463460286, + "rewards/rejected": -4.295877456665039, + "step": 30 + }, + { + "epoch": 0.011443865073139218, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": 1438574464.0, + "logits/rejected": 1654970112.0, + "logps/chosen": -245.86178588867188, + "logps/rejected": -390.7449035644531, + "loss": 0.2853, + "rewards/chosen": 0.16007846593856812, + "rewards/margins": 4.362633645534515, + "rewards/rejected": -4.202555179595947, + "step": 31 + }, + { + "epoch": 0.01181302201098242, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 7.75e-06, + "logits/chosen": 2016926720.0, + "logits/rejected": 1886617827.5555556, + "logps/chosen": -271.82486397879467, + "logps/rejected": -485.7682834201389, + "loss": 0.2217, + "rewards/chosen": 0.2042757272720337, + "rewards/margins": 6.365384380022685, + "rewards/rejected": -6.161108652750651, + "step": 32 + }, + { + "epoch": 0.012182178948825619, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 1638726784.0, + "logits/rejected": 2490309632.0, + "logps/chosen": -334.6903076171875, + "logps/rejected": -540.9254150390625, + "loss": 0.2856, + "rewards/chosen": 0.1194494366645813, + "rewards/margins": 5.167553722858429, + "rewards/rejected": -5.048104286193848, + "step": 33 + }, + { + "epoch": 0.01255133588666882, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 8.25e-06, + "logits/chosen": 2277872876.3076925, + "logits/rejected": 1634631895.5789473, + "logps/chosen": -161.47468449519232, + "logps/rejected": -396.28759765625, + "loss": 0.186, + "rewards/chosen": 1.003878666804387, + "rewards/margins": 5.884395923691723, + "rewards/rejected": -4.880517256887336, + "step": 34 + }, + { + "epoch": 0.01292049282451202, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.5e-06, + "logits/chosen": 1731094407.5294118, + "logits/rejected": 1873154184.5333333, + "logps/chosen": -224.08095415900735, + "logps/rejected": -430.52314453125, + "loss": 0.226, + "rewards/chosen": 0.5978934344123391, + "rewards/margins": 6.125739707198798, + "rewards/rejected": -5.527846272786459, + "step": 35 + }, + { + "epoch": 0.013289649762355221, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 8.750000000000001e-06, + "logits/chosen": 1384088868.5714285, + "logits/rejected": 2011694876.4444444, + "logps/chosen": -226.98458426339286, + "logps/rejected": -484.6765407986111, + "loss": 0.2063, + "rewards/chosen": 0.6680049896240234, + "rewards/margins": 6.333443323771159, + "rewards/rejected": -5.665438334147136, + "step": 36 + }, + { + "epoch": 0.013658806700198423, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9e-06, + "logits/chosen": 1342492416.0, + "logits/rejected": 1389898496.0, + "logps/chosen": -318.62762451171875, + "logps/rejected": -474.1055908203125, + "loss": 0.2231, + "rewards/chosen": 0.7867870330810547, + "rewards/margins": 6.662944316864014, + "rewards/rejected": -5.876157283782959, + "step": 37 + }, + { + "epoch": 0.014027963638041622, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.250000000000001e-06, + "logits/chosen": 1467727510.5882354, + "logits/rejected": 1590144477.8666666, + "logps/chosen": -284.17790670955884, + "logps/rejected": -361.7457682291667, + "loss": 0.287, + "rewards/chosen": 0.3623030325945686, + "rewards/margins": 4.944236626344568, + "rewards/rejected": -4.58193359375, + "step": 38 + }, + { + "epoch": 0.014397120575884824, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 9.5e-06, + "logits/chosen": 2600348785.7777777, + "logits/rejected": 1680918089.142857, + "logps/chosen": -322.96826171875, + "logps/rejected": -401.3741978236607, + "loss": 0.3034, + "rewards/chosen": 0.0875967608557807, + "rewards/margins": 5.292944094491383, + "rewards/rejected": -5.205347333635602, + "step": 39 + }, + { + "epoch": 0.014766277513728023, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.75e-06, + "logits/chosen": 1276814592.0, + "logits/rejected": 1708352896.0, + "logps/chosen": -316.9996643066406, + "logps/rejected": -516.47412109375, + "loss": 0.2507, + "rewards/chosen": 0.17344313859939575, + "rewards/margins": 6.922157108783722, + "rewards/rejected": -6.748713970184326, + "step": 40 + }, + { + "epoch": 0.015135434451571225, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": 1520733605.6470587, + "logits/rejected": 1560093081.6, + "logps/chosen": -274.39734604779414, + "logps/rejected": -492.22047526041666, + "loss": 0.2438, + "rewards/chosen": 0.5887775421142578, + "rewards/margins": 6.674698257446289, + "rewards/rejected": -6.085920715332032, + "step": 41 + }, + { + "epoch": 0.015504591389414424, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.999996536281763e-06, + "logits/chosen": 1878332022.1538463, + "logits/rejected": 1688040933.0526316, + "logps/chosen": -324.23475060096155, + "logps/rejected": -385.46425267269734, + "loss": 0.2469, + "rewards/chosen": -0.3026095170241136, + "rewards/margins": 5.725915182939907, + "rewards/rejected": -6.028524699964021, + "step": 42 + }, + { + "epoch": 0.015873748327257624, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.999986145131847e-06, + "logits/chosen": 2449532791.4666667, + "logits/rejected": 2073857325.1764705, + "logps/chosen": -319.5615559895833, + "logps/rejected": -430.82238051470586, + "loss": 0.2306, + "rewards/chosen": 0.5627494176228841, + "rewards/margins": 5.6860094668818455, + "rewards/rejected": -5.1232600492589615, + "step": 43 + }, + { + "epoch": 0.016242905265100825, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.999968826564655e-06, + "logits/chosen": 1568372736.0, + "logits/rejected": 1516755353.6, + "logps/chosen": -255.79056803385416, + "logps/rejected": -429.844091796875, + "loss": 0.1592, + "rewards/chosen": 0.7009109656016032, + "rewards/margins": 6.3849418799082445, + "rewards/rejected": -5.684030914306641, + "step": 44 + }, + { + "epoch": 0.016612062202944027, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.999944580604174e-06, + "logits/chosen": 1150999990.857143, + "logits/rejected": 1218021376.0, + "logps/chosen": -256.4312453497024, + "logps/rejected": -359.18845436789775, + "loss": 0.2696, + "rewards/chosen": 0.9469997769310361, + "rewards/margins": 5.699052430850602, + "rewards/rejected": -4.752052653919566, + "step": 45 + }, + { + "epoch": 0.016981219140787228, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.999913407284001e-06, + "logits/chosen": 2178425072.9411764, + "logits/rejected": 1514750225.0666666, + "logps/chosen": -290.7824276194853, + "logps/rejected": -377.14329427083334, + "loss": 0.2103, + "rewards/chosen": 1.0636055890251608, + "rewards/margins": 5.781053505691828, + "rewards/rejected": -4.717447916666667, + "step": 46 + }, + { + "epoch": 0.01735037607863043, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.999875306647327e-06, + "logits/chosen": 2119525034.6666667, + "logits/rejected": 1734975926.857143, + "logps/chosen": -304.3030598958333, + "logps/rejected": -406.95706612723217, + "loss": 0.267, + "rewards/chosen": 0.4767913818359375, + "rewards/margins": 6.05388913835798, + "rewards/rejected": -5.5770977565220425, + "step": 47 + }, + { + "epoch": 0.017719533016473627, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.999830278746938e-06, + "logits/chosen": 1849448448.0, + "logits/rejected": 1809036288.0, + "logps/chosen": -308.7493896484375, + "logps/rejected": -417.7481282552083, + "loss": 0.2787, + "rewards/chosen": 0.49034552574157714, + "rewards/margins": 6.644803253809611, + "rewards/rejected": -6.154457728068034, + "step": 48 + }, + { + "epoch": 0.01808868995431683, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.99977832364522e-06, + "logits/chosen": 1877806622.1176472, + "logits/rejected": 1842179959.4666667, + "logps/chosen": -271.62867647058823, + "logps/rejected": -498.37220052083336, + "loss": 0.2441, + "rewards/chosen": 0.5726045159732595, + "rewards/margins": 7.073316592796177, + "rewards/rejected": -6.500712076822917, + "step": 49 + }, + { + "epoch": 0.01845784689216003, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 9.999719441414155e-06, + "logits/chosen": 1857280186.1818182, + "logits/rejected": 1451943014.4, + "logps/chosen": -306.71364524147725, + "logps/rejected": -391.633154296875, + "loss": 0.3183, + "rewards/chosen": 0.33392316644841974, + "rewards/margins": 5.827245157415216, + "rewards/rejected": -5.4933219909667965, + "step": 50 + }, + { + "epoch": 0.01882700383000323, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.999653632135325e-06, + "logits/chosen": 1494967847.3846154, + "logits/rejected": 1591785579.7894738, + "logps/chosen": -250.60661433293268, + "logps/rejected": -404.4912623355263, + "loss": 0.2167, + "rewards/chosen": 0.15932913926931527, + "rewards/margins": 6.253341349512942, + "rewards/rejected": -6.094012210243626, + "step": 51 + }, + { + "epoch": 0.01919616076784643, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.999580895899908e-06, + "logits/chosen": 1338781286.4, + "logits/rejected": 1373899682.909091, + "logps/chosen": -306.9797607421875, + "logps/rejected": -356.29341264204544, + "loss": 0.1584, + "rewards/chosen": 0.7750319480895996, + "rewards/margins": 6.473371236974543, + "rewards/rejected": -5.698339288884943, + "step": 52 + }, + { + "epoch": 0.01956531770568963, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.999501232808678e-06, + "logits/chosen": 1816235101.090909, + "logits/rejected": 1874532556.8, + "logps/chosen": -229.54740767045453, + "logps/rejected": -438.01259765625, + "loss": 0.2747, + "rewards/chosen": 0.9943005821921609, + "rewards/margins": 6.325253798744895, + "rewards/rejected": -5.330953216552734, + "step": 53 + }, + { + "epoch": 0.019934474643532832, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.99941464297201e-06, + "logits/chosen": 1768479690.1052632, + "logits/rejected": 2750227062.1538463, + "logps/chosen": -260.1775544819079, + "logps/rejected": -534.6139948918269, + "loss": 0.2363, + "rewards/chosen": 0.7120265960693359, + "rewards/margins": 7.150468532855694, + "rewards/rejected": -6.438441936786358, + "step": 54 + }, + { + "epoch": 0.020303631581376033, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.99932112650987e-06, + "logits/chosen": 1212389717.3333333, + "logits/rejected": 1421195264.0, + "logps/chosen": -186.72550455729166, + "logps/rejected": -445.15792410714283, + "loss": 0.1812, + "rewards/chosen": 1.3645939297146268, + "rewards/margins": 7.265176379491413, + "rewards/rejected": -5.900582449776786, + "step": 55 + }, + { + "epoch": 0.020672788519219235, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.999220683551823e-06, + "logits/chosen": 1872288587.2941177, + "logits/rejected": 1455958698.6666667, + "logps/chosen": -325.3602941176471, + "logps/rejected": -391.61220703125, + "loss": 0.2451, + "rewards/chosen": 0.7901762233060949, + "rewards/margins": 5.693230015623803, + "rewards/rejected": -4.903053792317708, + "step": 56 + }, + { + "epoch": 0.021041945457062432, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.999113314237036e-06, + "logits/chosen": 1607395012.9230769, + "logits/rejected": 1758834472.4210527, + "logps/chosen": -224.738037109375, + "logps/rejected": -384.92041015625, + "loss": 0.2056, + "rewards/chosen": 0.5672861979557917, + "rewards/margins": 5.63627519877816, + "rewards/rejected": -5.068989000822368, + "step": 57 + }, + { + "epoch": 0.021411102394905634, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.998999018714264e-06, + "logits/chosen": 2056694442.6666667, + "logits/rejected": 2724740827.428571, + "logps/chosen": -221.12300618489584, + "logps/rejected": -595.9167131696429, + "loss": 0.2111, + "rewards/chosen": 1.227534082200792, + "rewards/margins": 8.258503974430145, + "rewards/rejected": -7.030969892229352, + "step": 58 + }, + { + "epoch": 0.021780259332748835, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.998877797141864e-06, + "logits/chosen": 2459097560.6153846, + "logits/rejected": 2502096572.631579, + "logps/chosen": -306.51461087740387, + "logps/rejected": -569.040861430921, + "loss": 0.1967, + "rewards/chosen": 0.23113008645864633, + "rewards/margins": 7.162394863391213, + "rewards/rejected": -6.931264776932566, + "step": 59 + }, + { + "epoch": 0.022149416270592037, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.998749649687784e-06, + "logits/chosen": 1715101559.4666667, + "logits/rejected": 1654963862.5882354, + "logps/chosen": -281.77239583333335, + "logps/rejected": -463.271484375, + "loss": 0.2371, + "rewards/chosen": 0.246796719233195, + "rewards/margins": 6.170544062408746, + "rewards/rejected": -5.923747343175552, + "step": 60 + }, + { + "epoch": 0.022518573208435234, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.998614576529575e-06, + "logits/chosen": 1463421269.3333333, + "logits/rejected": 1394006630.4, + "logps/chosen": -364.2314453125, + "logps/rejected": -443.4783203125, + "loss": 0.192, + "rewards/chosen": 0.3698062101999919, + "rewards/margins": 5.932623211542766, + "rewards/rejected": -5.562817001342774, + "step": 61 + }, + { + "epoch": 0.022887730146278436, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.998472577854377e-06, + "logits/chosen": 2008661196.8, + "logits/rejected": 1713685082.3529413, + "logps/chosen": -243.009765625, + "logps/rejected": -311.3816348805147, + "loss": 0.2156, + "rewards/chosen": 0.4195224444071452, + "rewards/margins": 5.323240831786511, + "rewards/rejected": -4.9037183873793655, + "step": 62 + }, + { + "epoch": 0.023256887084121637, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.998323653858927e-06, + "logits/chosen": 1677440000.0, + "logits/rejected": 1899185720.8888888, + "logps/chosen": -295.54966517857144, + "logps/rejected": -423.97398546006946, + "loss": 0.223, + "rewards/chosen": 0.3550265516553606, + "rewards/margins": 5.674383681917948, + "rewards/rejected": -5.319357130262587, + "step": 63 + }, + { + "epoch": 0.02362604402196484, + "grad_norm": 13.3125, + "kl": 0.0580594539642334, + "learning_rate": 9.998167804749557e-06, + "logits/chosen": 1694137472.0, + "logits/rejected": 2233405184.0, + "logps/chosen": -286.14202880859375, + "logps/rejected": -451.09979248046875, + "loss": 0.1781, + "rewards/chosen": 1.5691519975662231, + "rewards/margins": 7.318243622779846, + "rewards/rejected": -5.749091625213623, + "step": 64 + }, + { + "epoch": 0.02399520095980804, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.998005030742195e-06, + "logits/chosen": 2032204619.2941177, + "logits/rejected": 2416886033.0666666, + "logps/chosen": -190.55962775735293, + "logps/rejected": -468.30091145833336, + "loss": 0.2329, + "rewards/chosen": 0.6395805583280676, + "rewards/margins": 7.008909375059838, + "rewards/rejected": -6.3693288167317705, + "step": 65 + }, + { + "epoch": 0.024364357897651238, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.997835332062362e-06, + "logits/chosen": 1805828232.5333333, + "logits/rejected": 1742596698.3529413, + "logps/chosen": -236.96793619791666, + "logps/rejected": -455.2342888327206, + "loss": 0.2032, + "rewards/chosen": 0.6867312113444011, + "rewards/margins": 5.979020496443207, + "rewards/rejected": -5.292289285098805, + "step": 66 + }, + { + "epoch": 0.02473351483549444, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.997658708945173e-06, + "logits/chosen": 1767146973.8666666, + "logits/rejected": 1543355331.764706, + "logps/chosen": -280.15100911458336, + "logps/rejected": -375.7935431985294, + "loss": 0.1851, + "rewards/chosen": 1.3260906219482422, + "rewards/margins": 5.505223868874943, + "rewards/rejected": -4.179133246926701, + "step": 67 + }, + { + "epoch": 0.02510267177333764, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.997475161635339e-06, + "logits/chosen": 2129411449.2631578, + "logits/rejected": 1924609575.3846154, + "logps/chosen": -314.2047697368421, + "logps/rejected": -495.8671123798077, + "loss": 0.2577, + "rewards/chosen": 0.7498713041606703, + "rewards/margins": 6.013877698767041, + "rewards/rejected": -5.26400639460637, + "step": 68 + }, + { + "epoch": 0.025471828711180842, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.99728469038716e-06, + "logits/chosen": 1612928068.2666667, + "logits/rejected": 1459199638.5882354, + "logps/chosen": -279.89801432291665, + "logps/rejected": -508.3785615808824, + "loss": 0.1614, + "rewards/chosen": 1.44455935160319, + "rewards/margins": 6.4668663548488245, + "rewards/rejected": -5.0223070032456345, + "step": 69 + }, + { + "epoch": 0.02584098564902404, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.99708729546453e-06, + "logits/chosen": 1801447544.4705882, + "logits/rejected": 1763752209.0666666, + "logps/chosen": -230.23594037224265, + "logps/rejected": -364.57679036458336, + "loss": 0.182, + "rewards/chosen": 1.579604653751149, + "rewards/margins": 5.4770597420486755, + "rewards/rejected": -3.897455088297526, + "step": 70 + }, + { + "epoch": 0.02621014258686724, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.996882977140942e-06, + "logits/chosen": 2398131200.0, + "logits/rejected": 1937977600.0, + "logps/chosen": -227.99656677246094, + "logps/rejected": -473.57281494140625, + "loss": 0.1918, + "rewards/chosen": 1.0035066604614258, + "rewards/margins": 6.597690582275391, + "rewards/rejected": -5.594183921813965, + "step": 71 + }, + { + "epoch": 0.026579299524710442, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.996671735699473e-06, + "logits/chosen": 2165318314.6666665, + "logits/rejected": 1767100708.5714285, + "logps/chosen": -236.366455078125, + "logps/rejected": -349.53194754464283, + "loss": 0.2196, + "rewards/chosen": 1.3846684561835394, + "rewards/margins": 5.1746383091760055, + "rewards/rejected": -3.7899698529924666, + "step": 72 + }, + { + "epoch": 0.026948456462553644, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.996453571432797e-06, + "logits/chosen": 1467223186.2857144, + "logits/rejected": 1655437425.7777777, + "logps/chosen": -301.57247488839283, + "logps/rejected": -497.4978841145833, + "loss": 0.196, + "rewards/chosen": 0.5845005171639579, + "rewards/margins": 6.067470013149201, + "rewards/rejected": -5.482969495985243, + "step": 73 + }, + { + "epoch": 0.027317613400396845, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 9.996228484643176e-06, + "logits/chosen": 2630569984.0, + "logits/rejected": 1748253459.6923077, + "logps/chosen": -333.8983604029605, + "logps/rejected": -617.9084660456731, + "loss": 0.265, + "rewards/chosen": 0.42280814522191096, + "rewards/margins": 6.57642835161464, + "rewards/rejected": -6.153620206392729, + "step": 74 + }, + { + "epoch": 0.027686770338240043, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.995996475642466e-06, + "logits/chosen": 1252732928.0, + "logits/rejected": 1389756416.0, + "logps/chosen": -229.048583984375, + "logps/rejected": -433.85942925347223, + "loss": 0.1986, + "rewards/chosen": 0.9273087637765067, + "rewards/margins": 5.176066444033668, + "rewards/rejected": -4.248757680257161, + "step": 75 + }, + { + "epoch": 0.028055927276083244, + "grad_norm": 10.6875, + "kl": 2.4262642860412598, + "learning_rate": 9.995757544752114e-06, + "logits/chosen": 1759469158.4, + "logits/rejected": 2334256911.0588236, + "logps/chosen": -276.42845052083334, + "logps/rejected": -583.1529181985294, + "loss": 0.1751, + "rewards/chosen": 1.7980982462565105, + "rewards/margins": 8.691719354367724, + "rewards/rejected": -6.893621108111213, + "step": 76 + }, + { + "epoch": 0.028425084213926446, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.995511692303153e-06, + "logits/chosen": 1861607303.5294118, + "logits/rejected": 1953125853.8666666, + "logps/chosen": -343.29859834558823, + "logps/rejected": -479.34270833333335, + "loss": 0.2532, + "rewards/chosen": 0.3977852709153119, + "rewards/margins": 5.109022860433542, + "rewards/rejected": -4.7112375895182295, + "step": 77 + }, + { + "epoch": 0.028794241151769647, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.995258918636209e-06, + "logits/chosen": 1292165963.2941177, + "logits/rejected": 1305069704.5333333, + "logps/chosen": -263.4663947610294, + "logps/rejected": -461.77392578125, + "loss": 0.1798, + "rewards/chosen": 1.2859969419591568, + "rewards/margins": 7.121590610578949, + "rewards/rejected": -5.835593668619792, + "step": 78 + }, + { + "epoch": 0.029163398089612845, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.994999224101498e-06, + "logits/chosen": 2465983536.7619047, + "logits/rejected": 1553448587.6363637, + "logps/chosen": -321.2909691220238, + "logps/rejected": -356.27543501420456, + "loss": 0.2672, + "rewards/chosen": 0.5891254515874953, + "rewards/margins": 5.790280680635791, + "rewards/rejected": -5.201155229048296, + "step": 79 + }, + { + "epoch": 0.029532555027456046, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.994732609058824e-06, + "logits/chosen": 1922855680.0, + "logits/rejected": 1678234112.0, + "logps/chosen": -312.1918029785156, + "logps/rejected": -335.2796630859375, + "loss": 0.2394, + "rewards/chosen": 0.39574065804481506, + "rewards/margins": 4.852455765008926, + "rewards/rejected": -4.456715106964111, + "step": 80 + }, + { + "epoch": 0.029901711965299248, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.994459073877577e-06, + "logits/chosen": 1455374043.4285715, + "logits/rejected": 1726403470.2222223, + "logps/chosen": -254.13539341517858, + "logps/rejected": -398.309814453125, + "loss": 0.2246, + "rewards/chosen": 0.7208952222551618, + "rewards/margins": 5.393262318202427, + "rewards/rejected": -4.672367095947266, + "step": 81 + }, + { + "epoch": 0.03027086890314245, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.994178618936736e-06, + "logits/chosen": 1985200990.3157895, + "logits/rejected": 2359929462.1538463, + "logps/chosen": -319.4258583470395, + "logps/rejected": -365.87804236778845, + "loss": 0.2709, + "rewards/chosen": 0.6256721396195261, + "rewards/margins": 5.31967443782791, + "rewards/rejected": -4.6940022982083836, + "step": 82 + }, + { + "epoch": 0.03064002584098565, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.99389124462487e-06, + "logits/chosen": 2027066274.909091, + "logits/rejected": 2257553700.571429, + "logps/chosen": -230.66288618607953, + "logps/rejected": -506.45814732142856, + "loss": 0.2274, + "rewards/chosen": -0.09618000550703569, + "rewards/margins": 5.4014923025519295, + "rewards/rejected": -5.497672308058966, + "step": 83 + }, + { + "epoch": 0.03100918277882885, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.99359695134013e-06, + "logits/chosen": 1777332087.4666667, + "logits/rejected": 2275405824.0, + "logps/chosen": -311.42467447916664, + "logps/rejected": -558.0045955882352, + "loss": 0.2086, + "rewards/chosen": 0.5744876861572266, + "rewards/margins": 6.198888105504653, + "rewards/rejected": -5.624400419347427, + "step": 84 + }, + { + "epoch": 0.03137833971667205, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.993295739490259e-06, + "logits/chosen": 1831358610.2857144, + "logits/rejected": 2195003164.4444447, + "logps/chosen": -238.24769810267858, + "logps/rejected": -477.46875, + "loss": 0.1829, + "rewards/chosen": 0.7761190959385463, + "rewards/margins": 6.587837082999093, + "rewards/rejected": -5.811717987060547, + "step": 85 + }, + { + "epoch": 0.03174749665451525, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 9.992987609492578e-06, + "logits/chosen": 2042234752.0, + "logits/rejected": 2022515072.0, + "logps/chosen": -362.3477478027344, + "logps/rejected": -459.9891357421875, + "loss": 0.241, + "rewards/chosen": 0.5852454900741577, + "rewards/margins": 5.293747305870056, + "rewards/rejected": -4.708501815795898, + "step": 86 + }, + { + "epoch": 0.03211665359235845, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.992672561774001e-06, + "logits/chosen": 1858031616.0, + "logits/rejected": 2070205147.4285715, + "logps/chosen": -252.31770833333334, + "logps/rejected": -453.39222935267856, + "loss": 0.2913, + "rewards/chosen": 0.04599475860595703, + "rewards/margins": 5.78010926927839, + "rewards/rejected": -5.734114510672433, + "step": 87 + }, + { + "epoch": 0.03248581053020165, + "grad_norm": 13.0625, + "kl": 0.045295000076293945, + "learning_rate": 9.99235059677102e-06, + "logits/chosen": 2844576861.090909, + "logits/rejected": 2324120722.285714, + "logps/chosen": -318.09841086647725, + "logps/rejected": -529.4165736607143, + "loss": 0.1681, + "rewards/chosen": 0.7012535442005504, + "rewards/margins": 6.182985140647723, + "rewards/rejected": -5.4817315964471724, + "step": 88 + }, + { + "epoch": 0.03285496746804485, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.992021714929714e-06, + "logits/chosen": 1851180800.0, + "logits/rejected": 1355009536.0, + "logps/chosen": -294.095947265625, + "logps/rejected": -489.3984069824219, + "loss": 0.2363, + "rewards/chosen": 0.6235851049423218, + "rewards/margins": 5.485925078392029, + "rewards/rejected": -4.862339973449707, + "step": 89 + }, + { + "epoch": 0.03322412440588805, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.991685916705748e-06, + "logits/chosen": 2079479020.3076923, + "logits/rejected": 1698660783.1578948, + "logps/chosen": -352.05911959134613, + "logps/rejected": -490.8021175986842, + "loss": 0.185, + "rewards/chosen": 0.6049078061030462, + "rewards/margins": 7.19097018724511, + "rewards/rejected": -6.586062381142064, + "step": 90 + }, + { + "epoch": 0.033593281343731254, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.991343202564358e-06, + "logits/chosen": 1390758518.1538463, + "logits/rejected": 2231418233.263158, + "logps/chosen": -308.30014272836536, + "logps/rejected": -411.1084755345395, + "loss": 0.1712, + "rewards/chosen": 0.9770946502685547, + "rewards/margins": 5.507719441464073, + "rewards/rejected": -4.530624791195518, + "step": 91 + }, + { + "epoch": 0.033962438281574456, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.99099357298038e-06, + "logits/chosen": 2862299721.142857, + "logits/rejected": 1536312888.8888888, + "logps/chosen": -252.69503348214286, + "logps/rejected": -442.3046061197917, + "loss": 0.1883, + "rewards/chosen": 0.718670300074986, + "rewards/margins": 6.826822477673727, + "rewards/rejected": -6.1081521775987415, + "step": 92 + }, + { + "epoch": 0.03433159521941766, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.990637028438213e-06, + "logits/chosen": 2640026112.0, + "logits/rejected": 2393911808.0, + "logps/chosen": -349.05731201171875, + "logps/rejected": -311.5893249511719, + "loss": 0.2436, + "rewards/chosen": 0.2633194029331207, + "rewards/margins": 5.2781175673007965, + "rewards/rejected": -5.014798164367676, + "step": 93 + }, + { + "epoch": 0.03470075215726086, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.99027356943185e-06, + "logits/chosen": 2197273856.0, + "logits/rejected": 2080851072.0, + "logps/chosen": -329.997802734375, + "logps/rejected": -505.42755126953125, + "loss": 0.2367, + "rewards/chosen": 0.7300698757171631, + "rewards/margins": 4.740393400192261, + "rewards/rejected": -4.010323524475098, + "step": 94 + }, + { + "epoch": 0.03506990909510405, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.989903196464858e-06, + "logits/chosen": 1441355776.0, + "logits/rejected": 1445555260.235294, + "logps/chosen": -252.445556640625, + "logps/rejected": -368.59607651654414, + "loss": 0.2342, + "rewards/chosen": 0.3057329813639323, + "rewards/margins": 6.0151827494303385, + "rewards/rejected": -5.709449768066406, + "step": 95 + }, + { + "epoch": 0.035439066032947254, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.989525910050382e-06, + "logits/chosen": 2195819264.0, + "logits/rejected": 2371022336.0, + "logps/chosen": -253.74082946777344, + "logps/rejected": -425.7666931152344, + "loss": 0.2066, + "rewards/chosen": 0.9258164763450623, + "rewards/margins": 5.9472731947898865, + "rewards/rejected": -5.021456718444824, + "step": 96 + }, + { + "epoch": 0.035808222970790456, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.989141710711149e-06, + "logits/chosen": 2114243697.7777777, + "logits/rejected": 2152229449.142857, + "logps/chosen": -242.63430447048611, + "logps/rejected": -409.2127162388393, + "loss": 0.2407, + "rewards/chosen": 0.9091711044311523, + "rewards/margins": 6.099010603768485, + "rewards/rejected": -5.1898394993373325, + "step": 97 + }, + { + "epoch": 0.03617737990863366, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.988750598979464e-06, + "logits/chosen": 1479391232.0, + "logits/rejected": 1785386188.8, + "logps/chosen": -227.9235636393229, + "logps/rejected": -475.828369140625, + "loss": 0.2022, + "rewards/chosen": 0.6630279223124186, + "rewards/margins": 5.525640074412029, + "rewards/rejected": -4.86261215209961, + "step": 98 + }, + { + "epoch": 0.03654653684647686, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.988352575397204e-06, + "logits/chosen": 1519752192.0, + "logits/rejected": 1145407658.6666667, + "logps/chosen": -227.961572265625, + "logps/rejected": -361.073974609375, + "loss": 0.2889, + "rewards/chosen": 0.8448988914489746, + "rewards/margins": 5.649965635935466, + "rewards/rejected": -4.805066744486491, + "step": 99 + }, + { + "epoch": 0.03691569378432006, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.987947640515827e-06, + "logits/chosen": 1711863053.4736843, + "logits/rejected": 1618657280.0, + "logps/chosen": -296.8956877055921, + "logps/rejected": -406.9450871394231, + "loss": 0.2558, + "rewards/chosen": 0.6898695293225741, + "rewards/margins": 5.727167291679845, + "rewards/rejected": -5.037297762357271, + "step": 100 + }, + { + "epoch": 0.03728485072216326, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.987535794896366e-06, + "logits/chosen": 1989321614.2222223, + "logits/rejected": 2097976173.7142856, + "logps/chosen": -317.0855305989583, + "logps/rejected": -452.925537109375, + "loss": 0.2217, + "rewards/chosen": 0.755950927734375, + "rewards/margins": 5.925982339041574, + "rewards/rejected": -5.170031411307199, + "step": 101 + }, + { + "epoch": 0.03765400766000646, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.987117039109427e-06, + "logits/chosen": 2183288832.0, + "logits/rejected": 2276737024.0, + "logps/chosen": -403.1256103515625, + "logps/rejected": -362.5157775878906, + "loss": 0.2504, + "rewards/chosen": 0.5024189352989197, + "rewards/margins": 5.7441834807395935, + "rewards/rejected": -5.241764545440674, + "step": 102 + }, + { + "epoch": 0.038023164597849664, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.986691373735191e-06, + "logits/chosen": 1648751796.7058823, + "logits/rejected": 1363656430.9333334, + "logps/chosen": -224.86160098805146, + "logps/rejected": -446.74371744791665, + "loss": 0.2626, + "rewards/chosen": 0.37699444153729605, + "rewards/margins": 5.768255224414901, + "rewards/rejected": -5.3912607828776045, + "step": 103 + }, + { + "epoch": 0.03839232153569286, + "grad_norm": 18.0, + "kl": 0.12965011596679688, + "learning_rate": 9.986258799363412e-06, + "logits/chosen": 1755156870.0952382, + "logits/rejected": 2008543976.7272727, + "logps/chosen": -313.30245535714283, + "logps/rejected": -560.2453835227273, + "loss": 0.3148, + "rewards/chosen": 0.4358068193708147, + "rewards/margins": 7.7045175626680455, + "rewards/rejected": -7.26871074329723, + "step": 104 + }, + { + "epoch": 0.03876147847353606, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.985819316593416e-06, + "logits/chosen": 2025366186.6666667, + "logits/rejected": 1718867236.5714285, + "logps/chosen": -294.03716362847223, + "logps/rejected": -334.9393833705357, + "loss": 0.1871, + "rewards/chosen": 1.2875531514485676, + "rewards/margins": 6.514278048560732, + "rewards/rejected": -5.226724897112165, + "step": 105 + }, + { + "epoch": 0.03913063541137926, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.9853729260341e-06, + "logits/chosen": 1558354261.3333333, + "logits/rejected": 1251716505.6, + "logps/chosen": -367.9998372395833, + "logps/rejected": -378.8102294921875, + "loss": 0.183, + "rewards/chosen": 0.6617204745610555, + "rewards/margins": 5.560971268018086, + "rewards/rejected": -4.899250793457031, + "step": 106 + }, + { + "epoch": 0.03949979234922246, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.984919628303934e-06, + "logits/chosen": 1908911786.6666667, + "logits/rejected": 2275983155.2, + "logps/chosen": -304.4996337890625, + "logps/rejected": -526.97568359375, + "loss": 0.1993, + "rewards/chosen": 0.5054636001586914, + "rewards/margins": 7.350333595275879, + "rewards/rejected": -6.844869995117188, + "step": 107 + }, + { + "epoch": 0.039868949287065664, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 9.984459424030958e-06, + "logits/chosen": 1900433817.6, + "logits/rejected": 1773175125.3333333, + "logps/chosen": -347.9761962890625, + "logps/rejected": -506.8036295572917, + "loss": 0.2913, + "rewards/chosen": 0.3913978815078735, + "rewards/margins": 6.795141625404358, + "rewards/rejected": -6.403743743896484, + "step": 108 + }, + { + "epoch": 0.040238106224908865, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.983992313852776e-06, + "logits/chosen": 1554206935.5789473, + "logits/rejected": 2160550675.6923075, + "logps/chosen": -282.8433388157895, + "logps/rejected": -630.1656400240385, + "loss": 0.2778, + "rewards/chosen": 0.6889306118613795, + "rewards/margins": 8.18255595929227, + "rewards/rejected": -7.493625347430889, + "step": 109 + }, + { + "epoch": 0.040607263162752066, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.983518298416564e-06, + "logits/chosen": 1846246570.6666667, + "logits/rejected": 2043037081.6, + "logps/chosen": -342.9548746744792, + "logps/rejected": -384.856884765625, + "loss": 0.1835, + "rewards/chosen": 0.5253825982411703, + "rewards/margins": 5.616027816136678, + "rewards/rejected": -5.090645217895508, + "step": 110 + }, + { + "epoch": 0.04097642010059527, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.983037378379064e-06, + "logits/chosen": 1713726805.3333333, + "logits/rejected": 2360315494.4, + "logps/chosen": -226.1729939778646, + "logps/rejected": -502.5017578125, + "loss": 0.1747, + "rewards/chosen": 1.201509157816569, + "rewards/margins": 6.545764605204265, + "rewards/rejected": -5.344255447387695, + "step": 111 + }, + { + "epoch": 0.04134557703843847, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.982549554406585e-06, + "logits/chosen": 2131049050.3529413, + "logits/rejected": 1641782476.8, + "logps/chosen": -328.17038143382354, + "logps/rejected": -680.3260416666667, + "loss": 0.2105, + "rewards/chosen": 1.273947323069853, + "rewards/margins": 8.76672228644876, + "rewards/rejected": -7.492774963378906, + "step": 112 + }, + { + "epoch": 0.041714733976281664, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.982054827175e-06, + "logits/chosen": 1623167522.1333334, + "logits/rejected": 2323720312.470588, + "logps/chosen": -246.65091145833333, + "logps/rejected": -316.68014705882354, + "loss": 0.216, + "rewards/chosen": 0.693026606241862, + "rewards/margins": 5.106558638927983, + "rewards/rejected": -4.413532032686121, + "step": 113 + }, + { + "epoch": 0.042083890914124865, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.981553197369752e-06, + "logits/chosen": 1396239667.2, + "logits/rejected": 2098433194.6666667, + "logps/chosen": -263.24794921875, + "logps/rejected": -456.585693359375, + "loss": 0.2394, + "rewards/chosen": 1.0839725494384767, + "rewards/margins": 8.15456288655599, + "rewards/rejected": -7.070590337117513, + "step": 114 + }, + { + "epoch": 0.042453047851968066, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.981044665685834e-06, + "logits/chosen": 2574838945.6842103, + "logits/rejected": 2421310070.1538463, + "logps/chosen": -282.8310032894737, + "logps/rejected": -489.0692608173077, + "loss": 0.2382, + "rewards/chosen": 0.7279243469238281, + "rewards/margins": 7.7287577115572414, + "rewards/rejected": -7.000833364633413, + "step": 115 + }, + { + "epoch": 0.04282220478981127, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.980529232827819e-06, + "logits/chosen": 2209159529.4117646, + "logits/rejected": 2453668386.133333, + "logps/chosen": -337.8021024816176, + "logps/rejected": -496.6529947916667, + "loss": 0.2489, + "rewards/chosen": 0.5998899796429802, + "rewards/margins": 7.659581345202876, + "rewards/rejected": -7.059691365559896, + "step": 116 + }, + { + "epoch": 0.04319136172765447, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.980006899509827e-06, + "logits/chosen": 1529360896.0, + "logits/rejected": 1943247104.0, + "logps/chosen": -228.32723999023438, + "logps/rejected": -532.2776489257812, + "loss": 0.2304, + "rewards/chosen": 0.42655372619628906, + "rewards/margins": 8.87204360961914, + "rewards/rejected": -8.445489883422852, + "step": 117 + }, + { + "epoch": 0.04356051866549767, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.979477666455547e-06, + "logits/chosen": 2176278528.0, + "logits/rejected": 1787879796.3636363, + "logps/chosen": -305.897998046875, + "logps/rejected": -459.18825461647725, + "loss": 0.1549, + "rewards/chosen": 1.167719554901123, + "rewards/margins": 6.292212893746116, + "rewards/rejected": -5.124493338844993, + "step": 118 + }, + { + "epoch": 0.04392967560334087, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.978941534398224e-06, + "logits/chosen": 1797277559.4666667, + "logits/rejected": 1434951198.1176472, + "logps/chosen": -277.3956705729167, + "logps/rejected": -396.1455939797794, + "loss": 0.2444, + "rewards/chosen": 0.7151554743448894, + "rewards/margins": 4.542776934305827, + "rewards/rejected": -3.8276214599609375, + "step": 119 + }, + { + "epoch": 0.04429883254118407, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.978398504080661e-06, + "logits/chosen": 2059608064.0, + "logits/rejected": 1970344072.5333333, + "logps/chosen": -335.3508731617647, + "logps/rejected": -447.71790364583336, + "loss": 0.2586, + "rewards/chosen": 0.3449854289784151, + "rewards/margins": 6.147816951602112, + "rewards/rejected": -5.8028315226236975, + "step": 120 + }, + { + "epoch": 0.044667989479027274, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.97784857625522e-06, + "logits/chosen": 1657897062.4, + "logits/rejected": 1288226389.3333333, + "logps/chosen": -281.0583984375, + "logps/rejected": -521.1830240885416, + "loss": 0.2349, + "rewards/chosen": 0.9630319595336914, + "rewards/margins": 6.114233207702637, + "rewards/rejected": -5.151201248168945, + "step": 121 + }, + { + "epoch": 0.04503714641687047, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.977291751683821e-06, + "logits/chosen": 1902853059.764706, + "logits/rejected": 1981701188.2666667, + "logps/chosen": -260.3522518382353, + "logps/rejected": -531.2846354166667, + "loss": 0.1971, + "rewards/chosen": 1.098404603845933, + "rewards/margins": 8.396848207361558, + "rewards/rejected": -7.298443603515625, + "step": 122 + }, + { + "epoch": 0.04540630335471367, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.976728031137936e-06, + "logits/chosen": 1758258107.7333333, + "logits/rejected": 1697665385.4117646, + "logps/chosen": -299.39443359375, + "logps/rejected": -451.6884765625, + "loss": 0.2204, + "rewards/chosen": 0.6505784352620443, + "rewards/margins": 6.2329601886225685, + "rewards/rejected": -5.582381753360524, + "step": 123 + }, + { + "epoch": 0.04577546029255687, + "grad_norm": 15.125, + "kl": 0.4802436828613281, + "learning_rate": 9.976157415398591e-06, + "logits/chosen": 1585688791.5789473, + "logits/rejected": 1639487645.5384614, + "logps/chosen": -292.54111842105266, + "logps/rejected": -412.7202899639423, + "loss": 0.2459, + "rewards/chosen": 0.9496644672594572, + "rewards/margins": 7.109580097893471, + "rewards/rejected": -6.159915630634014, + "step": 124 + }, + { + "epoch": 0.04614461723040007, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.97557990525637e-06, + "logits/chosen": 2208791066.9473686, + "logits/rejected": 1935981016.6153846, + "logps/chosen": -258.21718236019734, + "logps/rejected": -463.4070012019231, + "loss": 0.2218, + "rewards/chosen": 0.8023242448505602, + "rewards/margins": 7.352701723816906, + "rewards/rejected": -6.550377478966346, + "step": 125 + }, + { + "epoch": 0.046513774168243274, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.974995501511404e-06, + "logits/chosen": 1565059218.2857144, + "logits/rejected": 1764496042.6666667, + "logps/chosen": -235.54947335379464, + "logps/rejected": -480.8943142361111, + "loss": 0.1247, + "rewards/chosen": 1.7268641335623605, + "rewards/margins": 8.040589075239877, + "rewards/rejected": -6.313724941677517, + "step": 126 + }, + { + "epoch": 0.046882931106086476, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.974404204973376e-06, + "logits/chosen": 1453867349.3333333, + "logits/rejected": 1547031405.7142856, + "logps/chosen": -247.05194769965277, + "logps/rejected": -496.84354073660717, + "loss": 0.2602, + "rewards/chosen": 0.30126484235127765, + "rewards/margins": 7.139560915174938, + "rewards/rejected": -6.838296072823661, + "step": 127 + }, + { + "epoch": 0.04725208804392968, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.973806016461522e-06, + "logits/chosen": 1617054019.368421, + "logits/rejected": 1849859465.8461537, + "logps/chosen": -307.8926552220395, + "logps/rejected": -358.07504507211536, + "loss": 0.2861, + "rewards/chosen": 0.4247779846191406, + "rewards/margins": 4.838160588191106, + "rewards/rejected": -4.413382603571965, + "step": 128 + }, + { + "epoch": 0.04762124498177288, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.973200936804624e-06, + "logits/chosen": 1714075648.0, + "logits/rejected": 1633756598.857143, + "logps/chosen": -297.0863986545139, + "logps/rejected": -408.29282924107144, + "loss": 0.2623, + "rewards/chosen": 0.28452467918395996, + "rewards/margins": 6.9117099557604105, + "rewards/rejected": -6.6271852765764505, + "step": 129 + }, + { + "epoch": 0.04799040191961608, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.972588966841013e-06, + "logits/chosen": 2087618921.4117646, + "logits/rejected": 2081517431.4666667, + "logps/chosen": -365.6372644761029, + "logps/rejected": -327.20940755208335, + "loss": 0.2334, + "rewards/chosen": 0.5694898717543658, + "rewards/margins": 5.698076704436657, + "rewards/rejected": -5.128586832682291, + "step": 130 + }, + { + "epoch": 0.048359558857459274, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.971970107418562e-06, + "logits/chosen": 1364215808.0, + "logits/rejected": 1746171264.0, + "logps/chosen": -304.3335876464844, + "logps/rejected": -462.2740783691406, + "loss": 0.1771, + "rewards/chosen": 1.2956247329711914, + "rewards/margins": 7.219841957092285, + "rewards/rejected": -5.924217224121094, + "step": 131 + }, + { + "epoch": 0.048728715795302475, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.971344359394696e-06, + "logits/chosen": 1409807488.0, + "logits/rejected": 1437029376.0, + "logps/chosen": -252.76441955566406, + "logps/rejected": -393.7583821614583, + "loss": 0.1415, + "rewards/chosen": 0.2677656412124634, + "rewards/margins": 5.021211584409078, + "rewards/rejected": -4.753445943196614, + "step": 132 + }, + { + "epoch": 0.04909787273314568, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.970711723636382e-06, + "logits/chosen": 1882551680.0, + "logits/rejected": 2164357632.0, + "logps/chosen": -285.0903015136719, + "logps/rejected": -545.5709228515625, + "loss": 0.1789, + "rewards/chosen": 0.8549647331237793, + "rewards/margins": 7.554940223693848, + "rewards/rejected": -6.699975490570068, + "step": 133 + }, + { + "epoch": 0.04946702967098888, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.970072201020127e-06, + "logits/chosen": 2046003200.0, + "logits/rejected": 2017410951.5294118, + "logps/chosen": -340.6033203125, + "logps/rejected": -371.1426355698529, + "loss": 0.2222, + "rewards/chosen": 0.304661496480306, + "rewards/margins": 5.604309108210545, + "rewards/rejected": -5.299647611730239, + "step": 134 + }, + { + "epoch": 0.04983618660883208, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.969425792431982e-06, + "logits/chosen": 1472845004.8, + "logits/rejected": 1534854609.4545455, + "logps/chosen": -242.8343994140625, + "logps/rejected": -487.8655894886364, + "loss": 0.1384, + "rewards/chosen": 0.5666323184967041, + "rewards/margins": 7.705569141561335, + "rewards/rejected": -7.138936823064631, + "step": 135 + }, + { + "epoch": 0.05020534354667528, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 9.968772498767537e-06, + "logits/chosen": 2157815398.4, + "logits/rejected": 2144619178.6666667, + "logps/chosen": -264.1074462890625, + "logps/rejected": -421.3930257161458, + "loss": 0.2877, + "rewards/chosen": 0.45233545303344724, + "rewards/margins": 6.125611082712809, + "rewards/rejected": -5.673275629679362, + "step": 136 + }, + { + "epoch": 0.05057450048451848, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.96811232093192e-06, + "logits/chosen": 1840451993.6, + "logits/rejected": 2367653767.529412, + "logps/chosen": -271.489404296875, + "logps/rejected": -547.3714958639706, + "loss": 0.2329, + "rewards/chosen": 0.22568483352661134, + "rewards/margins": 7.247097402460435, + "rewards/rejected": -7.021412568933823, + "step": 137 + }, + { + "epoch": 0.050943657422361684, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.967445259839805e-06, + "logits/chosen": 1211621760.0, + "logits/rejected": 1779529728.0, + "logps/chosen": -287.6078796386719, + "logps/rejected": -522.9850463867188, + "loss": 0.1964, + "rewards/chosen": 1.0404701232910156, + "rewards/margins": 6.724546432495117, + "rewards/rejected": -5.684076309204102, + "step": 138 + }, + { + "epoch": 0.051312814360204885, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.966771316415391e-06, + "logits/chosen": 1176933429.8947368, + "logits/rejected": 1345533479.3846154, + "logps/chosen": -273.7075709292763, + "logps/rejected": -454.73399939903845, + "loss": 0.2286, + "rewards/chosen": 0.9516277313232422, + "rewards/margins": 7.460347542395959, + "rewards/rejected": -6.508719811072717, + "step": 139 + }, + { + "epoch": 0.05168197129804808, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.966090491592422e-06, + "logits/chosen": 1860881588.7058823, + "logits/rejected": 1663990988.8, + "logps/chosen": -195.26230755974265, + "logps/rejected": -377.9548828125, + "loss": 0.1918, + "rewards/chosen": 1.3310065549962662, + "rewards/margins": 6.553645511701995, + "rewards/rejected": -5.222638956705729, + "step": 140 + }, + { + "epoch": 0.05205112823589128, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.96540278631417e-06, + "logits/chosen": 1410664174.9333334, + "logits/rejected": 2254645850.352941, + "logps/chosen": -325.3301106770833, + "logps/rejected": -404.6810087316176, + "loss": 0.1979, + "rewards/chosen": 1.0128177642822265, + "rewards/margins": 6.171277147180893, + "rewards/rejected": -5.158459382898667, + "step": 141 + }, + { + "epoch": 0.05242028517373448, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.964708201533441e-06, + "logits/chosen": 1835615800.8888888, + "logits/rejected": 1733942272.0, + "logps/chosen": -299.5881618923611, + "logps/rejected": -423.69688197544644, + "loss": 0.228, + "rewards/chosen": 0.9985835817125108, + "rewards/margins": 6.859615250239297, + "rewards/rejected": -5.861031668526786, + "step": 142 + }, + { + "epoch": 0.052789442111577684, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.964006738212574e-06, + "logits/chosen": 1723572224.0, + "logits/rejected": 1716936817.7777777, + "logps/chosen": -316.1324986049107, + "logps/rejected": -445.02039930555554, + "loss": 0.1667, + "rewards/chosen": 0.8416002137320382, + "rewards/margins": 6.770903277018713, + "rewards/rejected": -5.9293030632866754, + "step": 143 + }, + { + "epoch": 0.053158599049420885, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.963298397323443e-06, + "logits/chosen": 1438611828.3636363, + "logits/rejected": 1188689432.3809524, + "logps/chosen": -272.90640536221593, + "logps/rejected": -352.6194661458333, + "loss": 0.1765, + "rewards/chosen": 0.30639527060768823, + "rewards/margins": 5.695366508516915, + "rewards/rejected": -5.388971237909226, + "step": 144 + }, + { + "epoch": 0.053527755987264086, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.962583179847436e-06, + "logits/chosen": 1521943473.2307692, + "logits/rejected": 1792270443.7894738, + "logps/chosen": -295.5730731670673, + "logps/rejected": -548.1491570723684, + "loss": 0.2084, + "rewards/chosen": 0.18294525146484375, + "rewards/margins": 8.166454515959087, + "rewards/rejected": -7.983509264494243, + "step": 145 + }, + { + "epoch": 0.05389691292510729, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.961861086775483e-06, + "logits/chosen": 2049235787.2941177, + "logits/rejected": 2662717849.6, + "logps/chosen": -279.9469784007353, + "logps/rejected": -405.4477864583333, + "loss": 0.2471, + "rewards/chosen": 0.6134995853199678, + "rewards/margins": 7.04523837519627, + "rewards/rejected": -6.431738789876302, + "step": 146 + }, + { + "epoch": 0.05426606986295049, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.961132119108036e-06, + "logits/chosen": 1196804949.3333333, + "logits/rejected": 1285277081.6, + "logps/chosen": -300.0386555989583, + "logps/rejected": -423.7908203125, + "loss": 0.1142, + "rewards/chosen": 1.710240364074707, + "rewards/margins": 7.602990531921387, + "rewards/rejected": -5.89275016784668, + "step": 147 + }, + { + "epoch": 0.05463522680079369, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.960396277855067e-06, + "logits/chosen": 2119628920.4705882, + "logits/rejected": 2497171182.9333334, + "logps/chosen": -256.9636661305147, + "logps/rejected": -425.97486979166666, + "loss": 0.2647, + "rewards/chosen": 0.25106113097246957, + "rewards/margins": 6.095615269156063, + "rewards/rejected": -5.844554138183594, + "step": 148 + }, + { + "epoch": 0.055004383738636885, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.959653564036077e-06, + "logits/chosen": 2687640098.133333, + "logits/rejected": 1640455830.5882354, + "logps/chosen": -258.4703125, + "logps/rejected": -438.32292624080884, + "loss": 0.218, + "rewards/chosen": 0.7572470347086588, + "rewards/margins": 5.740309778849284, + "rewards/rejected": -4.983062744140625, + "step": 149 + }, + { + "epoch": 0.055373540676480086, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.958903978680086e-06, + "logits/chosen": 2401418003.6923075, + "logits/rejected": 1632841943.5789473, + "logps/chosen": -260.27152193509613, + "logps/rejected": -426.1464072779605, + "loss": 0.1984, + "rewards/chosen": 0.3203093455387996, + "rewards/margins": 5.774446244181892, + "rewards/rejected": -5.4541368986430925, + "step": 150 + }, + { + "epoch": 0.05574269761432329, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.958147522825634e-06, + "logits/chosen": 1990153637.6470587, + "logits/rejected": 1811087086.9333334, + "logps/chosen": -309.74445657169116, + "logps/rejected": -384.76666666666665, + "loss": 0.2156, + "rewards/chosen": 0.7645788753733915, + "rewards/margins": 5.694301470588235, + "rewards/rejected": -4.929722595214844, + "step": 151 + }, + { + "epoch": 0.05611185455216649, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.957384197520782e-06, + "logits/chosen": 1929592832.0, + "logits/rejected": 1732439722.6666667, + "logps/chosen": -286.061572265625, + "logps/rejected": -467.577392578125, + "loss": 0.263, + "rewards/chosen": 0.690336275100708, + "rewards/margins": 7.056879091262817, + "rewards/rejected": -6.366542816162109, + "step": 152 + }, + { + "epoch": 0.05648101149000969, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.956614003823107e-06, + "logits/chosen": 2206007296.0, + "logits/rejected": 1737327104.0, + "logps/chosen": -335.0188293457031, + "logps/rejected": -432.6441345214844, + "loss": 0.2165, + "rewards/chosen": 0.5905541777610779, + "rewards/margins": 7.540182054042816, + "rewards/rejected": -6.949627876281738, + "step": 153 + }, + { + "epoch": 0.05685016842785289, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.955836942799704e-06, + "logits/chosen": 1556232055.4666667, + "logits/rejected": 1265479318.5882354, + "logps/chosen": -239.9814453125, + "logps/rejected": -373.8742244944853, + "loss": 0.1936, + "rewards/chosen": 1.0524824778238933, + "rewards/margins": 5.694553472481521, + "rewards/rejected": -4.642070994657629, + "step": 154 + }, + { + "epoch": 0.05721932536569609, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.955053015527178e-06, + "logits/chosen": 1438291558.4, + "logits/rejected": 1913495913.4117646, + "logps/chosen": -264.23564453125, + "logps/rejected": -581.5668658088235, + "loss": 0.1318, + "rewards/chosen": 1.6814014434814453, + "rewards/margins": 8.076269284416647, + "rewards/rejected": -6.394867840935202, + "step": 155 + }, + { + "epoch": 0.057588482303539294, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.954262223091654e-06, + "logits/chosen": 1976908458.6666667, + "logits/rejected": 1801096285.090909, + "logps/chosen": -268.6873837425595, + "logps/rejected": -384.91725852272725, + "loss": 0.2974, + "rewards/chosen": 0.6272712889171782, + "rewards/margins": 5.663436352948606, + "rewards/rejected": -5.036165064031428, + "step": 156 + }, + { + "epoch": 0.057957639241382496, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.953464566588762e-06, + "logits/chosen": 2200508602.181818, + "logits/rejected": 1757060339.8095238, + "logps/chosen": -276.67189719460225, + "logps/rejected": -454.5063709077381, + "loss": 0.1173, + "rewards/chosen": 1.2091949636285955, + "rewards/margins": 8.021958722696676, + "rewards/rejected": -6.812763759068081, + "step": 157 + }, + { + "epoch": 0.05832679617922569, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.952660047123647e-06, + "logits/chosen": 1398910520.8888888, + "logits/rejected": 1652572013.7142856, + "logps/chosen": -277.05620659722223, + "logps/rejected": -458.380859375, + "loss": 0.187, + "rewards/chosen": 1.2973305384318035, + "rewards/margins": 6.729965164547875, + "rewards/rejected": -5.432634626116071, + "step": 158 + }, + { + "epoch": 0.05869595311706889, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.95184866581096e-06, + "logits/chosen": 1755749106.5263157, + "logits/rejected": 964426988.3076923, + "logps/chosen": -292.1627775493421, + "logps/rejected": -317.9696514423077, + "loss": 0.2828, + "rewards/chosen": 0.6497310839201275, + "rewards/margins": 4.934784923970458, + "rewards/rejected": -4.28505384005033, + "step": 159 + }, + { + "epoch": 0.05906511005491209, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.951030423774858e-06, + "logits/chosen": 2644732011.7894735, + "logits/rejected": 1906819859.6923077, + "logps/chosen": -289.92007606907896, + "logps/rejected": -536.0785381610577, + "loss": 0.2074, + "rewards/chosen": 0.7927007173237047, + "rewards/margins": 7.4333603430373465, + "rewards/rejected": -6.640659625713642, + "step": 160 + }, + { + "epoch": 0.059434266992755294, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.950205322149007e-06, + "logits/chosen": 1669616071.1111112, + "logits/rejected": 1819049837.7142856, + "logps/chosen": -280.81385633680554, + "logps/rejected": -491.8168247767857, + "loss": 0.248, + "rewards/chosen": 0.6197273466322157, + "rewards/margins": 7.244415631369939, + "rewards/rejected": -6.624688284737723, + "step": 161 + }, + { + "epoch": 0.059803423930598495, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.949373362076572e-06, + "logits/chosen": 1531173774.2222223, + "logits/rejected": 1973284864.0, + "logps/chosen": -306.318603515625, + "logps/rejected": -511.62479073660717, + "loss": 0.2675, + "rewards/chosen": 0.12365718682607015, + "rewards/margins": 6.651160064197722, + "rewards/rejected": -6.527502877371652, + "step": 162 + }, + { + "epoch": 0.0601725808684417, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.948534544710228e-06, + "logits/chosen": 2057528413.090909, + "logits/rejected": 1475775244.1904762, + "logps/chosen": -353.39450905539775, + "logps/rejected": -348.2158668154762, + "loss": 0.1606, + "rewards/chosen": 1.2516573125665837, + "rewards/margins": 6.118905121113831, + "rewards/rejected": -4.867247808547247, + "step": 163 + }, + { + "epoch": 0.0605417378062849, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.947688871212142e-06, + "logits/chosen": 1549554145.8823528, + "logits/rejected": 1380027869.8666666, + "logps/chosen": -241.73764935661765, + "logps/rejected": -516.6527994791667, + "loss": 0.1799, + "rewards/chosen": 1.4479791977826286, + "rewards/margins": 9.347383087756587, + "rewards/rejected": -7.899403889973958, + "step": 164 + }, + { + "epoch": 0.0609108947441281, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.946836342753982e-06, + "logits/chosen": 1892361485.4736843, + "logits/rejected": 1943659756.3076923, + "logps/chosen": -302.43007298519734, + "logps/rejected": -517.0065730168269, + "loss": 0.2254, + "rewards/chosen": 1.1371040344238281, + "rewards/margins": 6.716646928053636, + "rewards/rejected": -5.5795428936298075, + "step": 165 + }, + { + "epoch": 0.0612800516819713, + "grad_norm": 46.5, + "kl": 0.0, + "learning_rate": 9.945976960516921e-06, + "logits/chosen": 1739663974.4, + "logits/rejected": 1829369002.6666667, + "logps/chosen": -335.09169921875, + "logps/rejected": -617.3716227213541, + "loss": 0.2644, + "rewards/chosen": 0.44480624198913576, + "rewards/margins": 7.539785846074422, + "rewards/rejected": -7.094979604085286, + "step": 166 + }, + { + "epoch": 0.061649208619814495, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.945110725691618e-06, + "logits/chosen": 1619395764.7058823, + "logits/rejected": 1431729902.9333334, + "logps/chosen": -267.6011316636029, + "logps/rejected": -402.2577799479167, + "loss": 0.1864, + "rewards/chosen": 1.2293080722584444, + "rewards/margins": 7.933381151685528, + "rewards/rejected": -6.704073079427084, + "step": 167 + }, + { + "epoch": 0.0620183655576577, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.944237639478232e-06, + "logits/chosen": 1981292916.3636363, + "logits/rejected": 2284940531.8095236, + "logps/chosen": -351.1681463068182, + "logps/rejected": -438.3822079613095, + "loss": 0.2216, + "rewards/chosen": 0.22398909655484286, + "rewards/margins": 5.096196374851904, + "rewards/rejected": -4.872207278297061, + "step": 168 + }, + { + "epoch": 0.0623875224955009, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.943357703086411e-06, + "logits/chosen": 2168239718.4, + "logits/rejected": 2831547934.117647, + "logps/chosen": -262.52356770833336, + "logps/rejected": -499.00080422794116, + "loss": 0.1938, + "rewards/chosen": 1.2805671691894531, + "rewards/margins": 7.524218166575713, + "rewards/rejected": -6.2436509973862595, + "step": 169 + }, + { + "epoch": 0.0627566794333441, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.942470917735299e-06, + "logits/chosen": 1497043041.5238094, + "logits/rejected": 1222771805.090909, + "logps/chosen": -196.6768043154762, + "logps/rejected": -539.0693359375, + "loss": 0.2716, + "rewards/chosen": 0.9215043385823568, + "rewards/margins": 3.808020909627279, + "rewards/rejected": -2.886516571044922, + "step": 170 + }, + { + "epoch": 0.06312583637118731, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.941577284653523e-06, + "logits/chosen": 2424778496.0, + "logits/rejected": 1769192704.0, + "logps/chosen": -352.7112121582031, + "logps/rejected": -445.326171875, + "loss": 0.2518, + "rewards/chosen": 0.4472127854824066, + "rewards/margins": 6.623797506093979, + "rewards/rejected": -6.176584720611572, + "step": 171 + }, + { + "epoch": 0.0634949933090305, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.940676805079201e-06, + "logits/chosen": 1570746496.0, + "logits/rejected": 1724764288.0, + "logps/chosen": -357.9239501953125, + "logps/rejected": -452.7489013671875, + "loss": 0.2256, + "rewards/chosen": 0.6584669351577759, + "rewards/margins": 5.512856602668762, + "rewards/rejected": -4.854389667510986, + "step": 172 + }, + { + "epoch": 0.0638641502468737, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.939769480259937e-06, + "logits/chosen": 1901196580.5714285, + "logits/rejected": 1888412330.6666667, + "logps/chosen": -228.95063127790178, + "logps/rejected": -352.1045735677083, + "loss": 0.1867, + "rewards/chosen": 1.2099661145891463, + "rewards/margins": 6.505925587245397, + "rewards/rejected": -5.29595947265625, + "step": 173 + }, + { + "epoch": 0.0642333071847169, + "grad_norm": 13.9375, + "kl": 0.20199871063232422, + "learning_rate": 9.938855311452818e-06, + "logits/chosen": 1128744374.857143, + "logits/rejected": 1232993652.3636363, + "logps/chosen": -234.67420014880952, + "logps/rejected": -494.70028409090907, + "loss": 0.2564, + "rewards/chosen": 0.9493690672374907, + "rewards/margins": 6.306448313064906, + "rewards/rejected": -5.357079245827415, + "step": 174 + }, + { + "epoch": 0.0646024641225601, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.93793429992441e-06, + "logits/chosen": 2461948586.6666665, + "logits/rejected": 1785004754.8235295, + "logps/chosen": -267.17294921875, + "logps/rejected": -465.64283662683823, + "loss": 0.185, + "rewards/chosen": 0.9911238352457682, + "rewards/margins": 8.411316329357671, + "rewards/rejected": -7.420192494111903, + "step": 175 + }, + { + "epoch": 0.0649716210604033, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.937006446950768e-06, + "logits/chosen": 1653422694.4, + "logits/rejected": 1690656426.6666667, + "logps/chosen": -266.283935546875, + "logps/rejected": -385.8729654947917, + "loss": 0.2755, + "rewards/chosen": 0.7713281154632569, + "rewards/margins": 6.205270910263062, + "rewards/rejected": -5.433942794799805, + "step": 176 + }, + { + "epoch": 0.0653407779982465, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.936071753817416e-06, + "logits/chosen": 2647660182.5882354, + "logits/rejected": 2037809561.6, + "logps/chosen": -324.83498965992646, + "logps/rejected": -528.6254231770833, + "loss": 0.2235, + "rewards/chosen": 0.6302930046530331, + "rewards/margins": 8.15356621835746, + "rewards/rejected": -7.523273213704427, + "step": 177 + }, + { + "epoch": 0.0657099349360897, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.935130221819361e-06, + "logits/chosen": 1682948253.5384614, + "logits/rejected": 2623847585.6842103, + "logps/chosen": -306.7768366887019, + "logps/rejected": -494.61986019736844, + "loss": 0.1497, + "rewards/chosen": 1.2759845440204327, + "rewards/margins": 7.237956027752957, + "rewards/rejected": -5.961971483732524, + "step": 178 + }, + { + "epoch": 0.0660790918739329, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.934181852261084e-06, + "logits/chosen": 1601886617.6, + "logits/rejected": 1737047319.2727273, + "logps/chosen": -289.715087890625, + "logps/rejected": -537.2959428267045, + "loss": 0.0954, + "rewards/chosen": 1.6865406036376953, + "rewards/margins": 9.190039721402254, + "rewards/rejected": -7.50349911776456, + "step": 179 + }, + { + "epoch": 0.0664482488117761, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.93322664645654e-06, + "logits/chosen": 1798584477.5384614, + "logits/rejected": 1927257249.6842105, + "logps/chosen": -245.40478515625, + "logps/rejected": -522.7278988486842, + "loss": 0.1407, + "rewards/chosen": 1.4086092435396635, + "rewards/margins": 9.413722544063923, + "rewards/rejected": -8.00511330052426, + "step": 180 + }, + { + "epoch": 0.06681740574961931, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.932264605729152e-06, + "logits/chosen": 2033656832.0, + "logits/rejected": 2891849113.6, + "logps/chosen": -300.49049886067706, + "logps/rejected": -457.447119140625, + "loss": 0.1939, + "rewards/chosen": 0.5647591749827067, + "rewards/margins": 6.921987263361613, + "rewards/rejected": -6.357228088378906, + "step": 181 + }, + { + "epoch": 0.06718656268746251, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.931295731411819e-06, + "logits/chosen": 1556286403.764706, + "logits/rejected": 1556958412.8, + "logps/chosen": -237.14914119944854, + "logps/rejected": -374.16106770833335, + "loss": 0.2328, + "rewards/chosen": 0.7590243395637063, + "rewards/margins": 6.520092556523341, + "rewards/rejected": -5.761068216959635, + "step": 182 + }, + { + "epoch": 0.06755571962530571, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.930320024846899e-06, + "logits/chosen": 1982816802.1333334, + "logits/rejected": 1769219734.5882354, + "logps/chosen": -258.62159830729166, + "logps/rejected": -375.94450827205884, + "loss": 0.2451, + "rewards/chosen": 0.42381070454915365, + "rewards/margins": 5.4041959799972235, + "rewards/rejected": -4.98038527544807, + "step": 183 + }, + { + "epoch": 0.06792487656314891, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.929337487386225e-06, + "logits/chosen": 1831180151.4666667, + "logits/rejected": 1761786819.764706, + "logps/chosen": -304.87604166666665, + "logps/rejected": -517.9893152573529, + "loss": 0.2191, + "rewards/chosen": 0.47626304626464844, + "rewards/margins": 6.387571334838867, + "rewards/rejected": -5.911308288574219, + "step": 184 + }, + { + "epoch": 0.06829403350099211, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.928348120391087e-06, + "logits/chosen": 2572690500.266667, + "logits/rejected": 2167752101.647059, + "logps/chosen": -261.04521484375, + "logps/rejected": -637.8296760110294, + "loss": 0.1565, + "rewards/chosen": 1.2647260030110676, + "rewards/margins": 9.390314812753715, + "rewards/rejected": -8.125588809742647, + "step": 185 + }, + { + "epoch": 0.06866319043883531, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.927351925232245e-06, + "logits/chosen": 1925735082.6666667, + "logits/rejected": 1461771926.5882354, + "logps/chosen": -311.31324869791666, + "logps/rejected": -433.6963752297794, + "loss": 0.2135, + "rewards/chosen": 0.5511041641235351, + "rewards/margins": 6.438059874141917, + "rewards/rejected": -5.886955710018382, + "step": 186 + }, + { + "epoch": 0.06903234737667852, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.92634890328991e-06, + "logits/chosen": 2131489792.0, + "logits/rejected": 2103618432.0, + "logps/chosen": -301.91326904296875, + "logps/rejected": -339.43524169921875, + "loss": 0.2214, + "rewards/chosen": 0.452073335647583, + "rewards/margins": 5.639359712600708, + "rewards/rejected": -5.187286376953125, + "step": 187 + }, + { + "epoch": 0.06940150431452172, + "grad_norm": 13.1875, + "kl": 0.12229537963867188, + "learning_rate": 9.92533905595376e-06, + "logits/chosen": 1855865675.2941177, + "logits/rejected": 1644342476.8, + "logps/chosen": -312.7223115808824, + "logps/rejected": -438.82958984375, + "loss": 0.2195, + "rewards/chosen": 1.2117050395292395, + "rewards/margins": 6.721602907367782, + "rewards/rejected": -5.509897867838542, + "step": 188 + }, + { + "epoch": 0.06977066125236492, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.924322384622922e-06, + "logits/chosen": 1751344274.2857144, + "logits/rejected": 1878398293.3333333, + "logps/chosen": -321.732666015625, + "logps/rejected": -450.19715711805554, + "loss": 0.1964, + "rewards/chosen": 0.5876798629760742, + "rewards/margins": 5.983964602152507, + "rewards/rejected": -5.396284739176433, + "step": 189 + }, + { + "epoch": 0.0701398181902081, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.923298890705983e-06, + "logits/chosen": 1472814226.2857144, + "logits/rejected": 1493376227.5555556, + "logps/chosen": -320.85654994419644, + "logps/rejected": -483.9733072916667, + "loss": 0.1421, + "rewards/chosen": 1.493065425327846, + "rewards/margins": 8.704915061829581, + "rewards/rejected": -7.211849636501736, + "step": 190 + }, + { + "epoch": 0.07050897512805131, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.922268575620981e-06, + "logits/chosen": 1411139824.9411764, + "logits/rejected": 2175930094.9333334, + "logps/chosen": -259.9036075367647, + "logps/rejected": -476.1249674479167, + "loss": 0.2209, + "rewards/chosen": 0.5080053666058708, + "rewards/margins": 6.565353490792068, + "rewards/rejected": -6.0573481241861975, + "step": 191 + }, + { + "epoch": 0.07087813206589451, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.921231440795404e-06, + "logits/chosen": 1411518621.5384614, + "logits/rejected": 1857303605.8947368, + "logps/chosen": -264.09021935096155, + "logps/rejected": -491.94443873355266, + "loss": 0.1887, + "rewards/chosen": 1.1723593931931715, + "rewards/margins": 8.337286022510606, + "rewards/rejected": -7.164926629317434, + "step": 192 + }, + { + "epoch": 0.07124728900373771, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.92018748766619e-06, + "logits/chosen": 1469235200.0, + "logits/rejected": 1730677418.6666667, + "logps/chosen": -292.8130615234375, + "logps/rejected": -502.1977945963542, + "loss": 0.1947, + "rewards/chosen": 1.605323600769043, + "rewards/margins": 7.637721188863118, + "rewards/rejected": -6.032397588094075, + "step": 193 + }, + { + "epoch": 0.07161644594158091, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.919136717679723e-06, + "logits/chosen": 1489188096.0, + "logits/rejected": 1393334528.0, + "logps/chosen": -254.79714965820312, + "logps/rejected": -382.0877990722656, + "loss": 0.1984, + "rewards/chosen": 0.8846181035041809, + "rewards/margins": 6.8402188420295715, + "rewards/rejected": -5.955600738525391, + "step": 194 + }, + { + "epoch": 0.07198560287942411, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.918079132291828e-06, + "logits/chosen": 1967694787.764706, + "logits/rejected": 1463822199.4666667, + "logps/chosen": -250.33412798713235, + "logps/rejected": -340.72864583333336, + "loss": 0.2111, + "rewards/chosen": 0.7158857233384076, + "rewards/margins": 6.207194317088408, + "rewards/rejected": -5.49130859375, + "step": 195 + }, + { + "epoch": 0.07235475981726731, + "grad_norm": 13.0625, + "kl": 0.6617727279663086, + "learning_rate": 9.917014732967782e-06, + "logits/chosen": 1367304192.0, + "logits/rejected": 1434894677.3333333, + "logps/chosen": -328.46407645089283, + "logps/rejected": -423.6180013020833, + "loss": 0.1493, + "rewards/chosen": 1.8524954659598214, + "rewards/margins": 8.105687519860647, + "rewards/rejected": -6.2531920539008246, + "step": 196 + }, + { + "epoch": 0.07272391675511052, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.915943521182292e-06, + "logits/chosen": 2595679280.7619047, + "logits/rejected": 2466666309.818182, + "logps/chosen": -294.67013113839283, + "logps/rejected": -320.38427734375, + "loss": 0.2736, + "rewards/chosen": 1.0027737390427363, + "rewards/margins": 5.51562059906138, + "rewards/rejected": -4.5128468600186435, + "step": 197 + }, + { + "epoch": 0.07309307369295372, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.91486549841951e-06, + "logits/chosen": 1599045847.5789473, + "logits/rejected": 1839370870.1538463, + "logps/chosen": -195.8356291118421, + "logps/rejected": -439.2619816706731, + "loss": 0.1594, + "rewards/chosen": 1.6159053601716693, + "rewards/margins": 8.587747025586333, + "rewards/rejected": -6.971841665414663, + "step": 198 + }, + { + "epoch": 0.07346223063079692, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.913780666173022e-06, + "logits/chosen": 1744811008.0, + "logits/rejected": 1738365696.0, + "logps/chosen": -280.6553039550781, + "logps/rejected": -496.47833251953125, + "loss": 0.2129, + "rewards/chosen": 0.6269842386245728, + "rewards/margins": 6.745821595191956, + "rewards/rejected": -6.118837356567383, + "step": 199 + }, + { + "epoch": 0.07383138756864012, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.912689025945851e-06, + "logits/chosen": 1731220772.5714285, + "logits/rejected": 1485328270.2222223, + "logps/chosen": -295.63528878348217, + "logps/rejected": -421.65370008680554, + "loss": 0.2287, + "rewards/chosen": 0.35897983823503765, + "rewards/margins": 5.639596598488944, + "rewards/rejected": -5.280616760253906, + "step": 200 + }, + { + "epoch": 0.07420054450648332, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.91159057925045e-06, + "logits/chosen": 1223482481.7777777, + "logits/rejected": 1530489709.7142856, + "logps/chosen": -262.52924262152777, + "logps/rejected": -473.86781529017856, + "loss": 0.2419, + "rewards/chosen": 0.4861944516499837, + "rewards/margins": 8.065860589345297, + "rewards/rejected": -7.5796661376953125, + "step": 201 + }, + { + "epoch": 0.07456970144432652, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.910485327608702e-06, + "logits/chosen": 2591656251.076923, + "logits/rejected": 2065203846.7368422, + "logps/chosen": -229.14823091947116, + "logps/rejected": -489.9150390625, + "loss": 0.157, + "rewards/chosen": 0.9946117401123047, + "rewards/margins": 6.7041474392539575, + "rewards/rejected": -5.709535699141653, + "step": 202 + }, + { + "epoch": 0.07493885838216972, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 9.909373272551919e-06, + "logits/chosen": 1617803904.0, + "logits/rejected": 2005531008.0, + "logps/chosen": -297.0911865234375, + "logps/rejected": -520.767822265625, + "loss": 0.2081, + "rewards/chosen": 0.7357272505760193, + "rewards/margins": 6.25931590795517, + "rewards/rejected": -5.52358865737915, + "step": 203 + }, + { + "epoch": 0.07530801532001292, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.90825441562084e-06, + "logits/chosen": 1821062197.8947368, + "logits/rejected": 1995083460.9230769, + "logps/chosen": -340.32421875, + "logps/rejected": -447.14400540865387, + "loss": 0.2485, + "rewards/chosen": 0.5422391891479492, + "rewards/margins": 6.957692586458647, + "rewards/rejected": -6.4154533973106975, + "step": 204 + }, + { + "epoch": 0.07567717225785613, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.907128758365627e-06, + "logits/chosen": 1669836492.8, + "logits/rejected": 1947845973.3333333, + "logps/chosen": -264.425830078125, + "logps/rejected": -527.2880452473959, + "loss": 0.2248, + "rewards/chosen": 1.1608144760131835, + "rewards/margins": 6.972369702657064, + "rewards/rejected": -5.81155522664388, + "step": 205 + }, + { + "epoch": 0.07604632919569933, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.905996302345863e-06, + "logits/chosen": 1922174020.2666667, + "logits/rejected": 2222210831.0588236, + "logps/chosen": -267.587939453125, + "logps/rejected": -508.81858915441177, + "loss": 0.155, + "rewards/chosen": 1.1608455657958985, + "rewards/margins": 9.207422570621267, + "rewards/rejected": -8.046577004825368, + "step": 206 + }, + { + "epoch": 0.07641548613354253, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.904857049130553e-06, + "logits/chosen": 1536168263.68, + "logits/rejected": 1545219218.2857144, + "logps/chosen": -278.0503515625, + "logps/rejected": -670.1974051339286, + "loss": 0.2941, + "rewards/chosen": 0.8895104217529297, + "rewards/margins": 11.655464575631278, + "rewards/rejected": -10.765954153878349, + "step": 207 + }, + { + "epoch": 0.07678464307138572, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.903711000298118e-06, + "logits/chosen": 1557965952.0, + "logits/rejected": 1031605952.0, + "logps/chosen": -342.5038146972656, + "logps/rejected": -415.4825439453125, + "loss": 0.2119, + "rewards/chosen": 0.7806885838508606, + "rewards/margins": 7.948217689990997, + "rewards/rejected": -7.167529106140137, + "step": 208 + }, + { + "epoch": 0.07715380000922892, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.902558157436392e-06, + "logits/chosen": 2105853952.0, + "logits/rejected": 1438852778.6666667, + "logps/chosen": -318.1811767578125, + "logps/rejected": -373.058349609375, + "loss": 0.2836, + "rewards/chosen": 0.5601299285888672, + "rewards/margins": 5.551716613769531, + "rewards/rejected": -4.991586685180664, + "step": 209 + }, + { + "epoch": 0.07752295694707212, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 9.901398522142624e-06, + "logits/chosen": 1771273420.8, + "logits/rejected": 2357349677.1764708, + "logps/chosen": -367.66998697916665, + "logps/rejected": -511.3906824448529, + "loss": 0.2003, + "rewards/chosen": 0.5893281936645508, + "rewards/margins": 7.0993271883796245, + "rewards/rejected": -6.509998994715073, + "step": 210 + }, + { + "epoch": 0.07789211388491532, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.900232096023478e-06, + "logits/chosen": 1714014754.1333334, + "logits/rejected": 1482416007.5294118, + "logps/chosen": -197.12638346354166, + "logps/rejected": -477.98928653492646, + "loss": 0.1687, + "rewards/chosen": 1.0620689392089844, + "rewards/margins": 8.536500594195196, + "rewards/rejected": -7.474431654986213, + "step": 211 + }, + { + "epoch": 0.07826127082275852, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.899058880695019e-06, + "logits/chosen": 1908046506.6666667, + "logits/rejected": 3238210379.2941175, + "logps/chosen": -291.6744140625, + "logps/rejected": -380.1985294117647, + "loss": 0.1727, + "rewards/chosen": 1.3485973358154297, + "rewards/margins": 6.66027784908519, + "rewards/rejected": -5.311680513269761, + "step": 212 + }, + { + "epoch": 0.07863042776060172, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.89787887778272e-06, + "logits/chosen": 1433902421.3333333, + "logits/rejected": 1510644073.4117646, + "logps/chosen": -282.03053385416666, + "logps/rejected": -396.88220932904414, + "loss": 0.1759, + "rewards/chosen": 1.2443155924479166, + "rewards/margins": 6.28171937231924, + "rewards/rejected": -5.037403779871323, + "step": 213 + }, + { + "epoch": 0.07899958469844492, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.896692088921466e-06, + "logits/chosen": 2571602147.5555553, + "logits/rejected": 2169690843.428571, + "logps/chosen": -283.45494249131946, + "logps/rejected": -565.0706263950893, + "loss": 0.187, + "rewards/chosen": 1.0868806838989258, + "rewards/margins": 8.722324235098704, + "rewards/rejected": -7.635443551199777, + "step": 214 + }, + { + "epoch": 0.07936874163628813, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.895498515755532e-06, + "logits/chosen": 2180546087.3846154, + "logits/rejected": 1674937290.1052632, + "logps/chosen": -240.181640625, + "logps/rejected": -494.97291324013156, + "loss": 0.1915, + "rewards/chosen": 0.6344446769127479, + "rewards/margins": 7.282570576378209, + "rewards/rejected": -6.648125899465461, + "step": 215 + }, + { + "epoch": 0.07973789857413133, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.894298159938605e-06, + "logits/chosen": 2025556992.0, + "logits/rejected": 1648880469.3333333, + "logps/chosen": -306.396533203125, + "logps/rejected": -387.446044921875, + "loss": 0.2603, + "rewards/chosen": 0.7764100551605224, + "rewards/margins": 6.467584880193074, + "rewards/rejected": -5.691174825032552, + "step": 216 + }, + { + "epoch": 0.08010705551197453, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.893091023133756e-06, + "logits/chosen": 2599144886.857143, + "logits/rejected": 1580353536.0, + "logps/chosen": -284.59225027901783, + "logps/rejected": -435.24305555555554, + "loss": 0.2221, + "rewards/chosen": 0.38711816923958914, + "rewards/margins": 6.742737387853956, + "rewards/rejected": -6.3556192186143665, + "step": 217 + }, + { + "epoch": 0.08047621244981773, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.891877107013461e-06, + "logits/chosen": 1952478412.8, + "logits/rejected": 2008730925.1764705, + "logps/chosen": -313.202734375, + "logps/rejected": -541.3700597426471, + "loss": 0.1945, + "rewards/chosen": 0.6368071873982747, + "rewards/margins": 6.487811395233753, + "rewards/rejected": -5.851004207835478, + "step": 218 + }, + { + "epoch": 0.08084536938766093, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.890656413259585e-06, + "logits/chosen": 1868918647.4666667, + "logits/rejected": 2070145385.4117646, + "logps/chosen": -339.6949869791667, + "logps/rejected": -409.09719669117646, + "loss": 0.219, + "rewards/chosen": 0.47541306813557943, + "rewards/margins": 6.362307294209798, + "rewards/rejected": -5.886894226074219, + "step": 219 + }, + { + "epoch": 0.08121452632550413, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.889428943563382e-06, + "logits/chosen": 2613116050.285714, + "logits/rejected": 1694930375.1111112, + "logps/chosen": -291.2520228794643, + "logps/rejected": -371.1877170138889, + "loss": 0.1896, + "rewards/chosen": 1.125910827091762, + "rewards/margins": 6.569520276690286, + "rewards/rejected": -5.443609449598524, + "step": 220 + }, + { + "epoch": 0.08158368326334733, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.888194699625499e-06, + "logits/chosen": 2747854392.888889, + "logits/rejected": 1755361426.2857144, + "logps/chosen": -320.61073133680554, + "logps/rejected": -453.8185337611607, + "loss": 0.2227, + "rewards/chosen": 0.7362472746107314, + "rewards/margins": 7.750842851305765, + "rewards/rejected": -7.014595576695034, + "step": 221 + }, + { + "epoch": 0.08195284020119054, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.886953683155964e-06, + "logits/chosen": 1992880128.0, + "logits/rejected": 1669637120.0, + "logps/chosen": -332.87245008680554, + "logps/rejected": -551.91552734375, + "loss": 0.2056, + "rewards/chosen": 1.164188067118327, + "rewards/margins": 7.371108690897624, + "rewards/rejected": -6.206920623779297, + "step": 222 + }, + { + "epoch": 0.08232199713903374, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.885705895874188e-06, + "logits/chosen": 2185397816.888889, + "logits/rejected": 2089003008.0, + "logps/chosen": -322.3407931857639, + "logps/rejected": -458.33119419642856, + "loss": 0.2248, + "rewards/chosen": 0.7197759946187338, + "rewards/margins": 7.195091633569627, + "rewards/rejected": -6.475315638950893, + "step": 223 + }, + { + "epoch": 0.08269115407687694, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.884451339508967e-06, + "logits/chosen": 2107596526.9333334, + "logits/rejected": 2767462159.0588236, + "logps/chosen": -279.2965494791667, + "logps/rejected": -449.5297277113971, + "loss": 0.1588, + "rewards/chosen": 1.315366236368815, + "rewards/margins": 7.897823834886738, + "rewards/rejected": -6.582457598517923, + "step": 224 + }, + { + "epoch": 0.08306031101472014, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.883190015798469e-06, + "logits/chosen": 1882893390.7692308, + "logits/rejected": 2124771112.4210527, + "logps/chosen": -216.56268780048077, + "logps/rejected": -520.6907894736842, + "loss": 0.154, + "rewards/chosen": 1.7223923022930439, + "rewards/margins": 7.329244783532764, + "rewards/rejected": -5.60685248123972, + "step": 225 + }, + { + "epoch": 0.08342946795256333, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 9.881921926490245e-06, + "logits/chosen": 1618491392.0, + "logits/rejected": 1681094016.0, + "logps/chosen": -321.0919189453125, + "logps/rejected": -514.27099609375, + "loss": 0.2249, + "rewards/chosen": 0.5107834935188293, + "rewards/margins": 6.200155079364777, + "rewards/rejected": -5.689371585845947, + "step": 226 + }, + { + "epoch": 0.08379862489040653, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.880647073341219e-06, + "logits/chosen": 1637272576.0, + "logits/rejected": 1694106331.4285715, + "logps/chosen": -310.8017306857639, + "logps/rejected": -432.19932338169644, + "loss": 0.2295, + "rewards/chosen": 1.2232177522447374, + "rewards/margins": 7.278936310419961, + "rewards/rejected": -6.055718558175223, + "step": 227 + }, + { + "epoch": 0.08416778182824973, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.879365458117678e-06, + "logits/chosen": 2029691611.4285715, + "logits/rejected": 1335902776.8888888, + "logps/chosen": -322.3362513950893, + "logps/rejected": -414.431884765625, + "loss": 0.216, + "rewards/chosen": 0.31933518818446566, + "rewards/margins": 4.937679813021705, + "rewards/rejected": -4.618344624837239, + "step": 228 + }, + { + "epoch": 0.08453693876609293, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 9.878077082595287e-06, + "logits/chosen": 1970508706.909091, + "logits/rejected": 2426740736.0, + "logps/chosen": -280.591796875, + "logps/rejected": -514.5986328125, + "loss": 0.123, + "rewards/chosen": 1.1186904907226562, + "rewards/margins": 8.958988371349516, + "rewards/rejected": -7.84029788062686, + "step": 229 + }, + { + "epoch": 0.08490609570393613, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.876781948559073e-06, + "logits/chosen": 2207755806.117647, + "logits/rejected": 1596277555.2, + "logps/chosen": -279.7059972426471, + "logps/rejected": -469.0556640625, + "loss": 0.1721, + "rewards/chosen": 1.3132118898279526, + "rewards/margins": 7.817259537939932, + "rewards/rejected": -6.504047648111979, + "step": 230 + }, + { + "epoch": 0.08527525264177933, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.87548005780343e-06, + "logits/chosen": 1703355245.7142856, + "logits/rejected": 2056368583.1111112, + "logps/chosen": -299.69859095982144, + "logps/rejected": -474.4347330729167, + "loss": 0.163, + "rewards/chosen": 1.37884521484375, + "rewards/margins": 8.054289923773872, + "rewards/rejected": -6.675444708930121, + "step": 231 + }, + { + "epoch": 0.08564440957962254, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.874171412132107e-06, + "logits/chosen": 1956650097.7777777, + "logits/rejected": 1761728219.4285715, + "logps/chosen": -324.7473415798611, + "logps/rejected": -478.4915248325893, + "loss": 0.2467, + "rewards/chosen": 0.3728671073913574, + "rewards/margins": 7.447789941515241, + "rewards/rejected": -7.074922834123884, + "step": 232 + }, + { + "epoch": 0.08601356651746574, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.872856013358219e-06, + "logits/chosen": 2656566923.6363635, + "logits/rejected": 1751575795.8095238, + "logps/chosen": -257.83780184659093, + "logps/rejected": -421.71405319940476, + "loss": 0.1178, + "rewards/chosen": 1.272094813260165, + "rewards/margins": 7.441989077118052, + "rewards/rejected": -6.169894263857887, + "step": 233 + }, + { + "epoch": 0.08638272345530894, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.871533863304234e-06, + "logits/chosen": 1777459768.8888888, + "logits/rejected": 1737914806.857143, + "logps/chosen": -281.3489583333333, + "logps/rejected": -475.93739536830356, + "loss": 0.1926, + "rewards/chosen": 1.9016390906439886, + "rewards/margins": 9.053498919048007, + "rewards/rejected": -7.151859828404018, + "step": 234 + }, + { + "epoch": 0.08675188039315214, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.870204963801974e-06, + "logits/chosen": 1908824502.857143, + "logits/rejected": 2459854848.0, + "logps/chosen": -274.90096609933033, + "logps/rejected": -539.6013454861111, + "loss": 0.2095, + "rewards/chosen": 0.3777961390359061, + "rewards/margins": 6.885999433578007, + "rewards/rejected": -6.508203294542101, + "step": 235 + }, + { + "epoch": 0.08712103733099534, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.86886931669261e-06, + "logits/chosen": 1987539968.0, + "logits/rejected": 1767886592.0, + "logps/chosen": -285.40362548828125, + "logps/rejected": -402.8817138671875, + "loss": 0.1882, + "rewards/chosen": 0.8615530729293823, + "rewards/margins": 7.9712032079696655, + "rewards/rejected": -7.109650135040283, + "step": 236 + }, + { + "epoch": 0.08749019426883854, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.867526923826668e-06, + "logits/chosen": 1903570397.8666666, + "logits/rejected": 1530725556.7058823, + "logps/chosen": -330.64361979166665, + "logps/rejected": -511.91796875, + "loss": 0.165, + "rewards/chosen": 1.117876942952474, + "rewards/margins": 8.44961378808115, + "rewards/rejected": -7.331736845128677, + "step": 237 + }, + { + "epoch": 0.08785935120668174, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.866177787064013e-06, + "logits/chosen": 2198547275.2941175, + "logits/rejected": 1873264093.8666666, + "logps/chosen": -288.7004825367647, + "logps/rejected": -511.0263671875, + "loss": 0.1863, + "rewards/chosen": 1.0603486229391659, + "rewards/margins": 7.821169138889687, + "rewards/rejected": -6.760820515950521, + "step": 238 + }, + { + "epoch": 0.08822850814452494, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.864821908273861e-06, + "logits/chosen": 1772254939.4285715, + "logits/rejected": 1570908615.1111112, + "logps/chosen": -258.10878208705356, + "logps/rejected": -511.54291449652777, + "loss": 0.186, + "rewards/chosen": 0.627357006072998, + "rewards/margins": 9.116165532006157, + "rewards/rejected": -8.488808525933159, + "step": 239 + }, + { + "epoch": 0.08859766508236815, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.863459289334758e-06, + "logits/chosen": 2675938099.2, + "logits/rejected": 1956669560.4705882, + "logps/chosen": -298.59560546875, + "logps/rejected": -542.7853285845588, + "loss": 0.1963, + "rewards/chosen": 0.7494485219319661, + "rewards/margins": 7.094848715090285, + "rewards/rejected": -6.345400193158318, + "step": 240 + }, + { + "epoch": 0.08896682202021135, + "grad_norm": 11.8125, + "kl": 0.9455204010009766, + "learning_rate": 9.862089932134601e-06, + "logits/chosen": 1215964774.4, + "logits/rejected": 1623221187.764706, + "logps/chosen": -215.22179361979167, + "logps/rejected": -376.16417738970586, + "loss": 0.1947, + "rewards/chosen": 1.438476816813151, + "rewards/margins": 6.256367657231349, + "rewards/rejected": -4.817890840418198, + "step": 241 + }, + { + "epoch": 0.08933597895805455, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.860713838570616e-06, + "logits/chosen": 2768228592.9411764, + "logits/rejected": 2219111901.866667, + "logps/chosen": -219.66696346507354, + "logps/rejected": -476.97369791666665, + "loss": 0.1999, + "rewards/chosen": 0.9496830210966223, + "rewards/margins": 7.9543725556018305, + "rewards/rejected": -7.004689534505208, + "step": 242 + }, + { + "epoch": 0.08970513589589775, + "grad_norm": 12.5625, + "kl": 0.3970003128051758, + "learning_rate": 9.859331010549362e-06, + "logits/chosen": 2242105051.428571, + "logits/rejected": 1974337536.0, + "logps/chosen": -245.67314801897322, + "logps/rejected": -436.80360243055554, + "loss": 0.1929, + "rewards/chosen": 0.5280186789376395, + "rewards/margins": 6.862357820783343, + "rewards/rejected": -6.334339141845703, + "step": 243 + }, + { + "epoch": 0.09007429283374094, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.85794144998673e-06, + "logits/chosen": 1540820081.7777777, + "logits/rejected": 1823759506.2857144, + "logps/chosen": -221.62223307291666, + "logps/rejected": -467.1923130580357, + "loss": 0.192, + "rewards/chosen": 1.3148587544759114, + "rewards/margins": 7.734915778750465, + "rewards/rejected": -6.420057024274554, + "step": 244 + }, + { + "epoch": 0.09044344977158414, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.856545158807938e-06, + "logits/chosen": 1524417194.6666667, + "logits/rejected": 1303430445.1764705, + "logps/chosen": -246.04095052083332, + "logps/rejected": -368.9867302389706, + "loss": 0.2207, + "rewards/chosen": 0.3987483342488607, + "rewards/margins": 5.639583636265176, + "rewards/rejected": -5.240835302016315, + "step": 245 + }, + { + "epoch": 0.09081260670942734, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.855142138947532e-06, + "logits/chosen": 1980737945.6, + "logits/rejected": 1801239552.0, + "logps/chosen": -243.0710205078125, + "logps/rejected": -360.29296875, + "loss": 0.1918, + "rewards/chosen": 1.7831703186035157, + "rewards/margins": 7.3502250671386715, + "rewards/rejected": -5.567054748535156, + "step": 246 + }, + { + "epoch": 0.09118176364727054, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.853732392349376e-06, + "logits/chosen": 2048307200.0, + "logits/rejected": 1606512399.0588236, + "logps/chosen": -254.74156901041667, + "logps/rejected": -449.9626034007353, + "loss": 0.1953, + "rewards/chosen": 1.150223159790039, + "rewards/margins": 7.08902733746697, + "rewards/rejected": -5.93880417767693, + "step": 247 + }, + { + "epoch": 0.09155092058511374, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.852315920966653e-06, + "logits/chosen": 2144834323.6923077, + "logits/rejected": 1339052786.5263157, + "logps/chosen": -312.3495342548077, + "logps/rejected": -418.9804944490132, + "loss": 0.2206, + "rewards/chosen": 0.19657709048344538, + "rewards/margins": 5.523775319821438, + "rewards/rejected": -5.327198229337993, + "step": 248 + }, + { + "epoch": 0.09192007752295694, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.850892726761874e-06, + "logits/chosen": 1366033347.764706, + "logits/rejected": 1395864507.7333333, + "logps/chosen": -241.67718864889707, + "logps/rejected": -455.53831380208334, + "loss": 0.1852, + "rewards/chosen": 1.348412457634421, + "rewards/margins": 7.927757652133119, + "rewards/rejected": -6.579345194498698, + "step": 249 + }, + { + "epoch": 0.09228923446080015, + "grad_norm": 12.125, + "kl": 0.993544340133667, + "learning_rate": 9.84946281170685e-06, + "logits/chosen": 2226978084.571429, + "logits/rejected": 2548566698.6666665, + "logps/chosen": -301.3502720424107, + "logps/rejected": -424.406982421875, + "loss": 0.1512, + "rewards/chosen": 1.3940033231462752, + "rewards/margins": 6.916240041218106, + "rewards/rejected": -5.522236718071832, + "step": 250 + }, + { + "epoch": 0.09265839139864335, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.848026177782713e-06, + "logits/chosen": 2367221113.263158, + "logits/rejected": 2140729974.1538463, + "logps/chosen": -260.51870888157896, + "logps/rejected": -555.0961162860577, + "loss": 0.1969, + "rewards/chosen": 1.2923580972771895, + "rewards/margins": 9.751017342694857, + "rewards/rejected": -8.458659245417667, + "step": 251 + }, + { + "epoch": 0.09302754833648655, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 9.846582826979899e-06, + "logits/chosen": 1431866368.0, + "logits/rejected": 1509573973.3333333, + "logps/chosen": -305.49595424107144, + "logps/rejected": -499.3811848958333, + "loss": 0.1722, + "rewards/chosen": 0.7817914145333427, + "rewards/margins": 7.3967040833972755, + "rewards/rejected": -6.614912668863933, + "step": 252 + }, + { + "epoch": 0.09339670527432975, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.845132761298154e-06, + "logits/chosen": 1868903693.4736843, + "logits/rejected": 1952465053.5384614, + "logps/chosen": -303.1103515625, + "logps/rejected": -490.50060096153845, + "loss": 0.1811, + "rewards/chosen": 1.359496467991879, + "rewards/margins": 8.57860798391736, + "rewards/rejected": -7.219111515925481, + "step": 253 + }, + { + "epoch": 0.09376586221217295, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.843675982746526e-06, + "logits/chosen": 2268049723.076923, + "logits/rejected": 2117635449.2631578, + "logps/chosen": -303.2747145432692, + "logps/rejected": -470.2289782072368, + "loss": 0.1674, + "rewards/chosen": 1.4120873671311598, + "rewards/margins": 8.027750594413233, + "rewards/rejected": -6.615663227282073, + "step": 254 + }, + { + "epoch": 0.09413501915001615, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.84221249334336e-06, + "logits/chosen": 1693802496.0, + "logits/rejected": 2199852732.631579, + "logps/chosen": -232.17328350360577, + "logps/rejected": -396.2252775493421, + "loss": 0.1011, + "rewards/chosen": 2.0058023012601414, + "rewards/margins": 8.063178158964705, + "rewards/rejected": -6.057375857704564, + "step": 255 + }, + { + "epoch": 0.09450417608785935, + "grad_norm": 14.1875, + "kl": 0.4275493621826172, + "learning_rate": 9.840742295116306e-06, + "logits/chosen": 1124594748.235294, + "logits/rejected": 1288799300.2666667, + "logps/chosen": -273.73161764705884, + "logps/rejected": -364.12021484375, + "loss": 0.2001, + "rewards/chosen": 1.1605049582088696, + "rewards/margins": 5.961187026079964, + "rewards/rejected": -4.800682067871094, + "step": 256 + }, + { + "epoch": 0.09487333302570256, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.839265390102301e-06, + "logits/chosen": 1764619264.0, + "logits/rejected": 1703910297.6, + "logps/chosen": -251.419921875, + "logps/rejected": -416.073974609375, + "loss": 0.1462, + "rewards/chosen": 1.4828344980875652, + "rewards/margins": 8.041036097208659, + "rewards/rejected": -6.558201599121094, + "step": 257 + }, + { + "epoch": 0.09524248996354576, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.837781780347584e-06, + "logits/chosen": 1664917661.5384614, + "logits/rejected": 1781539570.5263157, + "logps/chosen": -295.6209247295673, + "logps/rejected": -516.7000925164474, + "loss": 0.173, + "rewards/chosen": 0.6074054791377141, + "rewards/margins": 8.232161338512714, + "rewards/rejected": -7.624755859375, + "step": 258 + }, + { + "epoch": 0.09561164690138896, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.836291467907671e-06, + "logits/chosen": 1723986688.0, + "logits/rejected": 2695691264.0, + "logps/chosen": -255.94284057617188, + "logps/rejected": -430.0029602050781, + "loss": 0.2381, + "rewards/chosen": 0.3577594459056854, + "rewards/margins": 5.885409325361252, + "rewards/rejected": -5.527649879455566, + "step": 259 + }, + { + "epoch": 0.09598080383923216, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.834794454847373e-06, + "logits/chosen": 2283068484.266667, + "logits/rejected": 2248847841.882353, + "logps/chosen": -241.690234375, + "logps/rejected": -524.4182559742648, + "loss": 0.1234, + "rewards/chosen": 1.905566151936849, + "rewards/margins": 10.329769463632621, + "rewards/rejected": -8.424203311695772, + "step": 260 + }, + { + "epoch": 0.09634996077707536, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.833290743240785e-06, + "logits/chosen": 1639894667.6363637, + "logits/rejected": 1420454960.7619047, + "logps/chosen": -329.5035511363636, + "logps/rejected": -456.81715029761904, + "loss": 0.1125, + "rewards/chosen": 1.0936532454057173, + "rewards/margins": 8.699429351013976, + "rewards/rejected": -7.605776105608259, + "step": 261 + }, + { + "epoch": 0.09671911771491855, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.83178033517128e-06, + "logits/chosen": 1432696978.2857144, + "logits/rejected": 1841446684.4444444, + "logps/chosen": -273.81663295200894, + "logps/rejected": -421.1220974392361, + "loss": 0.1934, + "rewards/chosen": 0.8194854600088937, + "rewards/margins": 7.565872517843095, + "rewards/rejected": -6.746387057834202, + "step": 262 + }, + { + "epoch": 0.09708827465276175, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.830263232731505e-06, + "logits/chosen": 2048985673.142857, + "logits/rejected": 1535940266.6666667, + "logps/chosen": -267.709228515625, + "logps/rejected": -486.6457248263889, + "loss": 0.145, + "rewards/chosen": 1.7929661614554269, + "rewards/margins": 9.245319744897268, + "rewards/rejected": -7.45235358344184, + "step": 263 + }, + { + "epoch": 0.09745743159060495, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.82873943802339e-06, + "logits/chosen": 1753461940.7058823, + "logits/rejected": 1516693367.4666667, + "logps/chosen": -344.5571863511029, + "logps/rejected": -535.6671875, + "loss": 0.2285, + "rewards/chosen": 0.352659590104047, + "rewards/margins": 8.420944705663942, + "rewards/rejected": -8.068285115559895, + "step": 264 + }, + { + "epoch": 0.09782658852844815, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 9.827208953158132e-06, + "logits/chosen": 1843572495.0588236, + "logits/rejected": 2058218700.8, + "logps/chosen": -294.9776826746324, + "logps/rejected": -391.48251953125, + "loss": 0.2896, + "rewards/chosen": 0.24041147793040557, + "rewards/margins": 6.456691079981186, + "rewards/rejected": -6.216279602050781, + "step": 265 + }, + { + "epoch": 0.09819574546629135, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.825671780256197e-06, + "logits/chosen": 1811588189.090909, + "logits/rejected": 1754528621.7142856, + "logps/chosen": -280.61849698153407, + "logps/rejected": -429.42987351190476, + "loss": 0.1567, + "rewards/chosen": 0.6178337443958629, + "rewards/margins": 7.383796617582247, + "rewards/rejected": -6.765962873186384, + "step": 266 + }, + { + "epoch": 0.09856490240413456, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.824127921447321e-06, + "logits/chosen": 2296939776.0, + "logits/rejected": 2201672448.0, + "logps/chosen": -262.82366943359375, + "logps/rejected": -359.08526611328125, + "loss": 0.2166, + "rewards/chosen": 0.47066012024879456, + "rewards/margins": 6.092517286539078, + "rewards/rejected": -5.621857166290283, + "step": 267 + }, + { + "epoch": 0.09893405934197776, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.822577378870502e-06, + "logits/chosen": 2328476262.4, + "logits/rejected": 1703655765.3333333, + "logps/chosen": -218.2671630859375, + "logps/rejected": -515.38330078125, + "loss": 0.2266, + "rewards/chosen": 0.9317961692810058, + "rewards/margins": 8.50515661239624, + "rewards/rejected": -7.573360443115234, + "step": 268 + }, + { + "epoch": 0.09930321627982096, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.82102015467399e-06, + "logits/chosen": 1978378624.0, + "logits/rejected": 1710202880.0, + "logps/chosen": -322.7784118652344, + "logps/rejected": -468.9908752441406, + "loss": 0.2167, + "rewards/chosen": 0.6909960508346558, + "rewards/margins": 6.601669192314148, + "rewards/rejected": -5.910673141479492, + "step": 269 + }, + { + "epoch": 0.09967237321766416, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.81945625101531e-06, + "logits/chosen": 3050243998.4761906, + "logits/rejected": 2361141992.7272725, + "logps/chosen": -345.8775111607143, + "logps/rejected": -397.1536310369318, + "loss": 0.2298, + "rewards/chosen": 0.94533447992234, + "rewards/margins": 6.897675733029584, + "rewards/rejected": -5.952341253107244, + "step": 270 + }, + { + "epoch": 0.10004153015550736, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.81788567006122e-06, + "logits/chosen": 1528746120.5333333, + "logits/rejected": 1531104677.6470587, + "logps/chosen": -241.090234375, + "logps/rejected": -482.3291590073529, + "loss": 0.2172, + "rewards/chosen": 0.39494520823160806, + "rewards/margins": 6.7609236735923615, + "rewards/rejected": -6.365978465360754, + "step": 271 + }, + { + "epoch": 0.10041068709335056, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.816308413987747e-06, + "logits/chosen": 2232662339.368421, + "logits/rejected": 2012502173.5384614, + "logps/chosen": -285.95972964638156, + "logps/rejected": -404.50672325721155, + "loss": 0.2036, + "rewards/chosen": 1.2899169921875, + "rewards/margins": 7.265409616323618, + "rewards/rejected": -5.975492624136118, + "step": 272 + }, + { + "epoch": 0.10077984403119376, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.814724484980156e-06, + "logits/chosen": 1728581085.8666666, + "logits/rejected": 2453942512.9411764, + "logps/chosen": -244.68253580729166, + "logps/rejected": -412.92945772058823, + "loss": 0.1745, + "rewards/chosen": 1.0816758473714192, + "rewards/margins": 7.44792179032868, + "rewards/rejected": -6.366245942957261, + "step": 273 + }, + { + "epoch": 0.10114900096903696, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.813133885232962e-06, + "logits/chosen": 1383893219.5555556, + "logits/rejected": 1365664182.857143, + "logps/chosen": -265.19639756944446, + "logps/rejected": -401.4805385044643, + "loss": 0.2205, + "rewards/chosen": 0.8240726788838705, + "rewards/margins": 6.715718927837553, + "rewards/rejected": -5.891646248953683, + "step": 274 + }, + { + "epoch": 0.10151815790688017, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.811536616949921e-06, + "logits/chosen": 2127971564.3076923, + "logits/rejected": 2102075392.0, + "logps/chosen": -309.9605243389423, + "logps/rejected": -411.66270045230266, + "loss": 0.2082, + "rewards/chosen": 0.3268109835111178, + "rewards/margins": 5.845621054954375, + "rewards/rejected": -5.518810071443257, + "step": 275 + }, + { + "epoch": 0.10188731484472337, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.809932682344026e-06, + "logits/chosen": 1758097954.1333334, + "logits/rejected": 1285664045.1764705, + "logps/chosen": -214.67169596354168, + "logps/rejected": -421.70751953125, + "loss": 0.1434, + "rewards/chosen": 1.2558099110921224, + "rewards/margins": 8.197803175683115, + "rewards/rejected": -6.941993264590993, + "step": 276 + }, + { + "epoch": 0.10225647178256657, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.80832208363751e-06, + "logits/chosen": 2048963206.7368422, + "logits/rejected": 1745305284.9230769, + "logps/chosen": -330.1112510279605, + "logps/rejected": -380.83901742788464, + "loss": 0.2671, + "rewards/chosen": 0.39155254865947525, + "rewards/margins": 6.559855818265845, + "rewards/rejected": -6.16830326960637, + "step": 277 + }, + { + "epoch": 0.10262562872040977, + "grad_norm": 14.75, + "kl": 0.11701345443725586, + "learning_rate": 9.806704823061837e-06, + "logits/chosen": 1958317933.7142856, + "logits/rejected": 2712888661.3333335, + "logps/chosen": -371.164794921875, + "logps/rejected": -465.4070638020833, + "loss": 0.1909, + "rewards/chosen": 0.5397615432739258, + "rewards/margins": 7.27199141184489, + "rewards/rejected": -6.732229868570964, + "step": 278 + }, + { + "epoch": 0.10299478565825297, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.8050809028577e-06, + "logits/chosen": 1918053034.6666667, + "logits/rejected": 1417137814.5882354, + "logps/chosen": -386.6728515625, + "logps/rejected": -474.89401424632354, + "loss": 0.1921, + "rewards/chosen": 0.8444217681884766, + "rewards/margins": 5.563980484008789, + "rewards/rejected": -4.7195587158203125, + "step": 279 + }, + { + "epoch": 0.10336394259609616, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.803450325275018e-06, + "logits/chosen": 1564940151.4666667, + "logits/rejected": 1606698164.7058823, + "logps/chosen": -332.02177734375, + "logps/rejected": -423.7408088235294, + "loss": 0.1932, + "rewards/chosen": 0.6410570780436198, + "rewards/margins": 6.578573234408509, + "rewards/rejected": -5.9375161563648895, + "step": 280 + }, + { + "epoch": 0.10373309953393936, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.801813092572938e-06, + "logits/chosen": 2022453521.0666666, + "logits/rejected": 1927797699.764706, + "logps/chosen": -316.55735677083334, + "logps/rejected": -602.2984834558823, + "loss": 0.2096, + "rewards/chosen": 0.5625272115071615, + "rewards/margins": 6.531296060599533, + "rewards/rejected": -5.968768849092371, + "step": 281 + }, + { + "epoch": 0.10410225647178256, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.800169207019826e-06, + "logits/chosen": 1590730638.2222223, + "logits/rejected": 2300578084.571429, + "logps/chosen": -261.65985785590277, + "logps/rejected": -388.1918247767857, + "loss": 0.2176, + "rewards/chosen": 1.3194899029201932, + "rewards/margins": 6.649861759609646, + "rewards/rejected": -5.330371856689453, + "step": 282 + }, + { + "epoch": 0.10447141340962576, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.798518670893263e-06, + "logits/chosen": 1997319633.4545455, + "logits/rejected": 1809895628.8, + "logps/chosen": -300.9884588068182, + "logps/rejected": -305.700537109375, + "loss": 0.257, + "rewards/chosen": 1.1478494297374378, + "rewards/margins": 6.705653537403453, + "rewards/rejected": -5.557804107666016, + "step": 283 + }, + { + "epoch": 0.10484057034746896, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.796861486480045e-06, + "logits/chosen": 2167737074.5263157, + "logits/rejected": 1999945097.8461537, + "logps/chosen": -355.8480674342105, + "logps/rejected": -462.99789663461536, + "loss": 0.2363, + "rewards/chosen": 0.7633656451576635, + "rewards/margins": 7.962476409881221, + "rewards/rejected": -7.1991107647235575, + "step": 284 + }, + { + "epoch": 0.10520972728531217, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.795197656076182e-06, + "logits/chosen": 1632783360.0, + "logits/rejected": 1622156083.2, + "logps/chosen": -365.6959635416667, + "logps/rejected": -389.415771484375, + "loss": 0.2035, + "rewards/chosen": 0.1614638070265452, + "rewards/margins": 5.783546326557795, + "rewards/rejected": -5.62208251953125, + "step": 285 + }, + { + "epoch": 0.10557888422315537, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.793527181986888e-06, + "logits/chosen": 1597025621.3333333, + "logits/rejected": 1995834368.0, + "logps/chosen": -288.8605550130208, + "logps/rejected": -583.08427734375, + "loss": 0.1599, + "rewards/chosen": 0.588905135790507, + "rewards/margins": 7.20903529326121, + "rewards/rejected": -6.620130157470703, + "step": 286 + }, + { + "epoch": 0.10594804116099857, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.791850066526584e-06, + "logits/chosen": 3136440661.3333335, + "logits/rejected": 1721279078.4, + "logps/chosen": -299.9694010416667, + "logps/rejected": -455.729541015625, + "loss": 0.1563, + "rewards/chosen": 0.8755815029144287, + "rewards/margins": 6.879615545272827, + "rewards/rejected": -6.004034042358398, + "step": 287 + }, + { + "epoch": 0.10631719809884177, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.790166312018894e-06, + "logits/chosen": 1912333516.8, + "logits/rejected": 1403121749.3333333, + "logps/chosen": -269.272802734375, + "logps/rejected": -585.6920572916666, + "loss": 0.1895, + "rewards/chosen": 1.3292658805847168, + "rewards/margins": 8.927832571665446, + "rewards/rejected": -7.5985666910807295, + "step": 288 + }, + { + "epoch": 0.10668635503668497, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.788475920796638e-06, + "logits/chosen": 1915768149.3333333, + "logits/rejected": 2182330368.0, + "logps/chosen": -219.4649658203125, + "logps/rejected": -606.148486328125, + "loss": 0.1433, + "rewards/chosen": 1.0323289235432942, + "rewards/margins": 8.743295415242514, + "rewards/rejected": -7.710966491699219, + "step": 289 + }, + { + "epoch": 0.10705551197452817, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.78677889520183e-06, + "logits/chosen": 1379898254.2222223, + "logits/rejected": 1465265737.142857, + "logps/chosen": -278.90703667534723, + "logps/rejected": -607.9679129464286, + "loss": 0.1724, + "rewards/chosen": 1.3309804068671331, + "rewards/margins": 10.745853802514453, + "rewards/rejected": -9.414873395647321, + "step": 290 + }, + { + "epoch": 0.10742466891237137, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.785075237585678e-06, + "logits/chosen": 1743149738.6666667, + "logits/rejected": 2367486771.2, + "logps/chosen": -246.84893798828125, + "logps/rejected": -442.88212890625, + "loss": 0.1815, + "rewards/chosen": 0.6989297866821289, + "rewards/margins": 6.813461112976074, + "rewards/rejected": -6.114531326293945, + "step": 291 + }, + { + "epoch": 0.10779382585021458, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.783364950308578e-06, + "logits/chosen": 2242759626.105263, + "logits/rejected": 1857791054.7692308, + "logps/chosen": -222.3205694901316, + "logps/rejected": -491.5148362379808, + "loss": 0.2273, + "rewards/chosen": 1.0162039305034436, + "rewards/margins": 7.69055378871408, + "rewards/rejected": -6.674349858210637, + "step": 292 + }, + { + "epoch": 0.10816298278805778, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.78164803574011e-06, + "logits/chosen": 1661178831.2380953, + "logits/rejected": 1743329280.0, + "logps/chosen": -267.8080357142857, + "logps/rejected": -461.7286931818182, + "loss": 0.237, + "rewards/chosen": 1.0526900518508184, + "rewards/margins": 7.210590775394852, + "rewards/rejected": -6.157900723544034, + "step": 293 + }, + { + "epoch": 0.10853213972590098, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.77992449625904e-06, + "logits/chosen": 2574269553.7777777, + "logits/rejected": 1951157737.7391305, + "logps/chosen": -338.5415310329861, + "logps/rejected": -520.6604110054348, + "loss": 0.1202, + "rewards/chosen": 0.9983696407741971, + "rewards/margins": 7.912341914891044, + "rewards/rejected": -6.913972274116848, + "step": 294 + }, + { + "epoch": 0.10890129666374418, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.778194334253308e-06, + "logits/chosen": 1710206976.0, + "logits/rejected": 1996958515.2, + "logps/chosen": -320.86170151654414, + "logps/rejected": -440.87568359375, + "loss": 0.185, + "rewards/chosen": 1.0007241192985983, + "rewards/margins": 9.306014341466566, + "rewards/rejected": -8.305290222167969, + "step": 295 + }, + { + "epoch": 0.10927045360158738, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.776457552120034e-06, + "logits/chosen": 1621165738.6666667, + "logits/rejected": 2276909465.6, + "logps/chosen": -332.0484212239583, + "logps/rejected": -559.043603515625, + "loss": 0.1573, + "rewards/chosen": 0.8084496657053629, + "rewards/margins": 7.511607472101848, + "rewards/rejected": -6.703157806396485, + "step": 296 + }, + { + "epoch": 0.10963961053943058, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.774714152265504e-06, + "logits/chosen": 1846064878.9333334, + "logits/rejected": 1589098977.8823528, + "logps/chosen": -370.32848307291664, + "logps/rejected": -522.0892693014706, + "loss": 0.1944, + "rewards/chosen": 0.820379638671875, + "rewards/margins": 7.711372195973116, + "rewards/rejected": -6.8909925573012405, + "step": 297 + }, + { + "epoch": 0.11000876747727377, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.772964137105179e-06, + "logits/chosen": 2103658752.0, + "logits/rejected": 1707513600.0, + "logps/chosen": -293.0218505859375, + "logps/rejected": -517.441162109375, + "loss": 0.2262, + "rewards/chosen": 0.7265486121177673, + "rewards/margins": 7.379050195217133, + "rewards/rejected": -6.652501583099365, + "step": 298 + }, + { + "epoch": 0.11037792441511697, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.771207509063682e-06, + "logits/chosen": 2663499190.857143, + "logits/rejected": 2131321287.1111112, + "logps/chosen": -314.42539760044644, + "logps/rejected": -698.0623914930555, + "loss": 0.1401, + "rewards/chosen": 1.2839648383004325, + "rewards/margins": 9.750318360707118, + "rewards/rejected": -8.466353522406685, + "step": 299 + }, + { + "epoch": 0.11074708135296017, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 9.769444270574799e-06, + "logits/chosen": 1830025489.0666666, + "logits/rejected": 2108509485.1764705, + "logps/chosen": -279.42522786458335, + "logps/rejected": -379.85486557904414, + "loss": 0.1707, + "rewards/chosen": 1.3648675282796223, + "rewards/margins": 7.256301244099935, + "rewards/rejected": -5.8914337158203125, + "step": 300 + }, + { + "epoch": 0.11111623829080337, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.767674424081472e-06, + "logits/chosen": 2045057647.3043478, + "logits/rejected": 1836874638.2222223, + "logps/chosen": -272.3753821331522, + "logps/rejected": -629.0846896701389, + "loss": 0.2531, + "rewards/chosen": 1.2450005904487942, + "rewards/margins": 6.804306417271711, + "rewards/rejected": -5.559305826822917, + "step": 301 + }, + { + "epoch": 0.11148539522864657, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.765897972035806e-06, + "logits/chosen": 1641214520.8888888, + "logits/rejected": 2251253760.0, + "logps/chosen": -216.07187228732639, + "logps/rejected": -446.01681082589283, + "loss": 0.2016, + "rewards/chosen": 1.267933315700955, + "rewards/margins": 6.7012731158544145, + "rewards/rejected": -5.43333980015346, + "step": 302 + }, + { + "epoch": 0.11185455216648978, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.764114916899049e-06, + "logits/chosen": 1728858670.5454545, + "logits/rejected": 1600480012.1904762, + "logps/chosen": -306.89450905539775, + "logps/rejected": -547.4903738839286, + "loss": 0.122, + "rewards/chosen": 1.2787579623135654, + "rewards/margins": 8.23571219175925, + "rewards/rejected": -6.956954229445684, + "step": 303 + }, + { + "epoch": 0.11222370910433298, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.762325261141602e-06, + "logits/chosen": 1973804032.0, + "logits/rejected": 1802498420.3636363, + "logps/chosen": -259.93291015625, + "logps/rejected": -454.11501242897725, + "loss": 0.1344, + "rewards/chosen": 0.8137189865112304, + "rewards/margins": 7.040212821960449, + "rewards/rejected": -6.226493835449219, + "step": 304 + }, + { + "epoch": 0.11259286604217618, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.760529007243011e-06, + "logits/chosen": 1233308672.0, + "logits/rejected": 1955375427.368421, + "logps/chosen": -256.69649564302887, + "logps/rejected": -445.79183799342104, + "loss": 0.1076, + "rewards/chosen": 1.6934173290546124, + "rewards/margins": 8.444660518816125, + "rewards/rejected": -6.751243189761513, + "step": 305 + }, + { + "epoch": 0.11296202298001938, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.758726157691961e-06, + "logits/chosen": 1571382681.6, + "logits/rejected": 1838468778.6666667, + "logps/chosen": -286.5646240234375, + "logps/rejected": -386.9343668619792, + "loss": 0.2698, + "rewards/chosen": 0.43589019775390625, + "rewards/margins": 7.736979802449544, + "rewards/rejected": -7.301089604695638, + "step": 306 + }, + { + "epoch": 0.11333117991786258, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.75691671498628e-06, + "logits/chosen": 1985576277.3333333, + "logits/rejected": 1881284900.5714285, + "logps/chosen": -274.95830620659723, + "logps/rejected": -349.3916713169643, + "loss": 0.225, + "rewards/chosen": 1.0141366322835286, + "rewards/margins": 6.746578125726609, + "rewards/rejected": -5.732441493443081, + "step": 307 + }, + { + "epoch": 0.11370033685570578, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.755100681632924e-06, + "logits/chosen": 2392320602.352941, + "logits/rejected": 1872387959.4666667, + "logps/chosen": -367.8361385569853, + "logps/rejected": -436.31256510416665, + "loss": 0.1868, + "rewards/chosen": 1.0853500366210938, + "rewards/margins": 7.814613850911458, + "rewards/rejected": -6.7292638142903645, + "step": 308 + }, + { + "epoch": 0.11406949379354898, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.75327806014799e-06, + "logits/chosen": 1578351160.8888888, + "logits/rejected": 1900455204.5714285, + "logps/chosen": -245.11911349826389, + "logps/rejected": -531.3168247767857, + "loss": 0.2329, + "rewards/chosen": 0.5866188473171658, + "rewards/margins": 8.406115002102322, + "rewards/rejected": -7.819496154785156, + "step": 309 + }, + { + "epoch": 0.11443865073139219, + "grad_norm": 13.6875, + "kl": 0.30956125259399414, + "learning_rate": 9.75144885305669e-06, + "logits/chosen": 2038926677.3333333, + "logits/rejected": 2394494537.142857, + "logps/chosen": -314.65980360243054, + "logps/rejected": -412.06295340401783, + "loss": 0.2237, + "rewards/chosen": 1.2123469246758356, + "rewards/margins": 5.655106302291628, + "rewards/rejected": -4.4427593776157925, + "step": 310 + }, + { + "epoch": 0.11480780766923539, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.749613062893373e-06, + "logits/chosen": 1840382739.6923077, + "logits/rejected": 2134430127.1578948, + "logps/chosen": -234.12056790865384, + "logps/rejected": -493.5501644736842, + "loss": 0.1754, + "rewards/chosen": 0.6219647480891302, + "rewards/margins": 7.399088450288966, + "rewards/rejected": -6.777123702199836, + "step": 311 + }, + { + "epoch": 0.11517696460707859, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.7477706922015e-06, + "logits/chosen": 2764598125.714286, + "logits/rejected": 3046856704.0, + "logps/chosen": -257.80653599330356, + "logps/rejected": -560.7020399305555, + "loss": 0.1703, + "rewards/chosen": 1.3177879878452845, + "rewards/margins": 9.57884891449459, + "rewards/rejected": -8.261060926649305, + "step": 312 + }, + { + "epoch": 0.11554612154492179, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.745921743533653e-06, + "logits/chosen": 1473042311.5294118, + "logits/rejected": 2416495820.8, + "logps/chosen": -293.54865579044116, + "logps/rejected": -524.4080078125, + "loss": 0.1692, + "rewards/chosen": 1.6227103962617762, + "rewards/margins": 8.117225878846412, + "rewards/rejected": -6.494515482584635, + "step": 313 + }, + { + "epoch": 0.11591527848276499, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.744066219451526e-06, + "logits/chosen": 1809048629.8947368, + "logits/rejected": 1631027515.0769231, + "logps/chosen": -259.85580283717104, + "logps/rejected": -495.7014723557692, + "loss": 0.2551, + "rewards/chosen": 0.9207908228824013, + "rewards/margins": 6.081833843277534, + "rewards/rejected": -5.161043020395132, + "step": 314 + }, + { + "epoch": 0.11628443542060819, + "grad_norm": 12.625, + "kl": 0.8117055892944336, + "learning_rate": 9.742204122525925e-06, + "logits/chosen": 1466590208.0, + "logits/rejected": 2062477482.6666667, + "logps/chosen": -240.2080078125, + "logps/rejected": -498.1715901692708, + "loss": 0.1909, + "rewards/chosen": 1.6859901428222657, + "rewards/margins": 9.303480911254884, + "rewards/rejected": -7.617490768432617, + "step": 315 + }, + { + "epoch": 0.11665359235845138, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.740335455336762e-06, + "logits/chosen": 1330146872.8888888, + "logits/rejected": 1089062326.857143, + "logps/chosen": -257.4879557291667, + "logps/rejected": -413.96895926339283, + "loss": 0.1654, + "rewards/chosen": 1.4695846769544814, + "rewards/margins": 8.430021906655933, + "rewards/rejected": -6.9604372297014505, + "step": 316 + }, + { + "epoch": 0.11702274929629458, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.73846022047305e-06, + "logits/chosen": 1498745446.4, + "logits/rejected": 2244384954.181818, + "logps/chosen": -263.574169921875, + "logps/rejected": -554.7884410511364, + "loss": 0.1055, + "rewards/chosen": 1.3974539756774902, + "rewards/margins": 8.149565930800005, + "rewards/rejected": -6.752111955122515, + "step": 317 + }, + { + "epoch": 0.11739190623413778, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.736578420532904e-06, + "logits/chosen": 2426956746.105263, + "logits/rejected": 1393608388.9230769, + "logps/chosen": -329.1658357319079, + "logps/rejected": -409.3966721754808, + "loss": 0.2252, + "rewards/chosen": 1.0377998352050781, + "rewards/margins": 7.1182242173414965, + "rewards/rejected": -6.080424382136418, + "step": 318 + }, + { + "epoch": 0.11776106317198098, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.734690058123534e-06, + "logits/chosen": 1712908928.0, + "logits/rejected": 1759267328.0, + "logps/chosen": -218.2560577392578, + "logps/rejected": -392.0929870605469, + "loss": 0.1897, + "rewards/chosen": 0.8327862024307251, + "rewards/margins": 6.859002709388733, + "rewards/rejected": -6.026216506958008, + "step": 319 + }, + { + "epoch": 0.11813022010982419, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.732795135861245e-06, + "logits/chosen": 2732491629.714286, + "logits/rejected": 2010875221.3333333, + "logps/chosen": -347.94185965401783, + "logps/rejected": -581.3672417534722, + "loss": 0.1928, + "rewards/chosen": 0.5334914071219308, + "rewards/margins": 9.250272175622365, + "rewards/rejected": -8.716780768500435, + "step": 320 + }, + { + "epoch": 0.11849937704766739, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.73089365637142e-06, + "logits/chosen": 1505722488.4705882, + "logits/rejected": 1816177186.1333334, + "logps/chosen": -304.00896139705884, + "logps/rejected": -391.8509765625, + "loss": 0.194, + "rewards/chosen": 1.3245123694924748, + "rewards/margins": 7.886983377793255, + "rewards/rejected": -6.562471008300781, + "step": 321 + }, + { + "epoch": 0.11886853398551059, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.728985622288542e-06, + "logits/chosen": 2038449058.909091, + "logits/rejected": 1983236973.7142856, + "logps/chosen": -262.05118075284093, + "logps/rejected": -489.99232700892856, + "loss": 0.1054, + "rewards/chosen": 1.5703854994340376, + "rewards/margins": 8.481939745155763, + "rewards/rejected": -6.911554245721726, + "step": 322 + }, + { + "epoch": 0.11923769092335379, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.727071036256166e-06, + "logits/chosen": 1860948608.0, + "logits/rejected": 1410873216.0, + "logps/chosen": -269.7547912597656, + "logps/rejected": -541.3761596679688, + "loss": 0.211, + "rewards/chosen": 0.6384012699127197, + "rewards/margins": 7.099425554275513, + "rewards/rejected": -6.461024284362793, + "step": 323 + }, + { + "epoch": 0.11960684786119699, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 9.725149900926925e-06, + "logits/chosen": 1793793638.4, + "logits/rejected": 1772903484.235294, + "logps/chosen": -240.184130859375, + "logps/rejected": -397.7808191636029, + "loss": 0.1603, + "rewards/chosen": 1.5827176411946615, + "rewards/margins": 7.00188824522729, + "rewards/rejected": -5.419170604032629, + "step": 324 + }, + { + "epoch": 0.11997600479904019, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.723222218962529e-06, + "logits/chosen": 1886661911.2727273, + "logits/rejected": 1809809408.0, + "logps/chosen": -277.7151988636364, + "logps/rejected": -460.1244419642857, + "loss": 0.1644, + "rewards/chosen": 0.5133852091702548, + "rewards/margins": 7.712075714425091, + "rewards/rejected": -7.198690505254836, + "step": 325 + }, + { + "epoch": 0.1203451617368834, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.721287993033757e-06, + "logits/chosen": 1701160116.7058823, + "logits/rejected": 2235247547.733333, + "logps/chosen": -241.08389820772058, + "logps/rejected": -506.75305989583336, + "loss": 0.1638, + "rewards/chosen": 1.340056812061983, + "rewards/margins": 8.68946273654115, + "rewards/rejected": -7.349405924479167, + "step": 326 + }, + { + "epoch": 0.1207143186747266, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.719347225820453e-06, + "logits/chosen": 1275668224.0, + "logits/rejected": 1838634240.0, + "logps/chosen": -300.2945251464844, + "logps/rejected": -420.5171203613281, + "loss": 0.2081, + "rewards/chosen": 0.9389196634292603, + "rewards/margins": 7.048129200935364, + "rewards/rejected": -6.1092095375061035, + "step": 327 + }, + { + "epoch": 0.1210834756125698, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.717399920011527e-06, + "logits/chosen": 2098533888.0, + "logits/rejected": 2152257536.0, + "logps/chosen": -291.8856608072917, + "logps/rejected": -451.26796875, + "loss": 0.1319, + "rewards/chosen": 1.4894905090332031, + "rewards/margins": 7.712226486206054, + "rewards/rejected": -6.222735977172851, + "step": 328 + }, + { + "epoch": 0.121452632550413, + "grad_norm": 13.5, + "kl": 0.36493349075317383, + "learning_rate": 9.715446078304946e-06, + "logits/chosen": 1826282961.4545455, + "logits/rejected": 2429105688.3809524, + "logps/chosen": -294.46493252840907, + "logps/rejected": -470.8598865327381, + "loss": 0.1512, + "rewards/chosen": 0.8230190277099609, + "rewards/margins": 7.52409063066755, + "rewards/rejected": -6.701071602957589, + "step": 329 + }, + { + "epoch": 0.1218217894882562, + "grad_norm": 16.75, + "kl": 0.20928645133972168, + "learning_rate": 9.713485703407732e-06, + "logits/chosen": 2193311623.529412, + "logits/rejected": 2448197222.4, + "logps/chosen": -385.33898207720586, + "logps/rejected": -484.82776692708336, + "loss": 0.219, + "rewards/chosen": 0.7785900901345646, + "rewards/margins": 7.287018027960086, + "rewards/rejected": -6.5084279378255205, + "step": 330 + }, + { + "epoch": 0.1221909464260994, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.71151879803596e-06, + "logits/chosen": 2241329259.7894735, + "logits/rejected": 1130339485.5384614, + "logps/chosen": -260.50791529605266, + "logps/rejected": -332.5345928485577, + "loss": 0.248, + "rewards/chosen": 0.8129700610512182, + "rewards/margins": 5.653278910679373, + "rewards/rejected": -4.840308849628155, + "step": 331 + }, + { + "epoch": 0.1225601033639426, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.709545364914754e-06, + "logits/chosen": 2046657536.0, + "logits/rejected": 1764187704.8888888, + "logps/chosen": -333.5389927455357, + "logps/rejected": -372.90869140625, + "loss": 0.1923, + "rewards/chosen": 0.5461257525852748, + "rewards/margins": 7.192624962519086, + "rewards/rejected": -6.646499209933811, + "step": 332 + }, + { + "epoch": 0.1229292603017858, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.707565406778277e-06, + "logits/chosen": 1631706112.0, + "logits/rejected": 1564445440.0, + "logps/chosen": -312.7308349609375, + "logps/rejected": -476.3707275390625, + "loss": 0.186, + "rewards/chosen": 0.8617057204246521, + "rewards/margins": 7.816310822963715, + "rewards/rejected": -6.9546051025390625, + "step": 333 + }, + { + "epoch": 0.12329841723962899, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.70557892636974e-06, + "logits/chosen": 1563645405.8666666, + "logits/rejected": 2005603388.235294, + "logps/chosen": -325.72259114583335, + "logps/rejected": -540.3509880514706, + "loss": 0.1756, + "rewards/chosen": 1.1565523783365885, + "rewards/margins": 8.76632231170056, + "rewards/rejected": -7.609769933363971, + "step": 334 + }, + { + "epoch": 0.12366757417747219, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 9.703585926441383e-06, + "logits/chosen": 1484886893.7142856, + "logits/rejected": 2080737280.0, + "logps/chosen": -170.3687744140625, + "logps/rejected": -483.0564236111111, + "loss": 0.1411, + "rewards/chosen": 2.1464057649884904, + "rewards/margins": 8.668394103882804, + "rewards/rejected": -6.521988338894314, + "step": 335 + }, + { + "epoch": 0.1240367311153154, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.701586409754484e-06, + "logits/chosen": 1545110797.4736843, + "logits/rejected": 1759782596.9230769, + "logps/chosen": -304.8999280427632, + "logps/rejected": -524.0590444711538, + "loss": 0.2321, + "rewards/chosen": 1.1281626851935136, + "rewards/margins": 8.60429207033474, + "rewards/rejected": -7.476129385141226, + "step": 336 + }, + { + "epoch": 0.1244058880531586, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.699580379079348e-06, + "logits/chosen": 1697938688.0, + "logits/rejected": 1860971520.0, + "logps/chosen": -261.2745361328125, + "logps/rejected": -562.5830078125, + "loss": 0.2311, + "rewards/chosen": 0.3474116325378418, + "rewards/margins": 7.845529556274414, + "rewards/rejected": -7.498117923736572, + "step": 337 + }, + { + "epoch": 0.1247750449910018, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.697567837195303e-06, + "logits/chosen": 2704874556.2352943, + "logits/rejected": 1485390097.0666666, + "logps/chosen": -337.89728860294116, + "logps/rejected": -408.2734375, + "loss": 0.186, + "rewards/chosen": 1.374041950001436, + "rewards/margins": 7.340348000619925, + "rewards/rejected": -5.966306050618489, + "step": 338 + }, + { + "epoch": 0.125144201928845, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.695548786890701e-06, + "logits/chosen": 1532585642.6666667, + "logits/rejected": 1877556955.4285715, + "logps/chosen": -239.21115451388889, + "logps/rejected": -441.5912388392857, + "loss": 0.1551, + "rewards/chosen": 2.1020791795518665, + "rewards/margins": 9.117091103205606, + "rewards/rejected": -7.015011923653739, + "step": 339 + }, + { + "epoch": 0.125144201928845, + "eval_kl": 0.0, + "eval_logits/chosen": 3507292306.985646, + "eval_logits/rejected": 3533149028.848485, + "eval_logps/chosen": -295.8917090311005, + "eval_logps/rejected": -468.6000405844156, + "eval_loss": 0.16918018460273743, + "eval_rewards/chosen": 1.1123073158081638, + "eval_rewards/margins": 8.199268409727265, + "eval_rewards/rejected": -7.0869610939191015, + "eval_runtime": 109.5846, + "eval_samples_per_second": 7.994, + "eval_steps_per_second": 0.502, + "step": 339 + }, + { + "epoch": 0.1255133588666882, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.693523230962914e-06, + "logits/chosen": 1651297109.3333333, + "logits/rejected": 1667680563.2, + "logps/chosen": -226.25838216145834, + "logps/rejected": -351.815673828125, + "loss": 0.1159, + "rewards/chosen": 1.7283385594685872, + "rewards/margins": 6.712252744038899, + "rewards/rejected": -4.983914184570312, + "step": 340 + }, + { + "epoch": 0.12588251580453141, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.691491172218318e-06, + "logits/chosen": 2188598218.105263, + "logits/rejected": 2458437474.4615383, + "logps/chosen": -262.4993832236842, + "logps/rejected": -522.5113431490385, + "loss": 0.2481, + "rewards/chosen": 0.8335970828407689, + "rewards/margins": 11.080357524547498, + "rewards/rejected": -10.24676044170673, + "step": 341 + }, + { + "epoch": 0.12625167274237462, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.689452613472309e-06, + "logits/chosen": 1638323712.0, + "logits/rejected": 2167182540.8, + "logps/chosen": -275.1349283854167, + "logps/rejected": -418.9759765625, + "loss": 0.1506, + "rewards/chosen": 1.4113213221232097, + "rewards/margins": 7.963069979349773, + "rewards/rejected": -6.551748657226563, + "step": 342 + }, + { + "epoch": 0.1266208296802178, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.687407557549282e-06, + "logits/chosen": 1754517288.4210527, + "logits/rejected": 1831039448.6153846, + "logps/chosen": -292.2557308799342, + "logps/rejected": -441.79871544471155, + "loss": 0.1941, + "rewards/chosen": 1.4750100185996609, + "rewards/margins": 8.412591007556992, + "rewards/rejected": -6.937580988957332, + "step": 343 + }, + { + "epoch": 0.126989986618061, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.685356007282639e-06, + "logits/chosen": 1718597339.4285715, + "logits/rejected": 1479137373.090909, + "logps/chosen": -250.99583798363096, + "logps/rejected": -394.78280362215907, + "loss": 0.227, + "rewards/chosen": 1.3655876886276972, + "rewards/margins": 7.31252316891889, + "rewards/rejected": -5.946935480291193, + "step": 344 + }, + { + "epoch": 0.1273591435559042, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 9.683297965514774e-06, + "logits/chosen": 1946383938.7826087, + "logits/rejected": 1930582243.5555556, + "logps/chosen": -288.39451002038044, + "logps/rejected": -341.96275499131946, + "loss": 0.3316, + "rewards/chosen": 0.5754285480665124, + "rewards/margins": 5.109883741480141, + "rewards/rejected": -4.534455193413629, + "step": 345 + }, + { + "epoch": 0.1277283004937474, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.681233435097078e-06, + "logits/chosen": 1143471360.0, + "logits/rejected": 1413154560.0, + "logps/chosen": -261.47674560546875, + "logps/rejected": -494.34368896484375, + "loss": 0.1635, + "rewards/chosen": 1.2011501789093018, + "rewards/margins": 9.124915361404419, + "rewards/rejected": -7.923765182495117, + "step": 346 + }, + { + "epoch": 0.1280974574315906, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.679162418889932e-06, + "logits/chosen": 1108339029.3333333, + "logits/rejected": 1607356416.0, + "logps/chosen": -256.07411024305554, + "logps/rejected": -452.017333984375, + "loss": 0.193, + "rewards/chosen": 1.5214390224880643, + "rewards/margins": 7.945553794739739, + "rewards/rejected": -6.4241147722516745, + "step": 347 + }, + { + "epoch": 0.1284666143694338, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.677084919762703e-06, + "logits/chosen": 1343635937.8823528, + "logits/rejected": 2056398711.4666667, + "logps/chosen": -260.0425379136029, + "logps/rejected": -497.38255208333334, + "loss": 0.1698, + "rewards/chosen": 1.5038629419663374, + "rewards/margins": 8.75862927904316, + "rewards/rejected": -7.2547663370768225, + "step": 348 + }, + { + "epoch": 0.128835771307277, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 9.675000940593738e-06, + "logits/chosen": 1967014249.4117646, + "logits/rejected": 1834801834.6666667, + "logps/chosen": -295.69651884191177, + "logps/rejected": -461.6944986979167, + "loss": 0.1516, + "rewards/chosen": 1.4664307762594784, + "rewards/margins": 8.267338165582395, + "rewards/rejected": -6.800907389322917, + "step": 349 + }, + { + "epoch": 0.1292049282451202, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.672910484270367e-06, + "logits/chosen": 2704886784.0, + "logits/rejected": 2213251328.0, + "logps/chosen": -344.67547607421875, + "logps/rejected": -511.8169250488281, + "loss": 0.1856, + "rewards/chosen": 1.0536870956420898, + "rewards/margins": 9.18661880493164, + "rewards/rejected": -8.13293170928955, + "step": 350 + }, + { + "epoch": 0.1295740851829634, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.670813553688888e-06, + "logits/chosen": 2081259008.0, + "logits/rejected": 2255987968.0, + "logps/chosen": -303.13360595703125, + "logps/rejected": -460.738037109375, + "loss": 0.1976, + "rewards/chosen": 0.7987386584281921, + "rewards/margins": 6.779437243938446, + "rewards/rejected": -5.980698585510254, + "step": 351 + }, + { + "epoch": 0.1299432421208066, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.668710151754572e-06, + "logits/chosen": 2571791521.6842103, + "logits/rejected": 1672823256.6153846, + "logps/chosen": -229.90560752467104, + "logps/rejected": -448.85535606971155, + "loss": 0.2012, + "rewards/chosen": 1.4017832906622636, + "rewards/margins": 7.323555000397841, + "rewards/rejected": -5.921771709735577, + "step": 352 + }, + { + "epoch": 0.1303123990586498, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 9.666600281381657e-06, + "logits/chosen": 1628455227.0769231, + "logits/rejected": 2085885628.631579, + "logps/chosen": -323.06847205528845, + "logps/rejected": -530.048828125, + "loss": 0.1701, + "rewards/chosen": 0.5004661266620343, + "rewards/margins": 9.518498802957264, + "rewards/rejected": -9.01803267629523, + "step": 353 + }, + { + "epoch": 0.130681555996493, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.664483945493342e-06, + "logits/chosen": 1895652420.2666667, + "logits/rejected": 1853399040.0, + "logps/chosen": -326.6705729166667, + "logps/rejected": -428.90866268382354, + "loss": 0.1936, + "rewards/chosen": 0.7678691864013671, + "rewards/margins": 7.024685377233169, + "rewards/rejected": -6.256816190831802, + "step": 354 + }, + { + "epoch": 0.1310507129343362, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.66236114702178e-06, + "logits/chosen": 2007081915.7333333, + "logits/rejected": 1816121103.0588236, + "logps/chosen": -269.1964518229167, + "logps/rejected": -401.4315544577206, + "loss": 0.1985, + "rewards/chosen": 0.5368651072184245, + "rewards/margins": 6.8147743299895644, + "rewards/rejected": -6.2779092227711395, + "step": 355 + }, + { + "epoch": 0.1314198698721794, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.660231888908085e-06, + "logits/chosen": 1526057984.0, + "logits/rejected": 1883630592.0, + "logps/chosen": -237.85587565104166, + "logps/rejected": -416.067919921875, + "loss": 0.1383, + "rewards/chosen": 1.2615716457366943, + "rewards/margins": 6.7582234859466555, + "rewards/rejected": -5.496651840209961, + "step": 356 + }, + { + "epoch": 0.1317890268100226, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.658096174102314e-06, + "logits/chosen": 2144994872.8888888, + "logits/rejected": 1681049014.857143, + "logps/chosen": -291.23741319444446, + "logps/rejected": -578.5938197544643, + "loss": 0.2063, + "rewards/chosen": 1.0190391540527344, + "rewards/margins": 9.165148598807198, + "rewards/rejected": -8.146109444754464, + "step": 357 + }, + { + "epoch": 0.1321581837478658, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.655954005563475e-06, + "logits/chosen": 2134261613.7142856, + "logits/rejected": 2313300878.2222223, + "logps/chosen": -261.28731863839283, + "logps/rejected": -567.4992947048611, + "loss": 0.1841, + "rewards/chosen": 0.5613866533551898, + "rewards/margins": 8.280863141256665, + "rewards/rejected": -7.719476487901476, + "step": 358 + }, + { + "epoch": 0.132527340685709, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.653805386259514e-06, + "logits/chosen": 1798780807.5294118, + "logits/rejected": 2186673493.3333335, + "logps/chosen": -204.4872328814338, + "logps/rejected": -393.27692057291665, + "loss": 0.2058, + "rewards/chosen": 1.0325897441190832, + "rewards/margins": 7.006276471007104, + "rewards/rejected": -5.973686726888021, + "step": 359 + }, + { + "epoch": 0.1328964976235522, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.651650319167318e-06, + "logits/chosen": 2094073309.8666666, + "logits/rejected": 2171559213.1764708, + "logps/chosen": -330.50556640625, + "logps/rejected": -503.8912568933824, + "loss": 0.2035, + "rewards/chosen": 0.5402606328328451, + "rewards/margins": 7.653179710986567, + "rewards/rejected": -7.112919078153722, + "step": 360 + }, + { + "epoch": 0.1332656545613954, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.649488807272703e-06, + "logits/chosen": 2180775480.888889, + "logits/rejected": 1574576420.5714285, + "logps/chosen": -318.3544108072917, + "logps/rejected": -387.89100864955356, + "loss": 0.2227, + "rewards/chosen": 0.6724677615695529, + "rewards/margins": 7.089004910181439, + "rewards/rejected": -6.416537148611886, + "step": 361 + }, + { + "epoch": 0.13363481149923861, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.647320853570415e-06, + "logits/chosen": 1386188390.4, + "logits/rejected": 1224655811.764706, + "logps/chosen": -259.0849609375, + "logps/rejected": -412.5089326746324, + "loss": 0.1662, + "rewards/chosen": 1.3073441823323568, + "rewards/margins": 6.889398769303864, + "rewards/rejected": -5.582054586971507, + "step": 362 + }, + { + "epoch": 0.13400396843708182, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.645146461064129e-06, + "logits/chosen": 1579306368.0, + "logits/rejected": 2029641344.0, + "logps/chosen": -258.07177734375, + "logps/rejected": -465.22698974609375, + "loss": 0.1438, + "rewards/chosen": 1.7284153699874878, + "rewards/margins": 8.150326609611511, + "rewards/rejected": -6.421911239624023, + "step": 363 + }, + { + "epoch": 0.13437312537492502, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.642965632766437e-06, + "logits/chosen": 1939270542.2222223, + "logits/rejected": 2043072804.5714285, + "logps/chosen": -297.59784613715277, + "logps/rejected": -480.90774972098217, + "loss": 0.2402, + "rewards/chosen": 0.47220929463704425, + "rewards/margins": 6.6547501881917315, + "rewards/rejected": -6.1825408935546875, + "step": 364 + }, + { + "epoch": 0.13474228231276822, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 9.64077837169885e-06, + "logits/chosen": 2184852102.736842, + "logits/rejected": 1433711064.6153846, + "logps/chosen": -340.2958727384868, + "logps/rejected": -438.2702073317308, + "loss": 0.2597, + "rewards/chosen": 0.4157489224484092, + "rewards/margins": 6.1706458199844665, + "rewards/rejected": -5.7548968975360575, + "step": 365 + }, + { + "epoch": 0.13511143925061142, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.638584680891787e-06, + "logits/chosen": 1377009408.0, + "logits/rejected": 1552623744.0, + "logps/chosen": -264.9659423828125, + "logps/rejected": -515.0516967773438, + "loss": 0.1546, + "rewards/chosen": 1.8077906370162964, + "rewards/margins": 8.66408383846283, + "rewards/rejected": -6.856293201446533, + "step": 366 + }, + { + "epoch": 0.13548059618845462, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.636384563384584e-06, + "logits/chosen": 2714350250.6666665, + "logits/rejected": 1742696857.6, + "logps/chosen": -232.76542154947916, + "logps/rejected": -429.0328125, + "loss": 0.1596, + "rewards/chosen": 0.8740178743998209, + "rewards/margins": 7.745548025767008, + "rewards/rejected": -6.871530151367187, + "step": 367 + }, + { + "epoch": 0.13584975312629782, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.634178022225474e-06, + "logits/chosen": 1541684792.8888888, + "logits/rejected": 1660612900.5714285, + "logps/chosen": -310.5464138454861, + "logps/rejected": -466.06333705357144, + "loss": 0.1992, + "rewards/chosen": 1.0209729936387804, + "rewards/margins": 8.210181433057027, + "rewards/rejected": -7.189208439418247, + "step": 368 + }, + { + "epoch": 0.13621891006414102, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.631965060471587e-06, + "logits/chosen": 1774833117.8666666, + "logits/rejected": 1743265430.5882354, + "logps/chosen": -350.38310546875, + "logps/rejected": -518.9946001838235, + "loss": 0.1355, + "rewards/chosen": 1.6168473561604817, + "rewards/margins": 9.287025937847062, + "rewards/rejected": -7.670178581686581, + "step": 369 + }, + { + "epoch": 0.13658806700198423, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.62974568118896e-06, + "logits/chosen": 1333180054.5882354, + "logits/rejected": 1949014562.1333334, + "logps/chosen": -313.18537454044116, + "logps/rejected": -438.5080078125, + "loss": 0.2563, + "rewards/chosen": 0.395992166855756, + "rewards/margins": 8.279177998561485, + "rewards/rejected": -7.883185831705729, + "step": 370 + }, + { + "epoch": 0.13695722393982743, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.627519887452512e-06, + "logits/chosen": 1461645312.0, + "logits/rejected": 1523881252.5714285, + "logps/chosen": -232.86726888020834, + "logps/rejected": -404.53438895089283, + "loss": 0.2131, + "rewards/chosen": 0.7098377015855577, + "rewards/margins": 7.758998795161172, + "rewards/rejected": -7.049161093575614, + "step": 371 + }, + { + "epoch": 0.13732638087767063, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.625287682346051e-06, + "logits/chosen": 1684139446.857143, + "logits/rejected": 1731419249.7777777, + "logps/chosen": -312.4532993861607, + "logps/rejected": -485.69325086805554, + "loss": 0.1471, + "rewards/chosen": 1.1038735253470284, + "rewards/margins": 8.035731588091169, + "rewards/rejected": -6.931858062744141, + "step": 372 + }, + { + "epoch": 0.13769553781551383, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.62304906896227e-06, + "logits/chosen": 2146312021.3333333, + "logits/rejected": 1677266329.6, + "logps/chosen": -295.2914225260417, + "logps/rejected": -571.55419921875, + "loss": 0.1506, + "rewards/chosen": 0.7845592498779297, + "rewards/margins": 8.31790657043457, + "rewards/rejected": -7.533347320556641, + "step": 373 + }, + { + "epoch": 0.13806469475335703, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.620804050402738e-06, + "logits/chosen": 2354609083.733333, + "logits/rejected": 1766122797.1764705, + "logps/chosen": -231.64274088541666, + "logps/rejected": -583.2599379595588, + "loss": 0.1549, + "rewards/chosen": 1.2507829030354818, + "rewards/margins": 9.06287899391324, + "rewards/rejected": -7.812096090877757, + "step": 374 + }, + { + "epoch": 0.13843385169120023, + "grad_norm": 11.9375, + "kl": 0.1382436752319336, + "learning_rate": 9.618552629777904e-06, + "logits/chosen": 1584362732.3076923, + "logits/rejected": 1476279026.5263157, + "logps/chosen": -240.361572265625, + "logps/rejected": -382.98758737664474, + "loss": 0.1666, + "rewards/chosen": 1.0091890188363881, + "rewards/margins": 6.631763674469612, + "rewards/rejected": -5.6225746556332235, + "step": 375 + }, + { + "epoch": 0.13880300862904343, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.616294810207077e-06, + "logits/chosen": 1225417334.1538463, + "logits/rejected": 1242104993.6842105, + "logps/chosen": -283.8134577824519, + "logps/rejected": -516.4565172697369, + "loss": 0.1208, + "rewards/chosen": 1.4325991410475512, + "rewards/margins": 9.382924554801663, + "rewards/rejected": -7.950325413754112, + "step": 376 + }, + { + "epoch": 0.13917216556688664, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.61403059481844e-06, + "logits/chosen": 2342250359.4666667, + "logits/rejected": 1338388239.0588236, + "logps/chosen": -264.5478515625, + "logps/rejected": -502.7103630514706, + "loss": 0.1313, + "rewards/chosen": 1.7053250630696615, + "rewards/margins": 8.98542526843501, + "rewards/rejected": -7.28010020536535, + "step": 377 + }, + { + "epoch": 0.13954132250472984, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.611759986749036e-06, + "logits/chosen": 1735746560.0, + "logits/rejected": 2172561035.6363635, + "logps/chosen": -279.0372802734375, + "logps/rejected": -553.6524325284091, + "loss": 0.106, + "rewards/chosen": 1.6629119873046876, + "rewards/margins": 9.801081015846945, + "rewards/rejected": -8.138169028542258, + "step": 378 + }, + { + "epoch": 0.139910479442573, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.60948298914476e-06, + "logits/chosen": 1631841099.2941177, + "logits/rejected": 2150236979.2, + "logps/chosen": -319.9461454503676, + "logps/rejected": -540.7320963541666, + "loss": 0.2001, + "rewards/chosen": 0.868106393253102, + "rewards/margins": 8.632727364932789, + "rewards/rejected": -7.764620971679688, + "step": 379 + }, + { + "epoch": 0.1402796363804162, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.607199605160367e-06, + "logits/chosen": 2015413376.0, + "logits/rejected": 1986445824.0, + "logps/chosen": -297.494140625, + "logps/rejected": -541.105712890625, + "loss": 0.2012, + "rewards/chosen": 0.6245735883712769, + "rewards/margins": 8.115182280540466, + "rewards/rejected": -7.4906086921691895, + "step": 380 + }, + { + "epoch": 0.1406487933182594, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.604909837959456e-06, + "logits/chosen": 1235722778.9473684, + "logits/rejected": 1658123972.9230769, + "logps/chosen": -245.80111533717104, + "logps/rejected": -544.52783203125, + "loss": 0.1952, + "rewards/chosen": 1.3158021224172491, + "rewards/margins": 9.832599701669052, + "rewards/rejected": -8.516797579251802, + "step": 381 + }, + { + "epoch": 0.14101795025610261, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 9.602613690714468e-06, + "logits/chosen": 1798145228.8, + "logits/rejected": 1696284431.0588236, + "logps/chosen": -246.39615885416666, + "logps/rejected": -535.2096737132352, + "loss": 0.1557, + "rewards/chosen": 1.6111700693766275, + "rewards/margins": 7.949309771668677, + "rewards/rejected": -6.338139702292049, + "step": 382 + }, + { + "epoch": 0.14138710719394582, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.600311166606687e-06, + "logits/chosen": 1723616000.0, + "logits/rejected": 1721759872.0, + "logps/chosen": -247.85740661621094, + "logps/rejected": -430.78985595703125, + "loss": 0.1906, + "rewards/chosen": 1.1673980951309204, + "rewards/margins": 7.764888405799866, + "rewards/rejected": -6.597490310668945, + "step": 383 + }, + { + "epoch": 0.14175626413178902, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.59800226882623e-06, + "logits/chosen": 1383921800.5333333, + "logits/rejected": 1733275045.6470587, + "logps/chosen": -268.0473307291667, + "logps/rejected": -462.4274471507353, + "loss": 0.1725, + "rewards/chosen": 1.16984011332194, + "rewards/margins": 8.810279315125708, + "rewards/rejected": -7.640439201803768, + "step": 384 + }, + { + "epoch": 0.14212542106963222, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.595687000572049e-06, + "logits/chosen": 1998519374.7692308, + "logits/rejected": 2749480528.8421054, + "logps/chosen": -321.11733774038464, + "logps/rejected": -401.9541015625, + "loss": 0.1853, + "rewards/chosen": 0.8211579689612756, + "rewards/margins": 6.721532532078053, + "rewards/rejected": -5.9003745631167765, + "step": 385 + }, + { + "epoch": 0.14249457800747542, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 9.593365365051915e-06, + "logits/chosen": 1360691830.1538463, + "logits/rejected": 1720151417.2631578, + "logps/chosen": -177.74866661658655, + "logps/rejected": -441.29702919407896, + "loss": 0.1702, + "rewards/chosen": 0.8808885721059946, + "rewards/margins": 7.712759203273757, + "rewards/rejected": -6.831870631167763, + "step": 386 + }, + { + "epoch": 0.14286373494531862, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.591037365482424e-06, + "logits/chosen": 2256725196.8, + "logits/rejected": 2085053952.0, + "logps/chosen": -305.390380859375, + "logps/rejected": -459.892333984375, + "loss": 0.2566, + "rewards/chosen": 0.7829440116882325, + "rewards/margins": 9.776160208384196, + "rewards/rejected": -8.993216196695963, + "step": 387 + }, + { + "epoch": 0.14323289188316182, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.588703005088994e-06, + "logits/chosen": 1741253563.7333333, + "logits/rejected": 1799208960.0, + "logps/chosen": -257.41346028645836, + "logps/rejected": -508.30003446691177, + "loss": 0.1939, + "rewards/chosen": 0.6550120671590169, + "rewards/margins": 8.80546282973944, + "rewards/rejected": -8.150450762580423, + "step": 388 + }, + { + "epoch": 0.14360204882100502, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.58636228710585e-06, + "logits/chosen": 1820005797.6470587, + "logits/rejected": 1911634329.6, + "logps/chosen": -342.0751378676471, + "logps/rejected": -481.21692708333336, + "loss": 0.2505, + "rewards/chosen": 0.30485038196339326, + "rewards/margins": 9.440752278122247, + "rewards/rejected": -9.135901896158854, + "step": 389 + }, + { + "epoch": 0.14397120575884823, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.584015214776025e-06, + "logits/chosen": 1938715554.909091, + "logits/rejected": 1540675788.8, + "logps/chosen": -287.2528631036932, + "logps/rejected": -324.00595703125, + "loss": 0.2331, + "rewards/chosen": 1.344713644547896, + "rewards/margins": 6.631439642472701, + "rewards/rejected": -5.286725997924805, + "step": 390 + }, + { + "epoch": 0.14434036269669143, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.58166179135136e-06, + "logits/chosen": 1857606948.5714285, + "logits/rejected": 1266557383.1111112, + "logps/chosen": -299.09517996651783, + "logps/rejected": -426.6337619357639, + "loss": 0.1607, + "rewards/chosen": 1.1080269813537598, + "rewards/margins": 7.754366821712917, + "rewards/rejected": -6.646339840359158, + "step": 391 + }, + { + "epoch": 0.14470951963453463, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.579302020092491e-06, + "logits/chosen": 1742936883.2, + "logits/rejected": 1950550186.6666667, + "logps/chosen": -289.5045654296875, + "logps/rejected": -446.995361328125, + "loss": 0.2415, + "rewards/chosen": 0.8267616271972656, + "rewards/margins": 8.21956164042155, + "rewards/rejected": -7.392800013224284, + "step": 392 + }, + { + "epoch": 0.14507867657237783, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.576935904268853e-06, + "logits/chosen": 1455717668.5714285, + "logits/rejected": 1913486904.8888888, + "logps/chosen": -308.18258231026783, + "logps/rejected": -437.35047743055554, + "loss": 0.1589, + "rewards/chosen": 1.2528223310198103, + "rewards/margins": 7.9809246971493675, + "rewards/rejected": -6.728102366129558, + "step": 393 + }, + { + "epoch": 0.14544783351022103, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.574563447158671e-06, + "logits/chosen": 2003695495.5294118, + "logits/rejected": 2082482312.5333333, + "logps/chosen": -256.9423253676471, + "logps/rejected": -444.6559244791667, + "loss": 0.1812, + "rewards/chosen": 1.133278678445255, + "rewards/margins": 8.316498588113223, + "rewards/rejected": -7.183219909667969, + "step": 394 + }, + { + "epoch": 0.14581699044806423, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.57218465204895e-06, + "logits/chosen": 1726928759.4666667, + "logits/rejected": 1980301071.0588236, + "logps/chosen": -273.89326171875, + "logps/rejected": -412.39636948529414, + "loss": 0.1712, + "rewards/chosen": 0.9172369639078776, + "rewards/margins": 7.958772734099743, + "rewards/rejected": -7.0415357701918655, + "step": 395 + }, + { + "epoch": 0.14618614738590743, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.569799522235484e-06, + "logits/chosen": 1709594624.0, + "logits/rejected": 1723621376.0, + "logps/chosen": -258.77618815104165, + "logps/rejected": -381.2661994485294, + "loss": 0.1734, + "rewards/chosen": 0.8910394032796224, + "rewards/margins": 7.070839100258023, + "rewards/rejected": -6.1797996969784, + "step": 396 + }, + { + "epoch": 0.14655530432375063, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 9.567408061022838e-06, + "logits/chosen": 3571920896.0, + "logits/rejected": 2737334954.6666665, + "logps/chosen": -239.96843610491072, + "logps/rejected": -519.90673828125, + "loss": 0.1324, + "rewards/chosen": 1.3232063565935408, + "rewards/margins": 7.757024326021709, + "rewards/rejected": -6.433817969428168, + "step": 397 + }, + { + "epoch": 0.14692446126159384, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 9.565010271724353e-06, + "logits/chosen": 1958242759.1111112, + "logits/rejected": 2901280182.857143, + "logps/chosen": -344.83238389756946, + "logps/rejected": -403.4195033482143, + "loss": 0.2265, + "rewards/chosen": 0.8392838372124566, + "rewards/margins": 6.468618423219711, + "rewards/rejected": -5.629334586007254, + "step": 398 + }, + { + "epoch": 0.14729361819943704, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 9.562606157662132e-06, + "logits/chosen": 1543794560.0, + "logits/rejected": 1796631808.0, + "logps/chosen": -332.0307922363281, + "logps/rejected": -409.88067626953125, + "loss": 0.1721, + "rewards/chosen": 1.094642996788025, + "rewards/margins": 7.752733588218689, + "rewards/rejected": -6.658090591430664, + "step": 399 + }, + { + "epoch": 0.14766277513728024, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.56019572216705e-06, + "logits/chosen": 1905797347.5555556, + "logits/rejected": 1789911478.857143, + "logps/chosen": -329.1207682291667, + "logps/rejected": -488.9536830357143, + "loss": 0.172, + "rewards/chosen": 1.2870570288764105, + "rewards/margins": 8.21430039784265, + "rewards/rejected": -6.927243368966239, + "step": 400 + }, + { + "epoch": 0.14803193207512344, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.557778968578728e-06, + "logits/chosen": 1614072627.2, + "logits/rejected": 1632268288.0, + "logps/chosen": -265.12169596354164, + "logps/rejected": -416.1413143382353, + "loss": 0.1676, + "rewards/chosen": 1.638983662923177, + "rewards/margins": 8.142224809235218, + "rewards/rejected": -6.503241146312041, + "step": 401 + }, + { + "epoch": 0.14840108901296664, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 9.555355900245553e-06, + "logits/chosen": 2180031022.5454545, + "logits/rejected": 1780363878.4, + "logps/chosen": -311.76285067471593, + "logps/rejected": -382.5916015625, + "loss": 0.2326, + "rewards/chosen": 1.2490649656815962, + "rewards/margins": 7.760044427351518, + "rewards/rejected": -6.510979461669922, + "step": 402 + }, + { + "epoch": 0.14877024595080984, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.552926520524654e-06, + "logits/chosen": 1896104538.3529413, + "logits/rejected": 1815742327.4666667, + "logps/chosen": -274.66345932904414, + "logps/rejected": -575.7646484375, + "loss": 0.2028, + "rewards/chosen": 0.9009261411779067, + "rewards/margins": 7.843930495019053, + "rewards/rejected": -6.943004353841146, + "step": 403 + }, + { + "epoch": 0.14913940288865304, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.550490832781905e-06, + "logits/chosen": 1683728263.5294118, + "logits/rejected": 2133273122.1333334, + "logps/chosen": -283.2207892922794, + "logps/rejected": -461.8162434895833, + "loss": 0.2061, + "rewards/chosen": 1.3944896249210132, + "rewards/margins": 8.208423951092888, + "rewards/rejected": -6.813934326171875, + "step": 404 + }, + { + "epoch": 0.14950855982649625, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.54804884039192e-06, + "logits/chosen": 1576172150.1538463, + "logits/rejected": 1565305478.7368422, + "logps/chosen": -256.34557166466345, + "logps/rejected": -413.0727025082237, + "loss": 0.1811, + "rewards/chosen": 0.5149996097271259, + "rewards/margins": 6.631817686412981, + "rewards/rejected": -6.116818076685855, + "step": 405 + }, + { + "epoch": 0.14987771676433945, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.545600546738047e-06, + "logits/chosen": 2661444039.111111, + "logits/rejected": 2792508269.714286, + "logps/chosen": -267.06049262152777, + "logps/rejected": -504.3657924107143, + "loss": 0.2014, + "rewards/chosen": 0.9517600801255968, + "rewards/margins": 8.4590728547838, + "rewards/rejected": -7.507312774658203, + "step": 406 + }, + { + "epoch": 0.15024687370218265, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.54314595521237e-06, + "logits/chosen": 1397186816.0, + "logits/rejected": 1663922944.0, + "logps/chosen": -299.0145263671875, + "logps/rejected": -339.6547546386719, + "loss": 0.1692, + "rewards/chosen": 1.0805680751800537, + "rewards/margins": 6.122377157211304, + "rewards/rejected": -5.04180908203125, + "step": 407 + }, + { + "epoch": 0.15061603064002585, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.540685069215693e-06, + "logits/chosen": 1831780036.9230769, + "logits/rejected": 1716071262.3157895, + "logps/chosen": -246.77422626201923, + "logps/rejected": -422.6981136924342, + "loss": 0.187, + "rewards/chosen": 0.9568292177640475, + "rewards/margins": 6.36310772760677, + "rewards/rejected": -5.406278509842722, + "step": 408 + }, + { + "epoch": 0.15098518757786905, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.53821789215754e-06, + "logits/chosen": 1877976192.0, + "logits/rejected": 1774899072.0, + "logps/chosen": -287.7886962890625, + "logps/rejected": -454.25482177734375, + "loss": 0.2496, + "rewards/chosen": 0.13521817326545715, + "rewards/margins": 6.945976287126541, + "rewards/rejected": -6.810758113861084, + "step": 409 + }, + { + "epoch": 0.15135434451571225, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.535744427456156e-06, + "logits/chosen": 1482375395.5555556, + "logits/rejected": 1468752310.857143, + "logps/chosen": -300.3171115451389, + "logps/rejected": -499.8369140625, + "loss": 0.2392, + "rewards/chosen": 0.488131841023763, + "rewards/margins": 7.519225983392625, + "rewards/rejected": -7.031094142368862, + "step": 410 + }, + { + "epoch": 0.15172350145355545, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.533264678538493e-06, + "logits/chosen": 2152330581.3333335, + "logits/rejected": 1885027147.2941177, + "logps/chosen": -209.37931315104166, + "logps/rejected": -471.7879423253676, + "loss": 0.1853, + "rewards/chosen": 1.0260955810546875, + "rewards/margins": 7.536616964901195, + "rewards/rejected": -6.510521383846507, + "step": 411 + }, + { + "epoch": 0.15209265839139866, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 9.530778648840213e-06, + "logits/chosen": 2040008476.4444444, + "logits/rejected": 1421106761.142857, + "logps/chosen": -298.0353732638889, + "logps/rejected": -468.2005092075893, + "loss": 0.1862, + "rewards/chosen": 1.4321862326727972, + "rewards/margins": 8.714710886516269, + "rewards/rejected": -7.282524653843471, + "step": 412 + }, + { + "epoch": 0.15246181532924186, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.528286341805675e-06, + "logits/chosen": 1386916864.0, + "logits/rejected": 1678343168.0, + "logps/chosen": -312.471728515625, + "logps/rejected": -413.3424479166667, + "loss": 0.2002, + "rewards/chosen": 1.1034334182739258, + "rewards/margins": 8.104323514302571, + "rewards/rejected": -7.0008900960286455, + "step": 413 + }, + { + "epoch": 0.15283097226708506, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.525787760887945e-06, + "logits/chosen": 2307841566.117647, + "logits/rejected": 2275797674.6666665, + "logps/chosen": -364.07571231617646, + "logps/rejected": -416.757421875, + "loss": 0.1865, + "rewards/chosen": 1.2652756186092602, + "rewards/margins": 7.000926253374885, + "rewards/rejected": -5.735650634765625, + "step": 414 + }, + { + "epoch": 0.15320012920492823, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.523282909548773e-06, + "logits/chosen": 1289296099.5555556, + "logits/rejected": 1995946276.5714285, + "logps/chosen": -299.05986870659723, + "logps/rejected": -408.19810267857144, + "loss": 0.2144, + "rewards/chosen": 1.0565063688490126, + "rewards/margins": 8.21663755083841, + "rewards/rejected": -7.160131181989398, + "step": 415 + }, + { + "epoch": 0.15356928614277143, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.520771791258593e-06, + "logits/chosen": 1849796198.4, + "logits/rejected": 1834064715.2941177, + "logps/chosen": -340.3735026041667, + "logps/rejected": -407.8080480238971, + "loss": 0.1908, + "rewards/chosen": 0.8948452631632487, + "rewards/margins": 6.197336843901989, + "rewards/rejected": -5.3024915807387405, + "step": 416 + }, + { + "epoch": 0.15393844308061463, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.518254409496536e-06, + "logits/chosen": 2121342012.235294, + "logits/rejected": 1217952563.2, + "logps/chosen": -306.6924689797794, + "logps/rejected": -430.39033203125, + "loss": 0.1601, + "rewards/chosen": 1.6364097595214844, + "rewards/margins": 8.554572296142577, + "rewards/rejected": -6.918162536621094, + "step": 417 + }, + { + "epoch": 0.15430760001845784, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.515730767750397e-06, + "logits/chosen": 1601389616.7619047, + "logits/rejected": 1655772439.2727273, + "logps/chosen": -249.9218982514881, + "logps/rejected": -344.3494762073864, + "loss": 0.2229, + "rewards/chosen": 1.1368172963460286, + "rewards/margins": 7.215015815966057, + "rewards/rejected": -6.078198519620028, + "step": 418 + }, + { + "epoch": 0.15467675695630104, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.513200869516651e-06, + "logits/chosen": 1381346397.090909, + "logits/rejected": 2113726951.6190476, + "logps/chosen": -339.06289950284093, + "logps/rejected": -388.5776134672619, + "loss": 0.1918, + "rewards/chosen": 0.5060956261374734, + "rewards/margins": 6.6043382496028755, + "rewards/rejected": -6.098242623465402, + "step": 419 + }, + { + "epoch": 0.15504591389414424, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.51066471830044e-06, + "logits/chosen": 1852270336.0, + "logits/rejected": 1473811968.0, + "logps/chosen": -266.4080505371094, + "logps/rejected": -403.548095703125, + "loss": 0.193, + "rewards/chosen": 0.7962771058082581, + "rewards/margins": 7.894688665866852, + "rewards/rejected": -7.098411560058594, + "step": 420 + }, + { + "epoch": 0.15541507083198744, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.50812231761557e-06, + "logits/chosen": 1747788406.1538463, + "logits/rejected": 2023733571.368421, + "logps/chosen": -290.64847506009613, + "logps/rejected": -439.35567434210526, + "loss": 0.1465, + "rewards/chosen": 1.2459685985858624, + "rewards/margins": 7.018829075431051, + "rewards/rejected": -5.772860476845189, + "step": 421 + }, + { + "epoch": 0.15578422776983064, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.505573670984502e-06, + "logits/chosen": 2004292230.7368422, + "logits/rejected": 2002882402.4615386, + "logps/chosen": -184.99994860197367, + "logps/rejected": -483.9354717548077, + "loss": 0.1853, + "rewards/chosen": 1.5995942165977077, + "rewards/margins": 7.2835841082368304, + "rewards/rejected": -5.683989891639123, + "step": 422 + }, + { + "epoch": 0.15615338470767384, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.503018781938358e-06, + "logits/chosen": 1236752128.0, + "logits/rejected": 2007274752.0, + "logps/chosen": -271.8293762207031, + "logps/rejected": -445.6889343261719, + "loss": 0.1992, + "rewards/chosen": 0.8957881927490234, + "rewards/margins": 7.318389892578125, + "rewards/rejected": -6.422601699829102, + "step": 423 + }, + { + "epoch": 0.15652254164551704, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 9.5004576540169e-06, + "logits/chosen": 2672929645.714286, + "logits/rejected": 2408996408.888889, + "logps/chosen": -259.61373465401783, + "logps/rejected": -549.6809895833334, + "loss": 0.1808, + "rewards/chosen": 0.7344390324183873, + "rewards/margins": 7.449676120091998, + "rewards/rejected": -6.715237087673611, + "step": 424 + }, + { + "epoch": 0.15689169858336025, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.49789029076854e-06, + "logits/chosen": 1468558563.5555556, + "logits/rejected": 1477639753.142857, + "logps/chosen": -224.86748589409723, + "logps/rejected": -318.23440987723217, + "loss": 0.1835, + "rewards/chosen": 1.3313792546590169, + "rewards/margins": 7.426517804463704, + "rewards/rejected": -6.0951385498046875, + "step": 425 + }, + { + "epoch": 0.15726085552120345, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.49531669575033e-06, + "logits/chosen": 2148819080.5333333, + "logits/rejected": 2368397312.0, + "logps/chosen": -297.6759440104167, + "logps/rejected": -509.75212545955884, + "loss": 0.1917, + "rewards/chosen": 0.8936670303344727, + "rewards/margins": 8.30569517472211, + "rewards/rejected": -7.412028144387638, + "step": 426 + }, + { + "epoch": 0.15763001245904665, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.492736872527948e-06, + "logits/chosen": 1491219200.0, + "logits/rejected": 1513951488.0, + "logps/chosen": -287.7457580566406, + "logps/rejected": -357.4420471191406, + "loss": 0.2049, + "rewards/chosen": 1.028751015663147, + "rewards/margins": 7.207629323005676, + "rewards/rejected": -6.178878307342529, + "step": 427 + }, + { + "epoch": 0.15799916939688985, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.49015082467571e-06, + "logits/chosen": 1451139192.4705882, + "logits/rejected": 1425122781.8666666, + "logps/chosen": -360.09426700367646, + "logps/rejected": -481.3603515625, + "loss": 0.1819, + "rewards/chosen": 1.2073700848747702, + "rewards/margins": 8.74751901065602, + "rewards/rejected": -7.54014892578125, + "step": 428 + }, + { + "epoch": 0.15836832633473305, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.48755855577655e-06, + "logits/chosen": 1648751130.9473684, + "logits/rejected": 1315853705.8461537, + "logps/chosen": -325.03238075657896, + "logps/rejected": -571.3363131009615, + "loss": 0.2126, + "rewards/chosen": 1.0955026526200144, + "rewards/margins": 8.588325739872118, + "rewards/rejected": -7.492823087252104, + "step": 429 + }, + { + "epoch": 0.15873748327257625, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.484960069422026e-06, + "logits/chosen": 1799661568.0, + "logits/rejected": 2723344156.4444447, + "logps/chosen": -341.4420689174107, + "logps/rejected": -609.2888454861111, + "loss": 0.1067, + "rewards/chosen": 1.6041766575404577, + "rewards/margins": 10.343394627646795, + "rewards/rejected": -8.739217970106337, + "step": 430 + }, + { + "epoch": 0.15910664021041945, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.482355369212307e-06, + "logits/chosen": 2755885641.142857, + "logits/rejected": 1985955157.3333333, + "logps/chosen": -259.13295200892856, + "logps/rejected": -564.4095052083334, + "loss": 0.1446, + "rewards/chosen": 1.4627829960414342, + "rewards/margins": 9.625913014487615, + "rewards/rejected": -8.16313001844618, + "step": 431 + }, + { + "epoch": 0.15947579714826265, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 9.47974445875617e-06, + "logits/chosen": 1956324040.347826, + "logits/rejected": 1521241201.7777777, + "logps/chosen": -342.52373471467394, + "logps/rejected": -434.6310763888889, + "loss": 0.2382, + "rewards/chosen": 1.4340296206266985, + "rewards/margins": 8.555200512282514, + "rewards/rejected": -7.121170891655816, + "step": 432 + }, + { + "epoch": 0.15984495408610586, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 9.477127341671e-06, + "logits/chosen": 1398344557.7142856, + "logits/rejected": 1425347697.7777777, + "logps/chosen": -306.09486607142856, + "logps/rejected": -429.64643012152777, + "loss": 0.1417, + "rewards/chosen": 1.1269269670758928, + "rewards/margins": 8.88234383719308, + "rewards/rejected": -7.7554168701171875, + "step": 433 + }, + { + "epoch": 0.16021411102394906, + "grad_norm": 13.8125, + "kl": 0.9919271469116211, + "learning_rate": 9.47450402158278e-06, + "logits/chosen": 1627436373.3333333, + "logits/rejected": 1671521426.2857144, + "logps/chosen": -268.61469184027777, + "logps/rejected": -419.15488978794644, + "loss": 0.2059, + "rewards/chosen": 1.1425614886813693, + "rewards/margins": 9.435201629759774, + "rewards/rejected": -8.292640141078405, + "step": 434 + }, + { + "epoch": 0.16058326796179226, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 9.471874502126087e-06, + "logits/chosen": 1883517366.857143, + "logits/rejected": 2270909326.2222223, + "logps/chosen": -298.3540736607143, + "logps/rejected": -640.4641927083334, + "loss": 0.1484, + "rewards/chosen": 1.019315242767334, + "rewards/margins": 11.426795270707872, + "rewards/rejected": -10.407480027940538, + "step": 435 + }, + { + "epoch": 0.16095242489963546, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.469238786944086e-06, + "logits/chosen": 1889522073.6, + "logits/rejected": 1955934208.0, + "logps/chosen": -265.3699951171875, + "logps/rejected": -436.8512369791667, + "loss": 0.2776, + "rewards/chosen": 0.5707391738891602, + "rewards/margins": 7.696090126037598, + "rewards/rejected": -7.1253509521484375, + "step": 436 + }, + { + "epoch": 0.16132158183747866, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.466596879688525e-06, + "logits/chosen": 1524982306.1333334, + "logits/rejected": 2101468220.235294, + "logps/chosen": -243.32716471354166, + "logps/rejected": -347.1457088694853, + "loss": 0.2071, + "rewards/chosen": 0.7067002614339193, + "rewards/margins": 6.42231637543323, + "rewards/rejected": -5.71561611399931, + "step": 437 + }, + { + "epoch": 0.16169073877532186, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 9.463948784019736e-06, + "logits/chosen": 2382068736.0, + "logits/rejected": 1903634432.0, + "logps/chosen": -302.1851318359375, + "logps/rejected": -429.0208629261364, + "loss": 0.1255, + "rewards/chosen": 1.1733879089355468, + "rewards/margins": 7.524258908358487, + "rewards/rejected": -6.35087099942294, + "step": 438 + }, + { + "epoch": 0.16205989571316506, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 9.461294503606621e-06, + "logits/chosen": 1423123456.0, + "logits/rejected": 1668021361.7777777, + "logps/chosen": -274.70179966517856, + "logps/rejected": -358.2701009114583, + "loss": 0.189, + "rewards/chosen": 0.8811074665614537, + "rewards/margins": 6.934006721254379, + "rewards/rejected": -6.0528992546929254, + "step": 439 + }, + { + "epoch": 0.16242905265100827, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.458634042126651e-06, + "logits/chosen": 1840356773.6470587, + "logits/rejected": 1948377088.0, + "logps/chosen": -283.4951746323529, + "logps/rejected": -492.25970052083335, + "loss": 0.1799, + "rewards/chosen": 0.9979782104492188, + "rewards/margins": 7.6272425333658855, + "rewards/rejected": -6.629264322916667, + "step": 440 + }, + { + "epoch": 0.16279820958885147, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.455967403265861e-06, + "logits/chosen": 1764379776.0, + "logits/rejected": 2547751424.0, + "logps/chosen": -279.8078308105469, + "logps/rejected": -405.1233825683594, + "loss": 0.1761, + "rewards/chosen": 1.0658788681030273, + "rewards/margins": 7.703524589538574, + "rewards/rejected": -6.637645721435547, + "step": 441 + }, + { + "epoch": 0.16316736652669467, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 9.453294590718846e-06, + "logits/chosen": 2153193267.2, + "logits/rejected": 2300990464.0, + "logps/chosen": -302.87109375, + "logps/rejected": -680.265625, + "loss": 0.262, + "rewards/chosen": 0.6101669788360595, + "rewards/margins": 9.794128211339315, + "rewards/rejected": -9.183961232503256, + "step": 442 + }, + { + "epoch": 0.16353652346453787, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 9.450615608188755e-06, + "logits/chosen": 2295278170.352941, + "logits/rejected": 1724589397.3333333, + "logps/chosen": -293.3492647058824, + "logps/rejected": -446.7376953125, + "loss": 0.2028, + "rewards/chosen": 1.1460014792049633, + "rewards/margins": 6.843126723345588, + "rewards/rejected": -5.697125244140625, + "step": 443 + }, + { + "epoch": 0.16390568040238107, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.447930459387284e-06, + "logits/chosen": 1544387820.3076923, + "logits/rejected": 1523285800.4210527, + "logps/chosen": -176.35471754807693, + "logps/rejected": -418.89170435855266, + "loss": 0.1422, + "rewards/chosen": 1.3574663308950572, + "rewards/margins": 7.512983909020058, + "rewards/rejected": -6.155517578125, + "step": 444 + }, + { + "epoch": 0.16427483734022427, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.445239148034673e-06, + "logits/chosen": 2157634267.428571, + "logits/rejected": 1811817472.0, + "logps/chosen": -260.056396484375, + "logps/rejected": -435.5497233072917, + "loss": 0.1568, + "rewards/chosen": 1.190401213509696, + "rewards/margins": 7.259217080615816, + "rewards/rejected": -6.06881586710612, + "step": 445 + }, + { + "epoch": 0.16464399427806747, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 9.442541677859695e-06, + "logits/chosen": 1888418032.9411764, + "logits/rejected": 1410723976.5333333, + "logps/chosen": -279.02326516544116, + "logps/rejected": -406.93017578125, + "loss": 0.1867, + "rewards/chosen": 1.0029088188620174, + "rewards/margins": 7.282194504083371, + "rewards/rejected": -6.279285685221354, + "step": 446 + }, + { + "epoch": 0.16501315121591068, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.439838052599668e-06, + "logits/chosen": 2164433211.076923, + "logits/rejected": 1802583093.8947368, + "logps/chosen": -314.37691556490387, + "logps/rejected": -576.5401418585526, + "loss": 0.1103, + "rewards/chosen": 1.3447289100060096, + "rewards/margins": 10.43544160788841, + "rewards/rejected": -9.0907126978824, + "step": 447 + }, + { + "epoch": 0.16538230815375388, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 9.437128276000424e-06, + "logits/chosen": 1466425659.0769231, + "logits/rejected": 1750580601.2631578, + "logps/chosen": -290.70688100961536, + "logps/rejected": -557.6423725328947, + "loss": 0.1811, + "rewards/chosen": 0.3252264903141902, + "rewards/margins": 7.791055636849963, + "rewards/rejected": -7.465829146535773, + "step": 448 + }, + { + "epoch": 0.16575146509159708, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.434412351816329e-06, + "logits/chosen": 1857439428.9230769, + "logits/rejected": 2492173904.8421054, + "logps/chosen": -220.86519681490384, + "logps/rejected": -358.22049753289474, + "loss": 0.1425, + "rewards/chosen": 1.5148622072660005, + "rewards/margins": 8.31200222833919, + "rewards/rejected": -6.797140021073191, + "step": 449 + }, + { + "epoch": 0.16612062202944028, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.431690283810257e-06, + "logits/chosen": 1876025457.7777777, + "logits/rejected": 1827281042.2857144, + "logps/chosen": -311.16832139756946, + "logps/rejected": -584.4556361607143, + "loss": 0.188, + "rewards/chosen": 1.1974159876505535, + "rewards/margins": 7.976609865824382, + "rewards/rejected": -6.779193878173828, + "step": 450 + }, + { + "epoch": 0.16648977896728345, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.428962075753602e-06, + "logits/chosen": 1794954854.4, + "logits/rejected": 1651961976.4705882, + "logps/chosen": -219.48981119791668, + "logps/rejected": -511.2558019301471, + "loss": 0.1202, + "rewards/chosen": 1.911020533243815, + "rewards/margins": 8.24248182259354, + "rewards/rejected": -6.331461289349725, + "step": 451 + }, + { + "epoch": 0.16685893590512665, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 9.42622773142626e-06, + "logits/chosen": 2014075172.5714285, + "logits/rejected": 2313355036.4444447, + "logps/chosen": -296.16514369419644, + "logps/rejected": -478.00151909722223, + "loss": 0.1854, + "rewards/chosen": 0.7648314748491559, + "rewards/margins": 8.013518371279277, + "rewards/rejected": -7.248686896430121, + "step": 452 + }, + { + "epoch": 0.16722809284296986, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.423487254616632e-06, + "logits/chosen": 1507809664.0, + "logits/rejected": 1533274624.0, + "logps/chosen": -340.4511413574219, + "logps/rejected": -493.132568359375, + "loss": 0.1657, + "rewards/chosen": 1.3845552206039429, + "rewards/margins": 8.302598357200623, + "rewards/rejected": -6.91804313659668, + "step": 453 + }, + { + "epoch": 0.16759724978081306, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.420740649121611e-06, + "logits/chosen": 1880290417.7777777, + "logits/rejected": 1329632402.2857144, + "logps/chosen": -237.93340386284723, + "logps/rejected": -404.05698939732144, + "loss": 0.2316, + "rewards/chosen": 0.5798261430528429, + "rewards/margins": 7.000474445403569, + "rewards/rejected": -6.420648302350726, + "step": 454 + }, + { + "epoch": 0.16796640671865626, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.417987918746587e-06, + "logits/chosen": 1998635885.7142856, + "logits/rejected": 2313715712.0, + "logps/chosen": -388.2079380580357, + "logps/rejected": -539.5053168402778, + "loss": 0.1664, + "rewards/chosen": 0.744182995387486, + "rewards/margins": 9.420208386012487, + "rewards/rejected": -8.676025390625, + "step": 455 + }, + { + "epoch": 0.16833556365649946, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 9.41522906730543e-06, + "logits/chosen": 1481927017.4117646, + "logits/rejected": 1647460215.4666667, + "logps/chosen": -258.1116153492647, + "logps/rejected": -398.8003255208333, + "loss": 0.1879, + "rewards/chosen": 1.0994893242331112, + "rewards/margins": 7.681139104506549, + "rewards/rejected": -6.5816497802734375, + "step": 456 + }, + { + "epoch": 0.16870472059434266, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.412464098620495e-06, + "logits/chosen": 1388586780.4444444, + "logits/rejected": 1447887433.142857, + "logps/chosen": -380.5871310763889, + "logps/rejected": -404.9984654017857, + "loss": 0.219, + "rewards/chosen": 0.9128994411892362, + "rewards/margins": 6.943295493958489, + "rewards/rejected": -6.030396052769253, + "step": 457 + }, + { + "epoch": 0.16907387753218586, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 9.409693016522613e-06, + "logits/chosen": 1475822933.3333333, + "logits/rejected": 1880910994.2857144, + "logps/chosen": -357.24956597222223, + "logps/rejected": -345.016845703125, + "loss": 0.21, + "rewards/chosen": 1.5323240492078993, + "rewards/margins": 6.078353003850059, + "rewards/rejected": -4.54602895464216, + "step": 458 + }, + { + "epoch": 0.16944303447002906, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 9.40691582485108e-06, + "logits/chosen": 1580634697.142857, + "logits/rejected": 1367673628.4444444, + "logps/chosen": -349.52113560267856, + "logps/rejected": -476.46533203125, + "loss": 0.1783, + "rewards/chosen": 0.6020426068987165, + "rewards/margins": 7.91927028837658, + "rewards/rejected": -7.317227681477864, + "step": 459 + }, + { + "epoch": 0.16981219140787226, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.404132527453662e-06, + "logits/chosen": 2117280153.6, + "logits/rejected": 1796125515.2941177, + "logps/chosen": -302.4202473958333, + "logps/rejected": -554.7742417279412, + "loss": 0.1619, + "rewards/chosen": 0.9770669937133789, + "rewards/margins": 9.58953863031724, + "rewards/rejected": -8.61247163660386, + "step": 460 + }, + { + "epoch": 0.17018134834571547, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.40134312818658e-06, + "logits/chosen": 1579234560.0, + "logits/rejected": 2085654528.0, + "logps/chosen": -275.31854248046875, + "logps/rejected": -544.705810546875, + "loss": 0.1647, + "rewards/chosen": 1.4008495807647705, + "rewards/margins": 9.715579271316528, + "rewards/rejected": -8.314729690551758, + "step": 461 + }, + { + "epoch": 0.17055050528355867, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 9.398547630914512e-06, + "logits/chosen": 1836288097.5238094, + "logits/rejected": 1681981626.1818182, + "logps/chosen": -323.7904110863095, + "logps/rejected": -471.47860440340907, + "loss": 0.2442, + "rewards/chosen": 0.9647959754580543, + "rewards/margins": 7.631681351434617, + "rewards/rejected": -6.6668853759765625, + "step": 462 + }, + { + "epoch": 0.17091966222140187, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.395746039510585e-06, + "logits/chosen": 1478776393.142857, + "logits/rejected": 1432818119.1111112, + "logps/chosen": -231.71540178571428, + "logps/rejected": -430.23505316840277, + "loss": 0.1202, + "rewards/chosen": 1.7084026336669922, + "rewards/margins": 9.498465432061089, + "rewards/rejected": -7.790062798394097, + "step": 463 + }, + { + "epoch": 0.17128881915924507, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.392938357856367e-06, + "logits/chosen": 1837313877.3333333, + "logits/rejected": 1759940198.4, + "logps/chosen": -306.7340901692708, + "logps/rejected": -326.896484375, + "loss": 0.1661, + "rewards/chosen": 0.9637904167175293, + "rewards/margins": 6.768612957000732, + "rewards/rejected": -5.804822540283203, + "step": 464 + }, + { + "epoch": 0.17165797609708827, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.390124589841866e-06, + "logits/chosen": 1832702080.0, + "logits/rejected": 2140270592.0, + "logps/chosen": -260.9884338378906, + "logps/rejected": -644.94921875, + "loss": 0.1466, + "rewards/chosen": 1.4802207946777344, + "rewards/margins": 10.892483711242676, + "rewards/rejected": -9.412262916564941, + "step": 465 + }, + { + "epoch": 0.17202713303493147, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.387304739365524e-06, + "logits/chosen": 1832524980.7058823, + "logits/rejected": 1617883955.2, + "logps/chosen": -251.7400333180147, + "logps/rejected": -468.75738932291665, + "loss": 0.232, + "rewards/chosen": 0.5938537261065315, + "rewards/margins": 8.656721462922938, + "rewards/rejected": -8.062867736816406, + "step": 466 + }, + { + "epoch": 0.17239628997277467, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 9.384478810334202e-06, + "logits/chosen": 2719824715.2941175, + "logits/rejected": 2452078592.0, + "logps/chosen": -246.74488740808823, + "logps/rejected": -493.70875651041666, + "loss": 0.2345, + "rewards/chosen": 0.3798382702995749, + "rewards/margins": 8.062478345983168, + "rewards/rejected": -7.682640075683594, + "step": 467 + }, + { + "epoch": 0.17276544691061788, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9.381646806663195e-06, + "logits/chosen": 1188046301.8666666, + "logits/rejected": 1470587602.8235295, + "logps/chosen": -234.28460286458332, + "logps/rejected": -427.8580537683824, + "loss": 0.1875, + "rewards/chosen": 0.8390771230061849, + "rewards/margins": 7.5509620142918, + "rewards/rejected": -6.7118848912856155, + "step": 468 + }, + { + "epoch": 0.17313460384846108, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.378808732276206e-06, + "logits/chosen": 1693333276.4444444, + "logits/rejected": 1718176182.857143, + "logps/chosen": -297.3453776041667, + "logps/rejected": -375.6416713169643, + "loss": 0.219, + "rewards/chosen": 1.1055640114678278, + "rewards/margins": 6.6584836717635865, + "rewards/rejected": -5.552919660295759, + "step": 469 + }, + { + "epoch": 0.17350376078630428, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 9.37596459110535e-06, + "logits/chosen": 1928345600.0, + "logits/rejected": 2211051861.3333335, + "logps/chosen": -301.3369140625, + "logps/rejected": -394.3219807942708, + "loss": 0.1633, + "rewards/chosen": 1.9237049102783204, + "rewards/margins": 8.652132797241212, + "rewards/rejected": -6.728427886962891, + "step": 470 + }, + { + "epoch": 0.17387291772414748, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.373114387091148e-06, + "logits/chosen": 1599539053.7142856, + "logits/rejected": 1522536903.1111112, + "logps/chosen": -228.62154715401786, + "logps/rejected": -406.00599500868054, + "loss": 0.1381, + "rewards/chosen": 1.2966367176600866, + "rewards/margins": 7.772359136551145, + "rewards/rejected": -6.475722418891059, + "step": 471 + }, + { + "epoch": 0.17424207466199068, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.370258124182525e-06, + "logits/chosen": 1871466086.4, + "logits/rejected": 2209802240.0, + "logps/chosen": -280.2326171875, + "logps/rejected": -600.40966796875, + "loss": 0.2434, + "rewards/chosen": 0.7489444255828858, + "rewards/margins": 9.939755582809449, + "rewards/rejected": -9.190811157226562, + "step": 472 + }, + { + "epoch": 0.17461123159983388, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.367395806336793e-06, + "logits/chosen": 2080404626.2857144, + "logits/rejected": 1564791714.909091, + "logps/chosen": -270.8211960565476, + "logps/rejected": -423.20321377840907, + "loss": 0.2198, + "rewards/chosen": 1.1135547274634952, + "rewards/margins": 7.375220071701777, + "rewards/rejected": -6.261665344238281, + "step": 473 + }, + { + "epoch": 0.17498038853767708, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.364527437519658e-06, + "logits/chosen": 1680217216.0, + "logits/rejected": 2128451456.0, + "logps/chosen": -271.66949462890625, + "logps/rejected": -426.18597412109375, + "loss": 0.2014, + "rewards/chosen": 1.0132856369018555, + "rewards/margins": 7.316222667694092, + "rewards/rejected": -6.302937030792236, + "step": 474 + }, + { + "epoch": 0.17534954547552029, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 9.361653021705211e-06, + "logits/chosen": 2177999592.7272725, + "logits/rejected": 2288232652.8, + "logps/chosen": -320.27978515625, + "logps/rejected": -808.631201171875, + "loss": 0.2696, + "rewards/chosen": 0.6454575712030585, + "rewards/margins": 11.774324148351496, + "rewards/rejected": -11.128866577148438, + "step": 475 + }, + { + "epoch": 0.1757187024133635, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 9.358772562875914e-06, + "logits/chosen": 1779464647.1111112, + "logits/rejected": 2097166189.7142856, + "logps/chosen": -276.9006076388889, + "logps/rejected": -389.137939453125, + "loss": 0.1909, + "rewards/chosen": 1.0232006708780925, + "rewards/margins": 7.866908709208171, + "rewards/rejected": -6.843708038330078, + "step": 476 + }, + { + "epoch": 0.1760878593512067, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 9.355886065022611e-06, + "logits/chosen": 1595497708.3076923, + "logits/rejected": 2594478187.7894735, + "logps/chosen": -229.4954552283654, + "logps/rejected": -412.3012952302632, + "loss": 0.1806, + "rewards/chosen": 0.6330189338097205, + "rewards/margins": 7.243601787428142, + "rewards/rejected": -6.610582853618421, + "step": 477 + }, + { + "epoch": 0.1764570162890499, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.352993532144505e-06, + "logits/chosen": 1333980160.0, + "logits/rejected": 1994839210.6666667, + "logps/chosen": -181.42242431640625, + "logps/rejected": -518.6885579427084, + "loss": 0.0867, + "rewards/chosen": 1.152406096458435, + "rewards/margins": 9.0311998128891, + "rewards/rejected": -7.878793716430664, + "step": 478 + }, + { + "epoch": 0.1768261732268931, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.350094968249163e-06, + "logits/chosen": 2194639320.6153846, + "logits/rejected": 2490766174.3157897, + "logps/chosen": -266.01705228365387, + "logps/rejected": -380.25917454769734, + "loss": 0.1785, + "rewards/chosen": 0.7653886354886569, + "rewards/margins": 6.724045749617974, + "rewards/rejected": -5.958657114129317, + "step": 479 + }, + { + "epoch": 0.1771953301647363, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 9.347190377352512e-06, + "logits/chosen": 1560867354.9473684, + "logits/rejected": 1299225048.6153846, + "logps/chosen": -388.36366673519734, + "logps/rejected": -436.19869290865387, + "loss": 0.223, + "rewards/chosen": 0.8749334435713919, + "rewards/margins": 6.251118934106247, + "rewards/rejected": -5.376185490534856, + "step": 480 + }, + { + "epoch": 0.1775644871025795, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.344279763478823e-06, + "logits/chosen": 2076912981.3333333, + "logits/rejected": 1763164641.8823528, + "logps/chosen": -259.7587890625, + "logps/rejected": -507.12258731617646, + "loss": 0.1529, + "rewards/chosen": 1.4007269541422527, + "rewards/margins": 8.68908799863329, + "rewards/rejected": -7.2883610444910385, + "step": 481 + }, + { + "epoch": 0.1779336440404227, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.341363130660714e-06, + "logits/chosen": 2137540608.0, + "logits/rejected": 2237207347.2, + "logps/chosen": -259.72446695963544, + "logps/rejected": -461.426708984375, + "loss": 0.1589, + "rewards/chosen": 0.6215546925862631, + "rewards/margins": 7.199357732137044, + "rewards/rejected": -6.577803039550782, + "step": 482 + }, + { + "epoch": 0.1783028009782659, + "grad_norm": 15.4375, + "kl": 0.10217571258544922, + "learning_rate": 9.338440482939146e-06, + "logits/chosen": 1521586068.2105262, + "logits/rejected": 1922938092.3076923, + "logps/chosen": -300.8343955592105, + "logps/rejected": -322.84326171875, + "loss": 0.2111, + "rewards/chosen": 0.9895760385613692, + "rewards/margins": 6.97859499232489, + "rewards/rejected": -5.989018953763521, + "step": 483 + }, + { + "epoch": 0.1786719579161091, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.335511824363407e-06, + "logits/chosen": 1551830084.2666667, + "logits/rejected": 2047619312.9411764, + "logps/chosen": -225.37298177083332, + "logps/rejected": -491.97409237132354, + "loss": 0.1481, + "rewards/chosen": 1.2445401509602865, + "rewards/margins": 7.407126138724533, + "rewards/rejected": -6.162585987764246, + "step": 484 + }, + { + "epoch": 0.1790411148539523, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 9.332577158991118e-06, + "logits/chosen": 1995962368.0, + "logits/rejected": 2586446409.142857, + "logps/chosen": -234.26925998263889, + "logps/rejected": -497.11959402901783, + "loss": 0.2209, + "rewards/chosen": 0.7873786290486654, + "rewards/margins": 7.336417652311779, + "rewards/rejected": -6.549039023263114, + "step": 485 + }, + { + "epoch": 0.1794102717917955, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.32963649088822e-06, + "logits/chosen": 1795679027.2, + "logits/rejected": 1622218752.0, + "logps/chosen": -297.19228515625, + "logps/rejected": -484.64171645220586, + "loss": 0.16, + "rewards/chosen": 1.0754651387532552, + "rewards/margins": 8.482411657595167, + "rewards/rejected": -7.406946518841912, + "step": 486 + }, + { + "epoch": 0.17977942872963867, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 9.326689824128971e-06, + "logits/chosen": 1621383168.0, + "logits/rejected": 1497072103.6190476, + "logps/chosen": -339.66914506392044, + "logps/rejected": -566.118908110119, + "loss": 0.1232, + "rewards/chosen": 1.1408760764382102, + "rewards/margins": 10.624549172141336, + "rewards/rejected": -9.483673095703125, + "step": 487 + }, + { + "epoch": 0.18014858566748188, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.323737162795941e-06, + "logits/chosen": 2212555190.857143, + "logits/rejected": 2225090787.5555553, + "logps/chosen": -234.53543526785714, + "logps/rejected": -566.3039279513889, + "loss": 0.1672, + "rewards/chosen": 0.9922229221888951, + "rewards/margins": 7.503315214126829, + "rewards/rejected": -6.511092291937934, + "step": 488 + }, + { + "epoch": 0.18051774260532508, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.320778510980004e-06, + "logits/chosen": 2285489265.7777777, + "logits/rejected": 1902621562.4347825, + "logps/chosen": -336.9425998263889, + "logps/rejected": -515.7549252717391, + "loss": 0.1061, + "rewards/chosen": 0.8516276147630479, + "rewards/margins": 8.702518728044298, + "rewards/rejected": -7.85089111328125, + "step": 489 + }, + { + "epoch": 0.18088689954316828, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.317813872780336e-06, + "logits/chosen": 1913687478.857143, + "logits/rejected": 1657847239.1111112, + "logps/chosen": -269.0014125279018, + "logps/rejected": -447.3102756076389, + "loss": 0.1313, + "rewards/chosen": 1.319936888558524, + "rewards/margins": 8.26653494153704, + "rewards/rejected": -6.946598052978516, + "step": 490 + }, + { + "epoch": 0.18125605648101148, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 9.314843252304405e-06, + "logits/chosen": 1760165614.9333334, + "logits/rejected": 1962293127.5294118, + "logps/chosen": -291.0530598958333, + "logps/rejected": -376.41931870404414, + "loss": 0.1784, + "rewards/chosen": 1.0503790537516275, + "rewards/margins": 6.396765233956131, + "rewards/rejected": -5.346386180204504, + "step": 491 + }, + { + "epoch": 0.18162521341885468, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.311866653667967e-06, + "logits/chosen": 1244888795.4285715, + "logits/rejected": 1571566478.2222223, + "logps/chosen": -184.14887346540178, + "logps/rejected": -386.0273708767361, + "loss": 0.1341, + "rewards/chosen": 1.8228936876569475, + "rewards/margins": 7.820430240933858, + "rewards/rejected": -5.99753655327691, + "step": 492 + }, + { + "epoch": 0.18199437035669788, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.30888408099506e-06, + "logits/chosen": 1501434733.7142856, + "logits/rejected": 2069093262.2222223, + "logps/chosen": -246.63685825892858, + "logps/rejected": -476.80946180555554, + "loss": 0.1564, + "rewards/chosen": 0.8960362161908831, + "rewards/margins": 8.771150657108851, + "rewards/rejected": -7.875114440917969, + "step": 493 + }, + { + "epoch": 0.18236352729454108, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.305895538418004e-06, + "logits/chosen": 2132647073.6842105, + "logits/rejected": 2157983271.3846154, + "logps/chosen": -325.94017269736844, + "logps/rejected": -592.6210186298077, + "loss": 0.1793, + "rewards/chosen": 1.2058826245759662, + "rewards/margins": 8.917423433620437, + "rewards/rejected": -7.711540809044471, + "step": 494 + }, + { + "epoch": 0.18273268423238428, + "grad_norm": 10.625, + "kl": 0.7403898239135742, + "learning_rate": 9.302901030077384e-06, + "logits/chosen": 1953897773.1764705, + "logits/rejected": 1632884189.8666666, + "logps/chosen": -236.97508329503677, + "logps/rejected": -434.2770182291667, + "loss": 0.1789, + "rewards/chosen": 1.3950559952679802, + "rewards/margins": 9.621203964831782, + "rewards/rejected": -8.226147969563803, + "step": 495 + }, + { + "epoch": 0.1831018411702275, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 9.299900560122057e-06, + "logits/chosen": 1951862393.9047618, + "logits/rejected": 2139443200.0, + "logps/chosen": -287.9765159970238, + "logps/rejected": -598.4104225852273, + "loss": 0.2039, + "rewards/chosen": 1.463614781697591, + "rewards/margins": 8.55523548704205, + "rewards/rejected": -7.09162070534446, + "step": 496 + }, + { + "epoch": 0.1834709981080707, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.296894132709134e-06, + "logits/chosen": 2160421225.4117646, + "logits/rejected": 2569863714.133333, + "logps/chosen": -259.53369140625, + "logps/rejected": -390.4531575520833, + "loss": 0.1536, + "rewards/chosen": 1.621340358958525, + "rewards/margins": 6.94680524339863, + "rewards/rejected": -5.3254648844401045, + "step": 497 + }, + { + "epoch": 0.1838401550459139, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.29388175200398e-06, + "logits/chosen": 1766330514.2857144, + "logits/rejected": 1565979534.2222223, + "logps/chosen": -284.0482177734375, + "logps/rejected": -489.21685112847223, + "loss": 0.1402, + "rewards/chosen": 1.3489065170288086, + "rewards/margins": 8.749697261386448, + "rewards/rejected": -7.400790744357639, + "step": 498 + }, + { + "epoch": 0.1842093119837571, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 9.290863422180211e-06, + "logits/chosen": 1795120261.5652175, + "logits/rejected": 1485407118.2222223, + "logps/chosen": -314.0444972826087, + "logps/rejected": -440.51402452256946, + "loss": 0.2479, + "rewards/chosen": 0.9329056947127633, + "rewards/margins": 7.176038815779387, + "rewards/rejected": -6.243133121066624, + "step": 499 + }, + { + "epoch": 0.1845784689216003, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.287839147419685e-06, + "logits/chosen": 1734314880.0, + "logits/rejected": 1805313280.0, + "logps/chosen": -232.840576171875, + "logps/rejected": -454.1831970214844, + "loss": 0.1561, + "rewards/chosen": 1.5997205972671509, + "rewards/margins": 9.560705304145813, + "rewards/rejected": -7.960984706878662, + "step": 500 + }, + { + "epoch": 0.1849476258594435, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.284808931912501e-06, + "logits/chosen": 2343862272.0, + "logits/rejected": 1620667520.0, + "logps/chosen": -327.5703125, + "logps/rejected": -442.1397705078125, + "loss": 0.1956, + "rewards/chosen": 0.9307717680931091, + "rewards/margins": 8.823698937892914, + "rewards/rejected": -7.892927169799805, + "step": 501 + }, + { + "epoch": 0.1853167827972867, + "grad_norm": 16.0, + "kl": 0.5581798553466797, + "learning_rate": 9.281772779856977e-06, + "logits/chosen": 2163830198.857143, + "logits/rejected": 2000974196.3636363, + "logps/chosen": -332.61358351934524, + "logps/rejected": -476.36647727272725, + "loss": 0.2553, + "rewards/chosen": 0.7307724271501813, + "rewards/margins": 8.296268698457, + "rewards/rejected": -7.565496271306818, + "step": 502 + }, + { + "epoch": 0.1856859397351299, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 9.278730695459664e-06, + "logits/chosen": 2037516180.2105262, + "logits/rejected": 2681786998.1538463, + "logps/chosen": -268.5090974506579, + "logps/rejected": -392.14734825721155, + "loss": 0.1554, + "rewards/chosen": 1.622494245830335, + "rewards/margins": 7.87285928301483, + "rewards/rejected": -6.250365037184495, + "step": 503 + }, + { + "epoch": 0.1860550966729731, + "grad_norm": 18.5, + "kl": 0.9719223976135254, + "learning_rate": 9.275682682935336e-06, + "logits/chosen": 1748425386.6666667, + "logits/rejected": 1918790475.2941177, + "logps/chosen": -375.046484375, + "logps/rejected": -453.5421357996324, + "loss": 0.193, + "rewards/chosen": 1.345151138305664, + "rewards/margins": 6.440502862369313, + "rewards/rejected": -5.095351724063649, + "step": 504 + }, + { + "epoch": 0.1864242536108163, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.27262874650697e-06, + "logits/chosen": 1772750506.6666667, + "logits/rejected": 1787728486.4, + "logps/chosen": -242.2025349934896, + "logps/rejected": -495.9966796875, + "loss": 0.1052, + "rewards/chosen": 1.7768511772155762, + "rewards/margins": 8.226907634735108, + "rewards/rejected": -6.450056457519532, + "step": 505 + }, + { + "epoch": 0.1867934105486595, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 9.269568890405762e-06, + "logits/chosen": 1557479962.9473684, + "logits/rejected": 1926980214.1538463, + "logps/chosen": -275.08760793585526, + "logps/rejected": -402.48922025240387, + "loss": 0.176, + "rewards/chosen": 1.565025530363384, + "rewards/margins": 7.244478279762422, + "rewards/rejected": -5.679452749399038, + "step": 506 + }, + { + "epoch": 0.1871625674865027, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.2665031188711e-06, + "logits/chosen": 1559975296.0, + "logits/rejected": 2149469184.0, + "logps/chosen": -211.3551025390625, + "logps/rejected": -463.6027526855469, + "loss": 0.1895, + "rewards/chosen": 0.9574509263038635, + "rewards/margins": 8.133961260318756, + "rewards/rejected": -7.176510334014893, + "step": 507 + }, + { + "epoch": 0.1875317244243459, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 9.263431436150571e-06, + "logits/chosen": 1693622710.857143, + "logits/rejected": 1439700423.1111112, + "logps/chosen": -296.3124302455357, + "logps/rejected": -428.69371202256946, + "loss": 0.155, + "rewards/chosen": 1.498307773045131, + "rewards/margins": 7.987213361830939, + "rewards/rejected": -6.488905588785808, + "step": 508 + }, + { + "epoch": 0.1879008813621891, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.260353846499954e-06, + "logits/chosen": 1579855631.0588236, + "logits/rejected": 1536109772.8, + "logps/chosen": -287.30221737132354, + "logps/rejected": -432.1041666666667, + "loss": 0.1974, + "rewards/chosen": 0.8192945368149701, + "rewards/margins": 8.274596966014188, + "rewards/rejected": -7.455302429199219, + "step": 509 + }, + { + "epoch": 0.1882700383000323, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.257270354183212e-06, + "logits/chosen": 1602002189.4736843, + "logits/rejected": 1682813085.5384614, + "logps/chosen": -360.8655941611842, + "logps/rejected": -430.3850661057692, + "loss": 0.1526, + "rewards/chosen": 2.0253319991262337, + "rewards/margins": 10.72539937254871, + "rewards/rejected": -8.700067373422476, + "step": 510 + }, + { + "epoch": 0.1886391952378755, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.254180963472478e-06, + "logits/chosen": 1536171212.8, + "logits/rejected": 2071653436.235294, + "logps/chosen": -224.76726888020832, + "logps/rejected": -490.26941636029414, + "loss": 0.1261, + "rewards/chosen": 1.5379498799641926, + "rewards/margins": 8.906932531618605, + "rewards/rejected": -7.368982651654412, + "step": 511 + }, + { + "epoch": 0.1890083521757187, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 9.251085678648072e-06, + "logits/chosen": 1429697877.3333333, + "logits/rejected": 2144477476.5714285, + "logps/chosen": -338.524658203125, + "logps/rejected": -478.64208984375, + "loss": 0.1882, + "rewards/chosen": 1.0394471486409504, + "rewards/margins": 8.476882298787435, + "rewards/rejected": -7.437435150146484, + "step": 512 + }, + { + "epoch": 0.1893775091135619, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.247984503998466e-06, + "logits/chosen": 2323327096.470588, + "logits/rejected": 1814560358.4, + "logps/chosen": -203.08531996783088, + "logps/rejected": -510.0996419270833, + "loss": 0.1886, + "rewards/chosen": 1.1141486448400162, + "rewards/margins": 8.385692020491057, + "rewards/rejected": -7.271543375651041, + "step": 513 + }, + { + "epoch": 0.1897466660514051, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.2448774438203e-06, + "logits/chosen": 1847028480.0, + "logits/rejected": 2033866752.0, + "logps/chosen": -270.27276611328125, + "logps/rejected": -645.2341918945312, + "loss": 0.176, + "rewards/chosen": 1.5080769062042236, + "rewards/margins": 9.613614320755005, + "rewards/rejected": -8.105537414550781, + "step": 514 + }, + { + "epoch": 0.1901158229892483, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.241764502418365e-06, + "logits/chosen": 1933643776.0, + "logits/rejected": 2672773443.368421, + "logps/chosen": -319.96585787259613, + "logps/rejected": -610.021638569079, + "loss": 0.1894, + "rewards/chosen": 0.6091128496023325, + "rewards/margins": 6.803653174566353, + "rewards/rejected": -6.194540324964021, + "step": 515 + }, + { + "epoch": 0.1904849799270915, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.238645684105606e-06, + "logits/chosen": 1463426779.4285715, + "logits/rejected": 1637662947.5555556, + "logps/chosen": -187.28548758370536, + "logps/rejected": -489.7932400173611, + "loss": 0.1173, + "rewards/chosen": 2.0201211656842912, + "rewards/margins": 9.265878374614413, + "rewards/rejected": -7.245757208930121, + "step": 516 + }, + { + "epoch": 0.19085413686493471, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 9.2355209932031e-06, + "logits/chosen": 2076992170.6666667, + "logits/rejected": 1505105920.0, + "logps/chosen": -251.32462565104166, + "logps/rejected": -365.79561941964283, + "loss": 0.1951, + "rewards/chosen": 1.523885515001085, + "rewards/margins": 7.000006600031777, + "rewards/rejected": -5.476121085030692, + "step": 517 + }, + { + "epoch": 0.19122329380277792, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.232390434040071e-06, + "logits/chosen": 1723503411.2, + "logits/rejected": 1271525376.0, + "logps/chosen": -257.61630859375, + "logps/rejected": -454.79833984375, + "loss": 0.2204, + "rewards/chosen": 1.1533077239990235, + "rewards/margins": 9.002512105305989, + "rewards/rejected": -7.849204381306966, + "step": 518 + }, + { + "epoch": 0.19159245074062112, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.229254010953868e-06, + "logits/chosen": 2019313904.9411764, + "logits/rejected": 1638672247.4666667, + "logps/chosen": -271.79541015625, + "logps/rejected": -538.7889973958333, + "loss": 0.1815, + "rewards/chosen": 1.1583520103903377, + "rewards/margins": 9.723732869765339, + "rewards/rejected": -8.565380859375, + "step": 519 + }, + { + "epoch": 0.19196160767846432, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.226111728289963e-06, + "logits/chosen": 2070273365.3333333, + "logits/rejected": 2225796505.6, + "logps/chosen": -237.7210489908854, + "logps/rejected": -393.187060546875, + "loss": 0.1627, + "rewards/chosen": 0.5267066955566406, + "rewards/margins": 7.491919708251953, + "rewards/rejected": -6.965213012695313, + "step": 520 + }, + { + "epoch": 0.19233076461630752, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.222963590401953e-06, + "logits/chosen": 1506792999.3846154, + "logits/rejected": 1365210812.631579, + "logps/chosen": -337.86328125, + "logps/rejected": -517.5747327302631, + "loss": 0.1248, + "rewards/chosen": 1.7331085205078125, + "rewards/margins": 8.621051989103618, + "rewards/rejected": -6.887943468595806, + "step": 521 + }, + { + "epoch": 0.19269992155415072, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 9.21980960165154e-06, + "logits/chosen": 1434353408.0, + "logits/rejected": 1940198144.0, + "logps/chosen": -328.9886779785156, + "logps/rejected": -441.9497375488281, + "loss": 0.1721, + "rewards/chosen": 1.191384196281433, + "rewards/margins": 8.249430537223816, + "rewards/rejected": -7.058046340942383, + "step": 522 + }, + { + "epoch": 0.1930690784919939, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 9.216649766408536e-06, + "logits/chosen": 1948139929.6, + "logits/rejected": 2137723843.764706, + "logps/chosen": -283.740625, + "logps/rejected": -461.90935202205884, + "loss": 0.1831, + "rewards/chosen": 1.0980261484781901, + "rewards/margins": 8.45220553080241, + "rewards/rejected": -7.354179382324219, + "step": 523 + }, + { + "epoch": 0.1934382354298371, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 9.213484089050853e-06, + "logits/chosen": 1227787324.235294, + "logits/rejected": 1396557141.3333333, + "logps/chosen": -222.26766429227942, + "logps/rejected": -410.88671875, + "loss": 0.1888, + "rewards/chosen": 1.3342289644129135, + "rewards/margins": 10.131850627824372, + "rewards/rejected": -8.797621663411459, + "step": 524 + }, + { + "epoch": 0.1938073923676803, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 9.210312573964496e-06, + "logits/chosen": 1975815976.4210527, + "logits/rejected": 2138181947.0769231, + "logps/chosen": -286.6119449013158, + "logps/rejected": -434.5, + "loss": 0.2473, + "rewards/chosen": 0.774317741394043, + "rewards/margins": 7.7571216729971075, + "rewards/rejected": -6.982803931603065, + "step": 525 + }, + { + "epoch": 0.1941765493055235, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 9.207135225543557e-06, + "logits/chosen": 2058776094.1176472, + "logits/rejected": 1696930065.0666666, + "logps/chosen": -272.95211971507354, + "logps/rejected": -418.25413411458334, + "loss": 0.1907, + "rewards/chosen": 1.1297698301427506, + "rewards/margins": 7.756622247134938, + "rewards/rejected": -6.626852416992188, + "step": 526 + }, + { + "epoch": 0.1945457062433667, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 9.203952048190217e-06, + "logits/chosen": 1456261802.6666667, + "logits/rejected": 1508054308.5714285, + "logps/chosen": -273.5607638888889, + "logps/rejected": -459.2677525111607, + "loss": 0.1944, + "rewards/chosen": 0.9496177037556967, + "rewards/margins": 8.545520282927013, + "rewards/rejected": -7.595902579171317, + "step": 527 + }, + { + "epoch": 0.1949148631812099, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.200763046314725e-06, + "logits/chosen": 1726218480.9411764, + "logits/rejected": 2044894276.2666667, + "logps/chosen": -251.31692325367646, + "logps/rejected": -491.96669921875, + "loss": 0.1647, + "rewards/chosen": 1.305674721212948, + "rewards/margins": 8.819770281922583, + "rewards/rejected": -7.514095560709635, + "step": 528 + }, + { + "epoch": 0.1952840201190531, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.197568224335401e-06, + "logits/chosen": 2031852604.235294, + "logits/rejected": 2154188253.866667, + "logps/chosen": -327.15558938419116, + "logps/rejected": -386.21259765625, + "loss": 0.156, + "rewards/chosen": 1.687422696281882, + "rewards/margins": 8.830508938957664, + "rewards/rejected": -7.143086242675781, + "step": 529 + }, + { + "epoch": 0.1956531770568963, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.194367586678634e-06, + "logits/chosen": 1813981440.0, + "logits/rejected": 1483889152.0, + "logps/chosen": -263.4017333984375, + "logps/rejected": -545.5291748046875, + "loss": 0.1329, + "rewards/chosen": 1.5658973455429077, + "rewards/margins": 10.041785597801208, + "rewards/rejected": -8.4758882522583, + "step": 530 + }, + { + "epoch": 0.1960223339947395, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.191161137778861e-06, + "logits/chosen": 1460036461.7142856, + "logits/rejected": 2147167345.7777777, + "logps/chosen": -230.32803780691964, + "logps/rejected": -490.8362087673611, + "loss": 0.1394, + "rewards/chosen": 1.4165964126586914, + "rewards/margins": 9.720474349127876, + "rewards/rejected": -8.303877936469185, + "step": 531 + }, + { + "epoch": 0.1963914909325827, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.187948882078582e-06, + "logits/chosen": 2209746688.0, + "logits/rejected": 2317312256.0, + "logps/chosen": -290.333984375, + "logps/rejected": -435.89013671875, + "loss": 0.1986, + "rewards/chosen": 0.5478121638298035, + "rewards/margins": 8.063519179821014, + "rewards/rejected": -7.515707015991211, + "step": 532 + }, + { + "epoch": 0.1967606478704259, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 9.184730824028334e-06, + "logits/chosen": 1651792256.0, + "logits/rejected": 1720465536.0, + "logps/chosen": -273.79901123046875, + "logps/rejected": -401.3411865234375, + "loss": 0.1557, + "rewards/chosen": 1.341324806213379, + "rewards/margins": 9.602961540222168, + "rewards/rejected": -8.261636734008789, + "step": 533 + }, + { + "epoch": 0.1971298048082691, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.181506968086696e-06, + "logits/chosen": 2016805683.2, + "logits/rejected": 1754403498.6666667, + "logps/chosen": -191.06611328125, + "logps/rejected": -412.4502360026042, + "loss": 0.2163, + "rewards/chosen": 1.2251964569091798, + "rewards/margins": 7.654104487101238, + "rewards/rejected": -6.428908030192058, + "step": 534 + }, + { + "epoch": 0.1974989617461123, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.178277318720279e-06, + "logits/chosen": 1795798766.9333334, + "logits/rejected": 1837004559.0588236, + "logps/chosen": -263.09851888020836, + "logps/rejected": -435.7174287683824, + "loss": 0.1544, + "rewards/chosen": 1.2812094370524088, + "rewards/margins": 7.581780654308843, + "rewards/rejected": -6.300571217256434, + "step": 535 + }, + { + "epoch": 0.1978681186839555, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 9.175041880403721e-06, + "logits/chosen": 2105398452.7058823, + "logits/rejected": 1484027630.9333334, + "logps/chosen": -196.0568129595588, + "logps/rejected": -516.4480143229167, + "loss": 0.1913, + "rewards/chosen": 0.9719672483556411, + "rewards/margins": 9.985456017886891, + "rewards/rejected": -9.01348876953125, + "step": 536 + }, + { + "epoch": 0.19823727562179871, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 9.171800657619683e-06, + "logits/chosen": 2277996701.5384617, + "logits/rejected": 2253424316.631579, + "logps/chosen": -312.363037109375, + "logps/rejected": -451.5133120888158, + "loss": 0.1669, + "rewards/chosen": 0.6535064990703876, + "rewards/margins": 8.331756460521868, + "rewards/rejected": -7.67824996145148, + "step": 537 + }, + { + "epoch": 0.19860643255964192, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.168553654858834e-06, + "logits/chosen": 1963455360.0, + "logits/rejected": 1795472256.0, + "logps/chosen": -280.07733154296875, + "logps/rejected": -462.04681396484375, + "loss": 0.157, + "rewards/chosen": 1.5113461017608643, + "rewards/margins": 8.006688833236694, + "rewards/rejected": -6.49534273147583, + "step": 538 + }, + { + "epoch": 0.19897558949748512, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 9.165300876619857e-06, + "logits/chosen": 2262503619.047619, + "logits/rejected": 1666511778.909091, + "logps/chosen": -296.01922898065476, + "logps/rejected": -477.92751242897725, + "loss": 0.2336, + "rewards/chosen": 0.9710443587530226, + "rewards/margins": 7.999099029607071, + "rewards/rejected": -7.028054670854048, + "step": 539 + }, + { + "epoch": 0.19934474643532832, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 9.162042327409437e-06, + "logits/chosen": 2047733037.1764705, + "logits/rejected": 1576008635.7333333, + "logps/chosen": -258.44054457720586, + "logps/rejected": -452.567578125, + "loss": 0.1145, + "rewards/chosen": 1.7968803854549633, + "rewards/margins": 9.497731825884651, + "rewards/rejected": -7.7008514404296875, + "step": 540 + }, + { + "epoch": 0.19971390337317152, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.15877801174225e-06, + "logits/chosen": 2478298180.266667, + "logits/rejected": 2671404935.529412, + "logps/chosen": -315.15989583333334, + "logps/rejected": -444.2377355238971, + "loss": 0.1427, + "rewards/chosen": 1.4257303873697917, + "rewards/margins": 7.939315197514553, + "rewards/rejected": -6.513584810144761, + "step": 541 + }, + { + "epoch": 0.20008306031101472, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.155507934140962e-06, + "logits/chosen": 1719702905.2631578, + "logits/rejected": 1912967168.0, + "logps/chosen": -236.93313116776315, + "logps/rejected": -471.5680964543269, + "loss": 0.1785, + "rewards/chosen": 1.284210205078125, + "rewards/margins": 10.377217806302584, + "rewards/rejected": -9.093007601224459, + "step": 542 + }, + { + "epoch": 0.20045221724885792, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.152232099136227e-06, + "logits/chosen": 1952822613.3333333, + "logits/rejected": 1566212096.0, + "logps/chosen": -322.4392578125, + "logps/rejected": -393.16745174632354, + "loss": 0.1792, + "rewards/chosen": 1.3511577606201173, + "rewards/margins": 7.851636168536018, + "rewards/rejected": -6.5004784079159, + "step": 543 + }, + { + "epoch": 0.20082137418670112, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 9.148950511266674e-06, + "logits/chosen": 1664007509.3333333, + "logits/rejected": 1545862348.8, + "logps/chosen": -291.77972412109375, + "logps/rejected": -448.739453125, + "loss": 0.1312, + "rewards/chosen": 1.9782946904500325, + "rewards/margins": 7.589177640279134, + "rewards/rejected": -5.610882949829102, + "step": 544 + }, + { + "epoch": 0.20119053112454433, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 9.1456631750789e-06, + "logits/chosen": 2050768640.0, + "logits/rejected": 1730386688.0, + "logps/chosen": -316.289794921875, + "logps/rejected": -456.49554443359375, + "loss": 0.2026, + "rewards/chosen": 0.947542130947113, + "rewards/margins": 6.61425107717514, + "rewards/rejected": -5.666708946228027, + "step": 545 + }, + { + "epoch": 0.20155968806238753, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 9.142370095127465e-06, + "logits/chosen": 1590782361.6, + "logits/rejected": 2142522187.2941177, + "logps/chosen": -316.83128255208334, + "logps/rejected": -556.9126263786765, + "loss": 0.136, + "rewards/chosen": 1.432281239827474, + "rewards/margins": 10.470899439793008, + "rewards/rejected": -9.038618199965534, + "step": 546 + }, + { + "epoch": 0.20192884500023073, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 9.139071275974894e-06, + "logits/chosen": 1994018028.3076923, + "logits/rejected": 2116788008.4210527, + "logps/chosen": -310.19933143028845, + "logps/rejected": -401.5547645970395, + "loss": 0.118, + "rewards/chosen": 1.5993235661433294, + "rewards/margins": 8.001874514436915, + "rewards/rejected": -6.402550948293586, + "step": 547 + }, + { + "epoch": 0.20229800193807393, + "grad_norm": 13.5, + "kl": 0.5224275588989258, + "learning_rate": 9.135766722191655e-06, + "logits/chosen": 1761064960.0, + "logits/rejected": 1998165138.2857144, + "logps/chosen": -268.2634548611111, + "logps/rejected": -537.3767787388393, + "loss": 0.1693, + "rewards/chosen": 1.641847398546007, + "rewards/margins": 8.541684347485738, + "rewards/rejected": -6.899836948939732, + "step": 548 + }, + { + "epoch": 0.20266715887591713, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.132456438356165e-06, + "logits/chosen": 1566293040.7619047, + "logits/rejected": 1479971840.0, + "logps/chosen": -240.08909970238096, + "logps/rejected": -422.41122159090907, + "loss": 0.1641, + "rewards/chosen": 1.7491017296200706, + "rewards/margins": 10.319969094676889, + "rewards/rejected": -8.570867365056818, + "step": 549 + }, + { + "epoch": 0.20303631581376033, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 9.129140429054785e-06, + "logits/chosen": 1665969545.8461537, + "logits/rejected": 2158504474.9473686, + "logps/chosen": -350.14832481971155, + "logps/rejected": -565.6294202302631, + "loss": 0.1375, + "rewards/chosen": 1.0363346980168269, + "rewards/margins": 9.660001561709262, + "rewards/rejected": -8.623666863692435, + "step": 550 + }, + { + "epoch": 0.20340547275160353, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 9.125818698881798e-06, + "logits/chosen": 2100261091.5555556, + "logits/rejected": 1602453650.2857144, + "logps/chosen": -374.42222764756946, + "logps/rejected": -444.42703683035717, + "loss": 0.2465, + "rewards/chosen": 0.46811405817667645, + "rewards/margins": 7.718553838275728, + "rewards/rejected": -7.250439780099051, + "step": 551 + }, + { + "epoch": 0.20377462968944673, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.122491252439425e-06, + "logits/chosen": 1712183175.5294118, + "logits/rejected": 1576757248.0, + "logps/chosen": -247.01490693933823, + "logps/rejected": -373.6669596354167, + "loss": 0.1459, + "rewards/chosen": 1.5472046347225414, + "rewards/margins": 6.945182336545457, + "rewards/rejected": -5.397977701822916, + "step": 552 + }, + { + "epoch": 0.20414378662728994, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 9.119158094337794e-06, + "logits/chosen": 1938288867.5555556, + "logits/rejected": 1805902848.0, + "logps/chosen": -292.389404296875, + "logps/rejected": -692.8347516741071, + "loss": 0.2384, + "rewards/chosen": 0.642407152387831, + "rewards/margins": 10.807107251787942, + "rewards/rejected": -10.164700099400111, + "step": 553 + }, + { + "epoch": 0.20451294356513314, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 9.11581922919496e-06, + "logits/chosen": 2167668931.047619, + "logits/rejected": 2191192436.3636365, + "logps/chosen": -275.5000465029762, + "logps/rejected": -599.0743519176136, + "loss": 0.2574, + "rewards/chosen": 0.6368027641659691, + "rewards/margins": 9.732010672102758, + "rewards/rejected": -9.09520790793679, + "step": 554 + }, + { + "epoch": 0.20488210050297634, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.112474661636871e-06, + "logits/chosen": 1644161536.0, + "logits/rejected": 1943716249.6, + "logps/chosen": -185.45072428385416, + "logps/rejected": -418.820947265625, + "loss": 0.1206, + "rewards/chosen": 1.3512927691141765, + "rewards/margins": 6.789508406321208, + "rewards/rejected": -5.438215637207032, + "step": 555 + }, + { + "epoch": 0.20525125744081954, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 9.109124396297391e-06, + "logits/chosen": 1545126456.8888888, + "logits/rejected": 1875554304.0, + "logps/chosen": -204.2105712890625, + "logps/rejected": -493.33328683035717, + "loss": 0.177, + "rewards/chosen": 1.603835317823622, + "rewards/margins": 9.379460274227082, + "rewards/rejected": -7.77562495640346, + "step": 556 + }, + { + "epoch": 0.20562041437866274, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 9.105768437818263e-06, + "logits/chosen": 1924836944.8421052, + "logits/rejected": 1989770791.3846154, + "logps/chosen": -386.01112767269734, + "logps/rejected": -473.62349759615387, + "loss": 0.1918, + "rewards/chosen": 1.2474373265316612, + "rewards/margins": 7.677802668891937, + "rewards/rejected": -6.430365342360276, + "step": 557 + }, + { + "epoch": 0.20598957131650594, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 9.102406790849129e-06, + "logits/chosen": 1754000653.4736843, + "logits/rejected": 2242638611.6923075, + "logps/chosen": -299.9478053042763, + "logps/rejected": -591.8624924879807, + "loss": 0.2149, + "rewards/chosen": 1.003880450600072, + "rewards/margins": 9.462475726478978, + "rewards/rejected": -8.458595275878906, + "step": 558 + }, + { + "epoch": 0.20635872825434912, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.099039460047506e-06, + "logits/chosen": 1713453056.0, + "logits/rejected": 1666205559.4666667, + "logps/chosen": -239.04262408088235, + "logps/rejected": -407.6943033854167, + "loss": 0.186, + "rewards/chosen": 1.0664082695456112, + "rewards/margins": 7.2653849134258195, + "rewards/rejected": -6.198976643880209, + "step": 559 + }, + { + "epoch": 0.20672788519219232, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 9.09566645007879e-06, + "logits/chosen": 2019744654.2222223, + "logits/rejected": 2054881572.5714285, + "logps/chosen": -373.71446397569446, + "logps/rejected": -401.4518345424107, + "loss": 0.1705, + "rewards/chosen": 1.4559410942925348, + "rewards/margins": 9.07830077882797, + "rewards/rejected": -7.622359684535435, + "step": 560 + }, + { + "epoch": 0.20709704213003552, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.09228776561624e-06, + "logits/chosen": 1899064481.6842105, + "logits/rejected": 1670445528.6153846, + "logps/chosen": -327.5763003700658, + "logps/rejected": -514.2982271634615, + "loss": 0.203, + "rewards/chosen": 1.0184479763633327, + "rewards/margins": 8.24529335083749, + "rewards/rejected": -7.226845374474158, + "step": 561 + }, + { + "epoch": 0.20746619906787872, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 9.088903411340985e-06, + "logits/chosen": 1573458602.6666667, + "logits/rejected": 1942682038.857143, + "logps/chosen": -269.1371799045139, + "logps/rejected": -461.8994838169643, + "loss": 0.1754, + "rewards/chosen": 1.3602530161539714, + "rewards/margins": 8.284960156395321, + "rewards/rejected": -6.924707140241351, + "step": 562 + }, + { + "epoch": 0.20783535600572192, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 9.085513391942003e-06, + "logits/chosen": 1709549112.8888888, + "logits/rejected": 1265875090.2857144, + "logps/chosen": -268.36634657118054, + "logps/rejected": -443.36704799107144, + "loss": 0.1792, + "rewards/chosen": 1.6182136535644531, + "rewards/margins": 9.469799041748047, + "rewards/rejected": -7.851585388183594, + "step": 563 + }, + { + "epoch": 0.20820451294356512, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 9.08211771211612e-06, + "logits/chosen": 1362971306.6666667, + "logits/rejected": 1379651945.4117646, + "logps/chosen": -299.9892252604167, + "logps/rejected": -463.3130744485294, + "loss": 0.1783, + "rewards/chosen": 1.0788370768229167, + "rewards/margins": 9.094118305281096, + "rewards/rejected": -8.01528122845818, + "step": 564 + }, + { + "epoch": 0.20857366988140832, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.078716376568011e-06, + "logits/chosen": 1803551597.7142856, + "logits/rejected": 2502470314.6666665, + "logps/chosen": -199.92628696986608, + "logps/rejected": -503.97846137152777, + "loss": 0.1049, + "rewards/chosen": 1.9145941053118025, + "rewards/margins": 8.923256011236282, + "rewards/rejected": -7.0086619059244795, + "step": 565 + }, + { + "epoch": 0.20894282681925153, + "grad_norm": 22.75, + "kl": 3.969576358795166, + "learning_rate": 9.075309390010182e-06, + "logits/chosen": 2006638376.4210527, + "logits/rejected": 1806391768.6153846, + "logps/chosen": -269.13204152960526, + "logps/rejected": -466.76998197115387, + "loss": 0.2186, + "rewards/chosen": 1.7497229325143915, + "rewards/margins": 6.704926649085905, + "rewards/rejected": -4.955203716571514, + "step": 566 + }, + { + "epoch": 0.20931198375709473, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.07189675716297e-06, + "logits/chosen": 1584099514.1818182, + "logits/rejected": 1690752731.4285715, + "logps/chosen": -184.32218794389203, + "logps/rejected": -573.49609375, + "loss": 0.0659, + "rewards/chosen": 3.0985742048783735, + "rewards/margins": 11.60654728546803, + "rewards/rejected": -8.507973080589657, + "step": 567 + }, + { + "epoch": 0.20968114069493793, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 9.068478482754532e-06, + "logits/chosen": 2399958481.4545455, + "logits/rejected": 2145268687.2380953, + "logps/chosen": -313.9454900568182, + "logps/rejected": -536.1988467261905, + "loss": 0.1317, + "rewards/chosen": 0.9548492431640625, + "rewards/margins": 7.847540719168527, + "rewards/rejected": -6.892691476004464, + "step": 568 + }, + { + "epoch": 0.21005029763278113, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 9.065054571520846e-06, + "logits/chosen": 1584595416.6153846, + "logits/rejected": 1301092783.1578948, + "logps/chosen": -318.8743239182692, + "logps/rejected": -373.0796155427632, + "loss": 0.1537, + "rewards/chosen": 0.938760023850661, + "rewards/margins": 7.361528064557898, + "rewards/rejected": -6.422768040707237, + "step": 569 + }, + { + "epoch": 0.21041945457062433, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.061625028205699e-06, + "logits/chosen": 1455776699.7333333, + "logits/rejected": 1304073276.235294, + "logps/chosen": -280.13551432291666, + "logps/rejected": -432.4692957261029, + "loss": 0.1366, + "rewards/chosen": 1.6975513458251954, + "rewards/margins": 9.96963440390194, + "rewards/rejected": -8.272083058076745, + "step": 570 + }, + { + "epoch": 0.21078861150846753, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 9.058189857560675e-06, + "logits/chosen": 1510337929.8461537, + "logits/rejected": 2164830423.5789475, + "logps/chosen": -318.91011868990387, + "logps/rejected": -455.4263466282895, + "loss": 0.1296, + "rewards/chosen": 1.1510832859919622, + "rewards/margins": 9.861998299355449, + "rewards/rejected": -8.710915013363486, + "step": 571 + }, + { + "epoch": 0.21115776844631073, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 9.054749064345165e-06, + "logits/chosen": 1941602304.0, + "logits/rejected": 2153375744.0, + "logps/chosen": -290.41527035361844, + "logps/rejected": -569.1043419471154, + "loss": 0.2105, + "rewards/chosen": 1.2930553837826377, + "rewards/margins": 9.11125630212699, + "rewards/rejected": -7.818200918344351, + "step": 572 + }, + { + "epoch": 0.21152692538415394, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.05130265332634e-06, + "logits/chosen": 1237256738.1333334, + "logits/rejected": 1572677391.0588236, + "logps/chosen": -253.20830078125, + "logps/rejected": -439.21989889705884, + "loss": 0.1441, + "rewards/chosen": 1.5978641510009766, + "rewards/margins": 9.366781459135169, + "rewards/rejected": -7.768917308134191, + "step": 573 + }, + { + "epoch": 0.21189608232199714, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 9.04785062927916e-06, + "logits/chosen": 1750054473.142857, + "logits/rejected": 1767066624.0, + "logps/chosen": -289.7642299107143, + "logps/rejected": -438.2781575520833, + "loss": 0.1207, + "rewards/chosen": 1.5603437423706055, + "rewards/margins": 8.245320108201769, + "rewards/rejected": -6.684976365831163, + "step": 574 + }, + { + "epoch": 0.21226523925984034, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 9.04439299698636e-06, + "logits/chosen": 2106680320.0, + "logits/rejected": 2149581255.111111, + "logps/chosen": -217.13985770089286, + "logps/rejected": -511.48442925347223, + "loss": 0.1674, + "rewards/chosen": 0.6998860495431083, + "rewards/margins": 9.178852656530955, + "rewards/rejected": -8.478966606987846, + "step": 575 + }, + { + "epoch": 0.21263439619768354, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 9.040929761238448e-06, + "logits/chosen": 1783858907.4285715, + "logits/rejected": 1612699875.5555556, + "logps/chosen": -364.8817661830357, + "logps/rejected": -416.44829644097223, + "loss": 0.131, + "rewards/chosen": 1.5877128328595842, + "rewards/margins": 9.273552167983283, + "rewards/rejected": -7.685839335123698, + "step": 576 + }, + { + "epoch": 0.21300355313552674, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 9.03746092683369e-06, + "logits/chosen": 2298371120.7619047, + "logits/rejected": 1714261457.4545455, + "logps/chosen": -313.3645833333333, + "logps/rejected": -654.8312322443181, + "loss": 0.2546, + "rewards/chosen": 0.5874936694190616, + "rewards/margins": 7.647399675278437, + "rewards/rejected": -7.059906005859375, + "step": 577 + }, + { + "epoch": 0.21337271007336994, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 9.033986498578113e-06, + "logits/chosen": 1510029870.5454545, + "logits/rejected": 1497066496.0, + "logps/chosen": -245.35182883522728, + "logps/rejected": -401.3447998046875, + "loss": 0.2181, + "rewards/chosen": 1.1192637356844815, + "rewards/margins": 7.7077639493075285, + "rewards/rejected": -6.588500213623047, + "step": 578 + }, + { + "epoch": 0.21374186701121314, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 9.030506481285495e-06, + "logits/chosen": 1725962103.4666667, + "logits/rejected": 1504510192.9411764, + "logps/chosen": -308.36188151041665, + "logps/rejected": -474.1260340073529, + "loss": 0.1923, + "rewards/chosen": 0.859922981262207, + "rewards/margins": 8.62186612521901, + "rewards/rejected": -7.761943143956802, + "step": 579 + }, + { + "epoch": 0.21411102394905634, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 9.027020879777354e-06, + "logits/chosen": 1494290304.0, + "logits/rejected": 1917537280.0, + "logps/chosen": -259.0396423339844, + "logps/rejected": -476.64996337890625, + "loss": 0.1942, + "rewards/chosen": 0.7355548143386841, + "rewards/margins": 7.97956907749176, + "rewards/rejected": -7.244014263153076, + "step": 580 + }, + { + "epoch": 0.21448018088689955, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 9.023529698882946e-06, + "logits/chosen": 1687045504.0, + "logits/rejected": 2050980352.0, + "logps/chosen": -293.108642578125, + "logps/rejected": -401.1996154785156, + "loss": 0.1879, + "rewards/chosen": 1.0889683961868286, + "rewards/margins": 8.064456582069397, + "rewards/rejected": -6.975488185882568, + "step": 581 + }, + { + "epoch": 0.21484933782474275, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 9.020032943439258e-06, + "logits/chosen": 1862757139.6923077, + "logits/rejected": 2083863713.6842105, + "logps/chosen": -312.3418156550481, + "logps/rejected": -479.86703330592104, + "loss": 0.1783, + "rewards/chosen": 0.5547563479496882, + "rewards/margins": 8.033224590394179, + "rewards/rejected": -7.47846824244449, + "step": 582 + }, + { + "epoch": 0.21521849476258595, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.016530618291001e-06, + "logits/chosen": 1595973290.6666667, + "logits/rejected": 1898930790.4, + "logps/chosen": -337.49843343098956, + "logps/rejected": -532.58046875, + "loss": 0.1019, + "rewards/chosen": 1.6104737917582195, + "rewards/margins": 10.200187842051188, + "rewards/rejected": -8.589714050292969, + "step": 583 + }, + { + "epoch": 0.21558765170042915, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 9.013022728290604e-06, + "logits/chosen": 1673675605.3333333, + "logits/rejected": 1419395276.8, + "logps/chosen": -352.8475748697917, + "logps/rejected": -495.93935546875, + "loss": 0.1727, + "rewards/chosen": 0.8103640874226888, + "rewards/margins": 8.080707867940268, + "rewards/rejected": -7.270343780517578, + "step": 584 + }, + { + "epoch": 0.21595680863827235, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 9.009509278298201e-06, + "logits/chosen": 1310180420.2666667, + "logits/rejected": 1336335781.6470587, + "logps/chosen": -274.98138020833335, + "logps/rejected": -423.38786764705884, + "loss": 0.186, + "rewards/chosen": 0.7402849833170573, + "rewards/margins": 7.555808467491, + "rewards/rejected": -6.815523484173943, + "step": 585 + }, + { + "epoch": 0.21632596557611555, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 9.005990273181631e-06, + "logits/chosen": 1902469266.2857144, + "logits/rejected": 1657869994.6666667, + "logps/chosen": -287.6361781529018, + "logps/rejected": -411.61829969618054, + "loss": 0.1774, + "rewards/chosen": 0.8822000367300851, + "rewards/margins": 8.568849783095102, + "rewards/rejected": -7.686649746365017, + "step": 586 + }, + { + "epoch": 0.21669512251395875, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 9.002465717816436e-06, + "logits/chosen": 2128489418.1052632, + "logits/rejected": 1495049294.7692308, + "logps/chosen": -300.53140419407896, + "logps/rejected": -517.9981971153846, + "loss": 0.2234, + "rewards/chosen": 0.8977372018914473, + "rewards/margins": 9.218170598450943, + "rewards/rejected": -8.320433396559496, + "step": 587 + }, + { + "epoch": 0.21706427945180196, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 8.998935617085837e-06, + "logits/chosen": 2116407432.5333333, + "logits/rejected": 1388392448.0, + "logps/chosen": -209.25735677083333, + "logps/rejected": -495.6539522058824, + "loss": 0.203, + "rewards/chosen": 0.5550373713175456, + "rewards/margins": 7.457597437091902, + "rewards/rejected": -6.902560065774357, + "step": 588 + }, + { + "epoch": 0.21743343638964516, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 8.995399975880749e-06, + "logits/chosen": 1931104496.9411764, + "logits/rejected": 1907063739.7333333, + "logps/chosen": -330.6831916360294, + "logps/rejected": -584.0557942708333, + "loss": 0.1705, + "rewards/chosen": 1.3296071220846737, + "rewards/margins": 7.730576563816445, + "rewards/rejected": -6.400969441731771, + "step": 589 + }, + { + "epoch": 0.21780259332748836, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.991858799099755e-06, + "logits/chosen": 1355156299.2941177, + "logits/rejected": 1596645102.9333334, + "logps/chosen": -261.05534811580884, + "logps/rejected": -470.28310546875, + "loss": 0.187, + "rewards/chosen": 0.9710822385900161, + "rewards/margins": 8.149877608056162, + "rewards/rejected": -7.178795369466146, + "step": 590 + }, + { + "epoch": 0.21817175026533156, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.98831209164911e-06, + "logits/chosen": 2427628150.1538463, + "logits/rejected": 1891787614.3157895, + "logps/chosen": -272.9636418269231, + "logps/rejected": -424.48057154605266, + "loss": 0.1475, + "rewards/chosen": 1.3483134049635668, + "rewards/margins": 7.458520008967473, + "rewards/rejected": -6.110206604003906, + "step": 591 + }, + { + "epoch": 0.21854090720317476, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.984759858442734e-06, + "logits/chosen": 1370383587.5555556, + "logits/rejected": 1658954605.7142856, + "logps/chosen": -243.58902994791666, + "logps/rejected": -517.3915318080357, + "loss": 0.1532, + "rewards/chosen": 1.6982640160454645, + "rewards/margins": 9.038833648439438, + "rewards/rejected": -7.340569632393973, + "step": 592 + }, + { + "epoch": 0.21891006414101796, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 8.9812021044022e-06, + "logits/chosen": 1804773649.0666666, + "logits/rejected": 2078883358.1176472, + "logps/chosen": -246.70431315104167, + "logps/rejected": -248.9306640625, + "loss": 0.2012, + "rewards/chosen": 0.893220329284668, + "rewards/margins": 5.268978332070744, + "rewards/rejected": -4.375758002786076, + "step": 593 + }, + { + "epoch": 0.21927922107886116, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 8.97763883445673e-06, + "logits/chosen": 1296615168.0, + "logits/rejected": 1788024064.0, + "logps/chosen": -262.5074157714844, + "logps/rejected": -474.1465148925781, + "loss": 0.1508, + "rewards/chosen": 1.1971220970153809, + "rewards/margins": 8.397694110870361, + "rewards/rejected": -7.2005720138549805, + "step": 594 + }, + { + "epoch": 0.21964837801670434, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.97407005354319e-06, + "logits/chosen": 1575275081.142857, + "logits/rejected": 1664285582.2222223, + "logps/chosen": -254.67567661830358, + "logps/rejected": -533.3963758680555, + "loss": 0.0958, + "rewards/chosen": 2.0708652223859514, + "rewards/margins": 10.941850465441506, + "rewards/rejected": -8.870985243055555, + "step": 595 + }, + { + "epoch": 0.22001753495454754, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.970495766606083e-06, + "logits/chosen": 1953710295.5789473, + "logits/rejected": 1281999635.6923077, + "logps/chosen": -267.0317896792763, + "logps/rejected": -533.9694636418269, + "loss": 0.1741, + "rewards/chosen": 1.8573640522203947, + "rewards/margins": 9.819478826484218, + "rewards/rejected": -7.9621147742638225, + "step": 596 + }, + { + "epoch": 0.22038669189239074, + "grad_norm": 14.3125, + "kl": 0.02653789520263672, + "learning_rate": 8.966915978597532e-06, + "logits/chosen": 1988963669.3333333, + "logits/rejected": 2139787023.0588236, + "logps/chosen": -337.67734375, + "logps/rejected": -615.0726102941177, + "loss": 0.1601, + "rewards/chosen": 1.0363204956054688, + "rewards/margins": 11.261036682128907, + "rewards/rejected": -10.224716186523438, + "step": 597 + }, + { + "epoch": 0.22075584883023394, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 8.963330694477295e-06, + "logits/chosen": 1762913621.3333333, + "logits/rejected": 1614848722.8235295, + "logps/chosen": -208.40558268229168, + "logps/rejected": -323.7610868566176, + "loss": 0.1395, + "rewards/chosen": 1.715588633219401, + "rewards/margins": 7.319312944599226, + "rewards/rejected": -5.603724311379826, + "step": 598 + }, + { + "epoch": 0.22112500576807714, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 8.959739919212734e-06, + "logits/chosen": 2276996534.857143, + "logits/rejected": 1976095175.1111112, + "logps/chosen": -346.4030064174107, + "logps/rejected": -468.3058810763889, + "loss": 0.1851, + "rewards/chosen": 0.5459445204053607, + "rewards/margins": 7.559672768153842, + "rewards/rejected": -7.013728247748481, + "step": 599 + }, + { + "epoch": 0.22149416270592034, + "grad_norm": 12.5625, + "kl": 0.043280601501464844, + "learning_rate": 8.956143657778822e-06, + "logits/chosen": 2262869742.9333334, + "logits/rejected": 2173326034.8235292, + "logps/chosen": -280.6311848958333, + "logps/rejected": -345.14973000919116, + "loss": 0.1689, + "rewards/chosen": 1.3801097869873047, + "rewards/margins": 7.53739065282485, + "rewards/rejected": -6.157280865837546, + "step": 600 + }, + { + "epoch": 0.22186331964376355, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.952541915158137e-06, + "logits/chosen": 1803325124.9230769, + "logits/rejected": 2023171125.8947368, + "logps/chosen": -359.3541917067308, + "logps/rejected": -354.11937191611844, + "loss": 0.1633, + "rewards/chosen": 0.8156102987436148, + "rewards/margins": 6.361511145526099, + "rewards/rejected": -5.545900846782484, + "step": 601 + }, + { + "epoch": 0.22223247658160675, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.948934696340842e-06, + "logits/chosen": 1748525738.6666667, + "logits/rejected": 1933833758.1176472, + "logps/chosen": -286.5561848958333, + "logps/rejected": -437.5272575827206, + "loss": 0.1664, + "rewards/chosen": 1.1367459615071616, + "rewards/margins": 7.939574133181105, + "rewards/rejected": -6.802828171673943, + "step": 602 + }, + { + "epoch": 0.22260163351944995, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 8.945322006324698e-06, + "logits/chosen": 2229880459.6363635, + "logits/rejected": 1928157593.6, + "logps/chosen": -254.3050204190341, + "logps/rejected": -391.869189453125, + "loss": 0.2266, + "rewards/chosen": 1.2546343369917436, + "rewards/margins": 6.251790480180221, + "rewards/rejected": -4.997156143188477, + "step": 603 + }, + { + "epoch": 0.22297079045729315, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.941703850115037e-06, + "logits/chosen": 1916993672.5333333, + "logits/rejected": 1948162529.8823528, + "logps/chosen": -296.65638020833336, + "logps/rejected": -609.7596507352941, + "loss": 0.1389, + "rewards/chosen": 1.3807899475097656, + "rewards/margins": 9.322025658102596, + "rewards/rejected": -7.941235710592831, + "step": 604 + }, + { + "epoch": 0.22333994739513635, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 8.938080232724773e-06, + "logits/chosen": 1908749952.0, + "logits/rejected": 1511897088.0, + "logps/chosen": -224.04574584960938, + "logps/rejected": -533.3795166015625, + "loss": 0.1328, + "rewards/chosen": 1.847679853439331, + "rewards/margins": 13.875939130783081, + "rewards/rejected": -12.02825927734375, + "step": 605 + }, + { + "epoch": 0.22370910433297955, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.934451159174377e-06, + "logits/chosen": 1820361216.0, + "logits/rejected": 1917167872.0, + "logps/chosen": -344.0973205566406, + "logps/rejected": -413.4035949707031, + "loss": 0.1617, + "rewards/chosen": 0.9643862843513489, + "rewards/margins": 7.9102190136909485, + "rewards/rejected": -6.9458327293396, + "step": 606 + }, + { + "epoch": 0.22407826127082275, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 8.930816634491887e-06, + "logits/chosen": 2328598528.0, + "logits/rejected": 1740394837.3333333, + "logps/chosen": -286.82571847098217, + "logps/rejected": -474.1115451388889, + "loss": 0.1548, + "rewards/chosen": 0.9134588922773089, + "rewards/margins": 7.879216474200052, + "rewards/rejected": -6.965757581922743, + "step": 607 + }, + { + "epoch": 0.22444741820866596, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.927176663712892e-06, + "logits/chosen": 1838034670.9333334, + "logits/rejected": 1583910189.1764705, + "logps/chosen": -304.90810546875, + "logps/rejected": -457.36804917279414, + "loss": 0.1687, + "rewards/chosen": 1.0260272343953452, + "rewards/margins": 8.023338994792864, + "rewards/rejected": -6.997311760397518, + "step": 608 + }, + { + "epoch": 0.22481657514650916, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.923531251880524e-06, + "logits/chosen": 1449696870.4, + "logits/rejected": 1364100276.7058823, + "logps/chosen": -293.06865234375, + "logps/rejected": -417.5852481617647, + "loss": 0.1646, + "rewards/chosen": 0.9469762166341146, + "rewards/margins": 8.179127472522213, + "rewards/rejected": -7.232151255888097, + "step": 609 + }, + { + "epoch": 0.22518573208435236, + "grad_norm": 12.0625, + "kl": 0.6632614135742188, + "learning_rate": 8.919880404045452e-06, + "logits/chosen": 2320912128.0, + "logits/rejected": 2041535488.0, + "logps/chosen": -278.5690002441406, + "logps/rejected": -456.20611572265625, + "loss": 0.152, + "rewards/chosen": 1.4269788265228271, + "rewards/margins": 7.8543479442596436, + "rewards/rejected": -6.427369117736816, + "step": 610 + }, + { + "epoch": 0.22555488902219556, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.916224125265883e-06, + "logits/chosen": 1921330744.8888888, + "logits/rejected": 2043888786.2857144, + "logps/chosen": -284.8400065104167, + "logps/rejected": -496.33394949776783, + "loss": 0.2348, + "rewards/chosen": 0.5135703086853027, + "rewards/margins": 9.602342128753662, + "rewards/rejected": -9.08877182006836, + "step": 611 + }, + { + "epoch": 0.22592404596003876, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 8.912562420607545e-06, + "logits/chosen": 1664036608.0, + "logits/rejected": 1616830464.0, + "logps/chosen": -275.5440979003906, + "logps/rejected": -489.3852844238281, + "loss": 0.1934, + "rewards/chosen": 1.3444292545318604, + "rewards/margins": 8.888211488723755, + "rewards/rejected": -7.5437822341918945, + "step": 612 + }, + { + "epoch": 0.22629320289788196, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 8.90889529514368e-06, + "logits/chosen": 1697856512.0, + "logits/rejected": 1439162823.1111112, + "logps/chosen": -252.26491001674108, + "logps/rejected": -410.28325737847223, + "loss": 0.1397, + "rewards/chosen": 1.3609981536865234, + "rewards/margins": 8.333114412095812, + "rewards/rejected": -6.972116258409288, + "step": 613 + }, + { + "epoch": 0.22666235983572516, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.905222753955045e-06, + "logits/chosen": 1569975637.3333333, + "logits/rejected": 1470964589.7142856, + "logps/chosen": -230.00230577256946, + "logps/rejected": -439.30161830357144, + "loss": 0.2246, + "rewards/chosen": 1.1193210813734267, + "rewards/margins": 7.620468215336875, + "rewards/rejected": -6.501147133963449, + "step": 614 + }, + { + "epoch": 0.22703151677356836, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.901544802129903e-06, + "logits/chosen": 1395512500.7058823, + "logits/rejected": 1444220245.3333333, + "logps/chosen": -262.44646139705884, + "logps/rejected": -499.1049479166667, + "loss": 0.1793, + "rewards/chosen": 1.0909907397101908, + "rewards/margins": 8.905394019332586, + "rewards/rejected": -7.814403279622396, + "step": 615 + }, + { + "epoch": 0.22740067371141157, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.897861444764004e-06, + "logits/chosen": 1501180177.0666666, + "logits/rejected": 1689819617.8823528, + "logps/chosen": -319.46611328125, + "logps/rejected": -375.2960994944853, + "loss": 0.1336, + "rewards/chosen": 1.53377685546875, + "rewards/margins": 7.811287105784697, + "rewards/rejected": -6.277510250315947, + "step": 616 + }, + { + "epoch": 0.22776983064925477, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.894172686960594e-06, + "logits/chosen": 1554227561.4117646, + "logits/rejected": 1765889501.8666666, + "logps/chosen": -296.76809512867646, + "logps/rejected": -412.38128255208335, + "loss": 0.1672, + "rewards/chosen": 1.4057897679946, + "rewards/margins": 7.17333381503236, + "rewards/rejected": -5.76754404703776, + "step": 617 + }, + { + "epoch": 0.22813898758709797, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 8.890478533830403e-06, + "logits/chosen": 1935020152.4705882, + "logits/rejected": 1634824192.0, + "logps/chosen": -277.04187729779414, + "logps/rejected": -516.9689778645833, + "loss": 0.1826, + "rewards/chosen": 0.9207604352165671, + "rewards/margins": 8.044114520503026, + "rewards/rejected": -7.123354085286459, + "step": 618 + }, + { + "epoch": 0.22850814452494117, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.886778990491632e-06, + "logits/chosen": 1908239246.2222223, + "logits/rejected": 1755994258.2857144, + "logps/chosen": -305.37470160590277, + "logps/rejected": -416.4071568080357, + "loss": 0.1867, + "rewards/chosen": 1.1695077684190538, + "rewards/margins": 7.7874075268942216, + "rewards/rejected": -6.6178997584751675, + "step": 619 + }, + { + "epoch": 0.22887730146278437, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 8.883074062069948e-06, + "logits/chosen": 2015700582.4, + "logits/rejected": 1793307927.2727273, + "logps/chosen": -388.994775390625, + "logps/rejected": -484.0183771306818, + "loss": 0.1075, + "rewards/chosen": 1.7233427047729493, + "rewards/margins": 9.696314534274014, + "rewards/rejected": -7.972971829501065, + "step": 620 + }, + { + "epoch": 0.22924645840062757, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.879363753698487e-06, + "logits/chosen": 1574212096.0, + "logits/rejected": 1616455552.0, + "logps/chosen": -246.962890625, + "logps/rejected": -468.5999450683594, + "loss": 0.1638, + "rewards/chosen": 1.1764740943908691, + "rewards/margins": 8.70206594467163, + "rewards/rejected": -7.525591850280762, + "step": 621 + }, + { + "epoch": 0.22961561533847077, + "grad_norm": 12.625, + "kl": 0.10536384582519531, + "learning_rate": 8.875648070517832e-06, + "logits/chosen": 2642551552.0, + "logits/rejected": 1582042368.0, + "logps/chosen": -285.5059509277344, + "logps/rejected": -482.621826171875, + "loss": 0.1332, + "rewards/chosen": 1.4250067472457886, + "rewards/margins": 8.993879675865173, + "rewards/rejected": -7.568872928619385, + "step": 622 + }, + { + "epoch": 0.22998477227631398, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.871927017676013e-06, + "logits/chosen": 2179116919.4666667, + "logits/rejected": 1518672112.9411764, + "logps/chosen": -312.80263671875, + "logps/rejected": -440.6478056066176, + "loss": 0.1738, + "rewards/chosen": 1.0501731236775715, + "rewards/margins": 8.328777967714796, + "rewards/rejected": -7.278604844037225, + "step": 623 + }, + { + "epoch": 0.23035392921415718, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.868200600328505e-06, + "logits/chosen": 2086838994.8235295, + "logits/rejected": 1449286314.6666667, + "logps/chosen": -246.6076229319853, + "logps/rejected": -404.45647786458335, + "loss": 0.1723, + "rewards/chosen": 1.2777229757869946, + "rewards/margins": 7.690595798866422, + "rewards/rejected": -6.412872823079427, + "step": 624 + }, + { + "epoch": 0.23072308615200038, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.864468823638211e-06, + "logits/chosen": 1383989032.4210527, + "logits/rejected": 1829815689.8461537, + "logps/chosen": -226.97327302631578, + "logps/rejected": -455.8234675480769, + "loss": 0.2404, + "rewards/chosen": 0.7630898827000668, + "rewards/margins": 8.1317017319714, + "rewards/rejected": -7.368611849271334, + "step": 625 + }, + { + "epoch": 0.23109224308984358, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.860731692775459e-06, + "logits/chosen": 1965130988.3076923, + "logits/rejected": 2316218799.1578946, + "logps/chosen": -301.9336688701923, + "logps/rejected": -551.4968647203947, + "loss": 0.1547, + "rewards/chosen": 1.0046693361722505, + "rewards/margins": 9.682487232965014, + "rewards/rejected": -8.677817896792764, + "step": 626 + }, + { + "epoch": 0.23146140002768678, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 8.856989212917994e-06, + "logits/chosen": 2187031893.3333335, + "logits/rejected": 2074544686.5454545, + "logps/chosen": -259.6909644717262, + "logps/rejected": -592.6592240767045, + "loss": 0.2488, + "rewards/chosen": 0.6828241348266602, + "rewards/margins": 8.849679686806418, + "rewards/rejected": -8.166855551979758, + "step": 627 + }, + { + "epoch": 0.23183055696552998, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.853241389250981e-06, + "logits/chosen": 1737659847.1111112, + "logits/rejected": 1836216905.142857, + "logps/chosen": -252.59597439236111, + "logps/rejected": -586.3655133928571, + "loss": 0.2167, + "rewards/chosen": 0.8531558248731825, + "rewards/margins": 10.330153200361464, + "rewards/rejected": -9.476997375488281, + "step": 628 + }, + { + "epoch": 0.23219971390337318, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 8.849488226966975e-06, + "logits/chosen": 1546647893.3333333, + "logits/rejected": 1753084928.0, + "logps/chosen": -225.14888509114584, + "logps/rejected": -555.550537109375, + "loss": 0.1556, + "rewards/chosen": 0.7913695971171061, + "rewards/margins": 9.401956144968668, + "rewards/rejected": -8.610586547851563, + "step": 629 + }, + { + "epoch": 0.23256887084121639, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.84572973126594e-06, + "logits/chosen": 2628641353.142857, + "logits/rejected": 1986980522.6666667, + "logps/chosen": -239.04148646763392, + "logps/rejected": -448.5937771267361, + "loss": 0.1919, + "rewards/chosen": 0.5066382203783307, + "rewards/margins": 7.032596811415657, + "rewards/rejected": -6.525958591037327, + "step": 630 + }, + { + "epoch": 0.2329380277790596, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.84196590735522e-06, + "logits/chosen": 1930185344.0, + "logits/rejected": 1598667520.0, + "logps/chosen": -209.1876678466797, + "logps/rejected": -569.3306274414062, + "loss": 0.1554, + "rewards/chosen": 1.5321828126907349, + "rewards/margins": 9.476721405982971, + "rewards/rejected": -7.944538593292236, + "step": 631 + }, + { + "epoch": 0.23330718471690276, + "grad_norm": 12.375, + "kl": 0.7686805725097656, + "learning_rate": 8.83819676044955e-06, + "logits/chosen": 1234525485.1764705, + "logits/rejected": 1252758323.2, + "logps/chosen": -250.5320255055147, + "logps/rejected": -484.73046875, + "loss": 0.1991, + "rewards/chosen": 1.261500639073989, + "rewards/margins": 9.81878620222503, + "rewards/rejected": -8.557285563151042, + "step": 632 + }, + { + "epoch": 0.23367634165474596, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.83442229577103e-06, + "logits/chosen": 1887933056.0, + "logits/rejected": 1975230720.0, + "logps/chosen": -312.4565124511719, + "logps/rejected": -383.0641174316406, + "loss": 0.1796, + "rewards/chosen": 1.2086622714996338, + "rewards/margins": 6.871243715286255, + "rewards/rejected": -5.662581443786621, + "step": 633 + }, + { + "epoch": 0.23404549859258916, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.830642518549135e-06, + "logits/chosen": 2043774429.8666666, + "logits/rejected": 1816917895.5294118, + "logps/chosen": -249.179052734375, + "logps/rejected": -465.77404067095586, + "loss": 0.1461, + "rewards/chosen": 1.4289903004964193, + "rewards/margins": 9.38156239378686, + "rewards/rejected": -7.952572093290441, + "step": 634 + }, + { + "epoch": 0.23441465553043236, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.8268574340207e-06, + "logits/chosen": 1576409770.6666667, + "logits/rejected": 1504268288.0, + "logps/chosen": -264.4895833333333, + "logps/rejected": -492.52613740808823, + "loss": 0.1721, + "rewards/chosen": 0.9085037867228191, + "rewards/margins": 8.447358049130907, + "rewards/rejected": -7.538854262408088, + "step": 635 + }, + { + "epoch": 0.23478381246827557, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 8.823067047429908e-06, + "logits/chosen": 2060052747.1304348, + "logits/rejected": 2217257187.5555553, + "logps/chosen": -236.2571968410326, + "logps/rejected": -437.77940538194446, + "loss": 0.2949, + "rewards/chosen": 0.625471654145614, + "rewards/margins": 7.827743857379121, + "rewards/rejected": -7.202272203233507, + "step": 636 + }, + { + "epoch": 0.23515296940611877, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.819271364028294e-06, + "logits/chosen": 1693727232.0, + "logits/rejected": 1572822912.0, + "logps/chosen": -218.5090789794922, + "logps/rejected": -409.12890625, + "loss": 0.1736, + "rewards/chosen": 1.1553616523742676, + "rewards/margins": 7.355856418609619, + "rewards/rejected": -6.200494766235352, + "step": 637 + }, + { + "epoch": 0.23552212634396197, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.815470389074727e-06, + "logits/chosen": 1554153231.0588236, + "logits/rejected": 1720617233.0666666, + "logps/chosen": -238.52988568474265, + "logps/rejected": -495.54127604166666, + "loss": 0.1665, + "rewards/chosen": 1.2009148317224838, + "rewards/margins": 8.520997432633942, + "rewards/rejected": -7.320082600911459, + "step": 638 + }, + { + "epoch": 0.23589128328180517, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.811664127835412e-06, + "logits/chosen": 2188843008.0, + "logits/rejected": 1738923008.0, + "logps/chosen": -357.3623352050781, + "logps/rejected": -441.7335205078125, + "loss": 0.1967, + "rewards/chosen": 0.7807996869087219, + "rewards/margins": 7.317703068256378, + "rewards/rejected": -6.536903381347656, + "step": 639 + }, + { + "epoch": 0.23626044021964837, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.807852585583876e-06, + "logits/chosen": 1713496436.3636363, + "logits/rejected": 1442547712.0, + "logps/chosen": -291.25277432528407, + "logps/rejected": -471.3265904017857, + "loss": 0.1525, + "rewards/chosen": 0.6411446658047762, + "rewards/margins": 6.969198458122484, + "rewards/rejected": -6.328053792317708, + "step": 640 + }, + { + "epoch": 0.23662959715749157, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.80403576760096e-06, + "logits/chosen": 1652766993.0666666, + "logits/rejected": 2220258484.7058825, + "logps/chosen": -329.8064453125, + "logps/rejected": -502.8772403492647, + "loss": 0.1927, + "rewards/chosen": 0.5765282313028971, + "rewards/margins": 8.306226861243154, + "rewards/rejected": -7.729698629940257, + "step": 641 + }, + { + "epoch": 0.23699875409533477, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.800213679174818e-06, + "logits/chosen": 1691347968.0, + "logits/rejected": 1919845469.090909, + "logps/chosen": -277.1225341796875, + "logps/rejected": -481.07040127840907, + "loss": 0.1058, + "rewards/chosen": 1.1288966178894042, + "rewards/margins": 7.3966876723549575, + "rewards/rejected": -6.267791054465554, + "step": 642 + }, + { + "epoch": 0.23736791103317798, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.796386325600906e-06, + "logits/chosen": 1725575168.0, + "logits/rejected": 1640679531.7894738, + "logps/chosen": -303.6401179387019, + "logps/rejected": -466.49624794407896, + "loss": 0.1611, + "rewards/chosen": 0.9578496492826022, + "rewards/margins": 7.988468417271911, + "rewards/rejected": -7.030618767989309, + "step": 643 + }, + { + "epoch": 0.23773706797102118, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.79255371218197e-06, + "logits/chosen": 1507335561.8461537, + "logits/rejected": 2210447791.1578946, + "logps/chosen": -298.8276554987981, + "logps/rejected": -463.1732627467105, + "loss": 0.1151, + "rewards/chosen": 1.9234768794133112, + "rewards/margins": 9.787994091327374, + "rewards/rejected": -7.8645172119140625, + "step": 644 + }, + { + "epoch": 0.23810622490886438, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 8.78871584422805e-06, + "logits/chosen": 2378346203.428571, + "logits/rejected": 2268684288.0, + "logps/chosen": -253.41966029575892, + "logps/rejected": -402.89344618055554, + "loss": 0.1315, + "rewards/chosen": 1.354564939226423, + "rewards/margins": 7.773544584001813, + "rewards/rejected": -6.418979644775391, + "step": 645 + }, + { + "epoch": 0.23847538184670758, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.784872727056464e-06, + "logits/chosen": 2145537408.0, + "logits/rejected": 1621345920.0, + "logps/chosen": -317.9798583984375, + "logps/rejected": -487.2658386230469, + "loss": 0.1732, + "rewards/chosen": 1.084149718284607, + "rewards/margins": 8.736176371574402, + "rewards/rejected": -7.652026653289795, + "step": 646 + }, + { + "epoch": 0.23884453878455078, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.781024365991802e-06, + "logits/chosen": 2100527344.9411764, + "logits/rejected": 2055482299.7333333, + "logps/chosen": -355.20519301470586, + "logps/rejected": -447.6387044270833, + "loss": 0.1578, + "rewards/chosen": 1.2088603973388672, + "rewards/margins": 9.425849533081054, + "rewards/rejected": -8.216989135742187, + "step": 647 + }, + { + "epoch": 0.23921369572239398, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.777170766365916e-06, + "logits/chosen": 1895837549.7142856, + "logits/rejected": 2008193479.1111112, + "logps/chosen": -380.7836216517857, + "logps/rejected": -497.91596137152777, + "loss": 0.1857, + "rewards/chosen": 0.7978768348693848, + "rewards/margins": 9.084834469689262, + "rewards/rejected": -8.286957634819878, + "step": 648 + }, + { + "epoch": 0.23958285266023718, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.773311933517923e-06, + "logits/chosen": 2225705398.857143, + "logits/rejected": 2162490936.888889, + "logps/chosen": -241.17766462053572, + "logps/rejected": -530.4954427083334, + "loss": 0.1787, + "rewards/chosen": 1.0899677276611328, + "rewards/margins": 7.266934076944987, + "rewards/rejected": -6.1769663492838545, + "step": 649 + }, + { + "epoch": 0.23995200959808038, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 8.769447872794185e-06, + "logits/chosen": 1980485416.4210527, + "logits/rejected": 1419477307.0769231, + "logps/chosen": -327.4742495888158, + "logps/rejected": -478.44591346153845, + "loss": 0.2228, + "rewards/chosen": 0.754159224660773, + "rewards/margins": 7.879342330129523, + "rewards/rejected": -7.12518310546875, + "step": 650 + }, + { + "epoch": 0.24032116653592359, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.765578589548309e-06, + "logits/chosen": 1383262208.0, + "logits/rejected": 1900146688.0, + "logps/chosen": -229.96515764508928, + "logps/rejected": -408.4047580295139, + "loss": 0.1647, + "rewards/chosen": 1.2261815752301897, + "rewards/margins": 7.80615500798301, + "rewards/rejected": -6.579973432752821, + "step": 651 + }, + { + "epoch": 0.2406903234737668, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 8.76170408914114e-06, + "logits/chosen": 1413862478.7692308, + "logits/rejected": 1256695592.4210527, + "logps/chosen": -186.7420372596154, + "logps/rejected": -343.00375205592104, + "loss": 0.1453, + "rewards/chosen": 1.4604656512920673, + "rewards/margins": 7.380501565662955, + "rewards/rejected": -5.920035914370888, + "step": 652 + }, + { + "epoch": 0.24105948041161, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.757824376940748e-06, + "logits/chosen": 2001367040.0, + "logits/rejected": 1986912460.8, + "logps/chosen": -318.39524332682294, + "logps/rejected": -563.444140625, + "loss": 0.1138, + "rewards/chosen": 1.3615036010742188, + "rewards/margins": 9.812646484375, + "rewards/rejected": -8.451142883300781, + "step": 653 + }, + { + "epoch": 0.2414286373494532, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.75393945832242e-06, + "logits/chosen": 1493627426.1333334, + "logits/rejected": 1642464436.7058823, + "logps/chosen": -250.84716796875, + "logps/rejected": -474.68528837316177, + "loss": 0.1903, + "rewards/chosen": 0.8015295664469401, + "rewards/margins": 7.510263966579063, + "rewards/rejected": -6.708734400132123, + "step": 654 + }, + { + "epoch": 0.2417977942872964, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.75004933866867e-06, + "logits/chosen": 1490150784.0, + "logits/rejected": 1442845312.0, + "logps/chosen": -263.4288635253906, + "logps/rejected": -453.8269958496094, + "loss": 0.2033, + "rewards/chosen": 0.9110702872276306, + "rewards/margins": 7.131710469722748, + "rewards/rejected": -6.220640182495117, + "step": 655 + }, + { + "epoch": 0.2421669512251396, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 8.746154023369204e-06, + "logits/chosen": 1828971724.8, + "logits/rejected": 1851741967.0588236, + "logps/chosen": -291.86510416666664, + "logps/rejected": -472.20582490808823, + "loss": 0.1427, + "rewards/chosen": 1.4730523427327473, + "rewards/margins": 7.773016349942076, + "rewards/rejected": -6.299964007209329, + "step": 656 + }, + { + "epoch": 0.2425361081629828, + "grad_norm": 14.4375, + "kl": 0.06703472137451172, + "learning_rate": 8.742253517820933e-06, + "logits/chosen": 2721785012.7058825, + "logits/rejected": 2704604091.733333, + "logps/chosen": -299.48977481617646, + "logps/rejected": -401.200390625, + "loss": 0.2385, + "rewards/chosen": 0.39396050397087545, + "rewards/margins": 7.173337741926605, + "rewards/rejected": -6.779377237955729, + "step": 657 + }, + { + "epoch": 0.242905265100826, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.738347827427957e-06, + "logits/chosen": 1750829056.0, + "logits/rejected": 2413437747.2, + "logps/chosen": -276.5531364889706, + "logps/rejected": -638.8397135416667, + "loss": 0.2042, + "rewards/chosen": 0.9142261392929975, + "rewards/margins": 8.338099023407581, + "rewards/rejected": -7.423872884114584, + "step": 658 + }, + { + "epoch": 0.2432744220386692, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.734436957601564e-06, + "logits/chosen": 1813715259.0769231, + "logits/rejected": 2006152677.0526316, + "logps/chosen": -329.1476862980769, + "logps/rejected": -546.96875, + "loss": 0.1591, + "rewards/chosen": 1.241298822256235, + "rewards/margins": 8.937934018339705, + "rewards/rejected": -7.69663519608347, + "step": 659 + }, + { + "epoch": 0.2436435789765124, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 8.730520913760209e-06, + "logits/chosen": 1921491968.0, + "logits/rejected": 2296602168.888889, + "logps/chosen": -229.15150669642858, + "logps/rejected": -682.74169921875, + "loss": 0.1629, + "rewards/chosen": 0.8116139684404645, + "rewards/margins": 11.271374846261645, + "rewards/rejected": -10.45976087782118, + "step": 660 + }, + { + "epoch": 0.2440127359143556, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.726599701329526e-06, + "logits/chosen": 1634050529.8823528, + "logits/rejected": 1924187613.8666666, + "logps/chosen": -319.8782169117647, + "logps/rejected": -478.51100260416666, + "loss": 0.2059, + "rewards/chosen": 1.0886991164263558, + "rewards/margins": 8.558125080781824, + "rewards/rejected": -7.469425964355469, + "step": 661 + }, + { + "epoch": 0.2443818928521988, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.722673325742302e-06, + "logits/chosen": 1870096699.0769231, + "logits/rejected": 1707136377.2631578, + "logps/chosen": -353.32895132211536, + "logps/rejected": -560.3124486019736, + "loss": 0.1161, + "rewards/chosen": 1.4291331951434796, + "rewards/margins": 10.80392920922654, + "rewards/rejected": -9.37479601408306, + "step": 662 + }, + { + "epoch": 0.244751049790042, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.718741792438481e-06, + "logits/chosen": 1769429922.909091, + "logits/rejected": 2356020760.3809524, + "logps/chosen": -244.13831676136363, + "logps/rejected": -530.2338169642857, + "loss": 0.1267, + "rewards/chosen": 0.8344071128151633, + "rewards/margins": 8.404776288317395, + "rewards/rejected": -7.570369175502232, + "step": 663 + }, + { + "epoch": 0.2451202067278852, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 8.714805106865151e-06, + "logits/chosen": 2116740581.0526316, + "logits/rejected": 2088702109.5384614, + "logps/chosen": -357.04273745888156, + "logps/rejected": -397.4817457932692, + "loss": 0.2385, + "rewards/chosen": 0.6662938469334653, + "rewards/margins": 6.473946247023609, + "rewards/rejected": -5.807652400090144, + "step": 664 + }, + { + "epoch": 0.2454893636657284, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 8.710863274476544e-06, + "logits/chosen": 1738030501.6470587, + "logits/rejected": 1138794769.0666666, + "logps/chosen": -251.41595818014707, + "logps/rejected": -379.57652994791664, + "loss": 0.1653, + "rewards/chosen": 1.2679084329044117, + "rewards/margins": 7.372393499636182, + "rewards/rejected": -6.104485066731771, + "step": 665 + }, + { + "epoch": 0.2458585206035716, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.706916300734017e-06, + "logits/chosen": 1498765767.1111112, + "logits/rejected": 1142872795.4285715, + "logps/chosen": -239.326416015625, + "logps/rejected": -508.318603515625, + "loss": 0.1551, + "rewards/chosen": 1.605823940700955, + "rewards/margins": 9.487242441328744, + "rewards/rejected": -7.88141850062779, + "step": 666 + }, + { + "epoch": 0.2462276775414148, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.70296419110605e-06, + "logits/chosen": 2508578093.1764708, + "logits/rejected": 1667816925.8666666, + "logps/chosen": -272.29733455882354, + "logps/rejected": -500.83701171875, + "loss": 0.2335, + "rewards/chosen": 0.5442578371833352, + "rewards/margins": 9.282291894800524, + "rewards/rejected": -8.738034057617188, + "step": 667 + }, + { + "epoch": 0.24659683447925798, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.69900695106824e-06, + "logits/chosen": 1619647488.0, + "logits/rejected": 1496164352.0, + "logps/chosen": -242.96864536830358, + "logps/rejected": -359.64385308159723, + "loss": 0.1656, + "rewards/chosen": 1.025043146950858, + "rewards/margins": 7.142156896137056, + "rewards/rejected": -6.117113749186198, + "step": 668 + }, + { + "epoch": 0.24696599141710118, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 8.695044586103297e-06, + "logits/chosen": 1399412872.5333333, + "logits/rejected": 1689190159.0588236, + "logps/chosen": -256.296240234375, + "logps/rejected": -439.26907169117646, + "loss": 0.2092, + "rewards/chosen": 0.8171581268310547, + "rewards/margins": 8.446997855691349, + "rewards/rejected": -7.629839728860294, + "step": 669 + }, + { + "epoch": 0.24733514835494438, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.691077101701024e-06, + "logits/chosen": 1973901994.6666667, + "logits/rejected": 2083195465.142857, + "logps/chosen": -278.1073947482639, + "logps/rejected": -780.9238978794643, + "loss": 0.1737, + "rewards/chosen": 1.1396573384602864, + "rewards/margins": 31.758148375011626, + "rewards/rejected": -30.61849103655134, + "step": 670 + }, + { + "epoch": 0.24770430529278759, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 8.68710450335832e-06, + "logits/chosen": 1684105938.8235295, + "logits/rejected": 1960626585.6, + "logps/chosen": -240.85482249540442, + "logps/rejected": -530.4730794270833, + "loss": 0.1265, + "rewards/chosen": 1.709817549761604, + "rewards/margins": 11.87165226655848, + "rewards/rejected": -10.161834716796875, + "step": 671 + }, + { + "epoch": 0.2480734622306308, + "grad_norm": 11.625, + "kl": 0.2384204864501953, + "learning_rate": 8.683126796579173e-06, + "logits/chosen": 1599741659.4285715, + "logits/rejected": 1817046357.3333333, + "logps/chosen": -263.88612583705356, + "logps/rejected": -440.82823350694446, + "loss": 0.1585, + "rewards/chosen": 1.2255584171840124, + "rewards/margins": 8.73992032853384, + "rewards/rejected": -7.514361911349827, + "step": 672 + }, + { + "epoch": 0.248442619168474, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 8.679143986874643e-06, + "logits/chosen": 2451321304.6153846, + "logits/rejected": 1563392538.9473684, + "logps/chosen": -283.40542367788464, + "logps/rejected": -559.964689555921, + "loss": 0.1184, + "rewards/chosen": 1.419213661780724, + "rewards/margins": 11.654226318544703, + "rewards/rejected": -10.23501265676398, + "step": 673 + }, + { + "epoch": 0.2488117761063172, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.67515607976286e-06, + "logits/chosen": 1489568914.2857144, + "logits/rejected": 1679206855.1111112, + "logps/chosen": -302.54150390625, + "logps/rejected": -484.31260850694446, + "loss": 0.1522, + "rewards/chosen": 1.1830558776855469, + "rewards/margins": 9.90100818210178, + "rewards/rejected": -8.717952304416233, + "step": 674 + }, + { + "epoch": 0.2491809330441604, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.671163080769025e-06, + "logits/chosen": 1524868962.4615386, + "logits/rejected": 1854182885.0526316, + "logps/chosen": -251.41280423677884, + "logps/rejected": -561.5582853618421, + "loss": 0.1313, + "rewards/chosen": 1.2134993626521184, + "rewards/margins": 8.91554821647613, + "rewards/rejected": -7.702048853824013, + "step": 675 + }, + { + "epoch": 0.2495500899820036, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.66716499542538e-06, + "logits/chosen": 1884363922.2857144, + "logits/rejected": 2004013738.6666667, + "logps/chosen": -271.65733119419644, + "logps/rejected": -483.8679470486111, + "loss": 0.1577, + "rewards/chosen": 0.7499054500034877, + "rewards/margins": 7.706628133380224, + "rewards/rejected": -6.956722683376736, + "step": 676 + }, + { + "epoch": 0.2499192469198468, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.663161829271226e-06, + "logits/chosen": 1535478390.1538463, + "logits/rejected": 1708894315.7894738, + "logps/chosen": -280.3178898737981, + "logps/rejected": -537.8037109375, + "loss": 0.1748, + "rewards/chosen": 0.32710827313936675, + "rewards/margins": 9.31843726548106, + "rewards/rejected": -8.991328992341694, + "step": 677 + }, + { + "epoch": 0.25028840385769, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.659153587852895e-06, + "logits/chosen": 2179699200.0, + "logits/rejected": 1627045376.0, + "logps/chosen": -295.35888671875, + "logps/rejected": -551.3049926757812, + "loss": 0.173, + "rewards/chosen": 0.8541174530982971, + "rewards/margins": 9.764796316623688, + "rewards/rejected": -8.91067886352539, + "step": 678 + }, + { + "epoch": 0.25028840385769, + "eval_kl": 0.0, + "eval_logits/chosen": 3454141983.84689, + "eval_logits/rejected": 3476122779.151515, + "eval_logps/chosen": -293.949461722488, + "eval_logps/rejected": -474.12939664502164, + "eval_loss": 0.1509229987859726, + "eval_rewards/chosen": 1.3065299440228768, + "eval_rewards/margins": 8.946426221248256, + "eval_rewards/rejected": -7.639896277225379, + "eval_runtime": 109.3996, + "eval_samples_per_second": 8.007, + "eval_steps_per_second": 0.503, + "step": 678 + }, + { + "epoch": 0.2506575607955332, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.65514027672376e-06, + "logits/chosen": 1989280836.2666667, + "logits/rejected": 2138981797.6470587, + "logps/chosen": -367.14921875, + "logps/rejected": -477.27159926470586, + "loss": 0.1594, + "rewards/chosen": 1.8254585266113281, + "rewards/margins": 9.436805500703699, + "rewards/rejected": -7.611346974092371, + "step": 679 + }, + { + "epoch": 0.2510267177333764, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.651121901444208e-06, + "logits/chosen": 1722287581.8666666, + "logits/rejected": 1801929908.7058823, + "logps/chosen": -292.0982421875, + "logps/rejected": -464.30261948529414, + "loss": 0.1577, + "rewards/chosen": 1.1125287373860677, + "rewards/margins": 8.651902696198109, + "rewards/rejected": -7.539373958812041, + "step": 680 + }, + { + "epoch": 0.2513958746712196, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.64709846758165e-06, + "logits/chosen": 1762754852.5714285, + "logits/rejected": 2200898218.6666665, + "logps/chosen": -310.33182198660717, + "logps/rejected": -442.51963975694446, + "loss": 0.1514, + "rewards/chosen": 1.3102513722011022, + "rewards/margins": 8.338853124588255, + "rewards/rejected": -7.028601752387153, + "step": 681 + }, + { + "epoch": 0.25176503160906283, + "grad_norm": 12.875, + "kl": 0.11469078063964844, + "learning_rate": 8.643069980710502e-06, + "logits/chosen": 2413391872.0, + "logits/rejected": 1295672661.3333333, + "logps/chosen": -351.44215303308823, + "logps/rejected": -364.004296875, + "loss": 0.1634, + "rewards/chosen": 1.5754811904009651, + "rewards/margins": 9.508459502575445, + "rewards/rejected": -7.9329783121744795, + "step": 682 + }, + { + "epoch": 0.252134188546906, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.639036446412177e-06, + "logits/chosen": 2214952448.0, + "logits/rejected": 1822320000.0, + "logps/chosen": -340.1465148925781, + "logps/rejected": -526.520751953125, + "loss": 0.1663, + "rewards/chosen": 1.3681427240371704, + "rewards/margins": 10.997568726539612, + "rewards/rejected": -9.629426002502441, + "step": 683 + }, + { + "epoch": 0.25250334548474923, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 8.634997870275092e-06, + "logits/chosen": 1473706276.5714285, + "logits/rejected": 1904057344.0, + "logps/chosen": -327.3775111607143, + "logps/rejected": -511.8525390625, + "loss": 0.1452, + "rewards/chosen": 1.4785426003592355, + "rewards/margins": 9.379565617394826, + "rewards/rejected": -7.90102301703559, + "step": 684 + }, + { + "epoch": 0.2528725024225924, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.63095425789464e-06, + "logits/chosen": 2165988171.2941175, + "logits/rejected": 1829965277.8666666, + "logps/chosen": -330.62795840992646, + "logps/rejected": -520.0130208333334, + "loss": 0.1412, + "rewards/chosen": 1.8803940941305721, + "rewards/margins": 8.31009074940401, + "rewards/rejected": -6.429696655273437, + "step": 685 + }, + { + "epoch": 0.2532416593604356, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.62690561487319e-06, + "logits/chosen": 2282603520.0, + "logits/rejected": 2241282304.0, + "logps/chosen": -277.59735107421875, + "logps/rejected": -464.32916259765625, + "loss": 0.1615, + "rewards/chosen": 1.0696661472320557, + "rewards/margins": 7.556753396987915, + "rewards/rejected": -6.487087249755859, + "step": 686 + }, + { + "epoch": 0.2536108162982788, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 8.622851946820094e-06, + "logits/chosen": 1511956359.5294118, + "logits/rejected": 1509084501.3333333, + "logps/chosen": -268.2816521139706, + "logps/rejected": -497.06731770833335, + "loss": 0.1894, + "rewards/chosen": 1.186398898853975, + "rewards/margins": 8.537666395598768, + "rewards/rejected": -7.351267496744792, + "step": 687 + }, + { + "epoch": 0.253979973236122, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 8.618793259351655e-06, + "logits/chosen": 1786399232.0, + "logits/rejected": 1905487462.4, + "logps/chosen": -283.63295491536456, + "logps/rejected": -435.180615234375, + "loss": 0.1252, + "rewards/chosen": 1.3028504848480225, + "rewards/margins": 7.723425436019897, + "rewards/rejected": -6.420574951171875, + "step": 688 + }, + { + "epoch": 0.2543491301739652, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.614729558091129e-06, + "logits/chosen": 2401594187.2941175, + "logits/rejected": 1965033062.4, + "logps/chosen": -238.60291245404412, + "logps/rejected": -444.8243815104167, + "loss": 0.185, + "rewards/chosen": 1.054581361658433, + "rewards/margins": 10.65605027441885, + "rewards/rejected": -9.601468912760417, + "step": 689 + }, + { + "epoch": 0.2547182871118084, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.610660848668723e-06, + "logits/chosen": 1956186989.7142856, + "logits/rejected": 2602589980.4444447, + "logps/chosen": -284.50992257254467, + "logps/rejected": -441.7181803385417, + "loss": 0.1837, + "rewards/chosen": 0.5967070715767997, + "rewards/margins": 8.010884860205271, + "rewards/rejected": -7.414177788628472, + "step": 690 + }, + { + "epoch": 0.2550874440496516, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.606587136721586e-06, + "logits/chosen": 1773974118.4, + "logits/rejected": 1766915072.0, + "logps/chosen": -307.51025390625, + "logps/rejected": -474.328369140625, + "loss": 0.1978, + "rewards/chosen": 0.9682549476623535, + "rewards/margins": 7.710261249542237, + "rewards/rejected": -6.742006301879883, + "step": 691 + }, + { + "epoch": 0.2554566009874948, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.602508427893794e-06, + "logits/chosen": 1690416128.0, + "logits/rejected": 1653553766.4, + "logps/chosen": -270.0097249348958, + "logps/rejected": -494.12421875, + "loss": 0.1362, + "rewards/chosen": 1.0904799302419026, + "rewards/margins": 8.234240611394247, + "rewards/rejected": -7.143760681152344, + "step": 692 + }, + { + "epoch": 0.255825757925338, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 8.598424727836343e-06, + "logits/chosen": 1953130349.7142856, + "logits/rejected": 1840313139.2, + "logps/chosen": -264.21786063058033, + "logps/rejected": -587.3396484375, + "loss": 0.0843, + "rewards/chosen": 1.0863005093165807, + "rewards/margins": 10.267689669472832, + "rewards/rejected": -9.18138916015625, + "step": 693 + }, + { + "epoch": 0.2561949148631812, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.59433604220715e-06, + "logits/chosen": 1866516366.2222223, + "logits/rejected": 1585663853.7142856, + "logps/chosen": -211.74864366319446, + "logps/rejected": -438.4081333705357, + "loss": 0.1756, + "rewards/chosen": 1.3603085411919489, + "rewards/margins": 8.8062559158083, + "rewards/rejected": -7.445947374616351, + "step": 694 + }, + { + "epoch": 0.2565640718010244, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 8.590242376671035e-06, + "logits/chosen": 2369217929.8461537, + "logits/rejected": 2309797564.631579, + "logps/chosen": -334.5200946514423, + "logps/rejected": -531.6450452302631, + "loss": 0.1132, + "rewards/chosen": 1.693501692551833, + "rewards/margins": 8.747140351577325, + "rewards/rejected": -7.053638659025493, + "step": 695 + }, + { + "epoch": 0.2569332287388676, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 8.586143736899721e-06, + "logits/chosen": 1206721280.0, + "logits/rejected": 1069724928.0, + "logps/chosen": -232.7579803466797, + "logps/rejected": -439.3637390136719, + "loss": 0.1567, + "rewards/chosen": 1.3320797681808472, + "rewards/margins": 10.024652361869812, + "rewards/rejected": -8.692572593688965, + "step": 696 + }, + { + "epoch": 0.2573023856767108, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.582040128571822e-06, + "logits/chosen": 1749154669.7142856, + "logits/rejected": 1900851996.4444444, + "logps/chosen": -282.76475306919644, + "logps/rejected": -548.2631293402778, + "loss": 0.096, + "rewards/chosen": 2.03106267111642, + "rewards/margins": 9.221928081815205, + "rewards/rejected": -7.190865410698785, + "step": 697 + }, + { + "epoch": 0.257671542614554, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.577931557372832e-06, + "logits/chosen": 1864244284.235294, + "logits/rejected": 1581174237.8666666, + "logps/chosen": -300.5215418198529, + "logps/rejected": -367.42760416666664, + "loss": 0.1952, + "rewards/chosen": 0.9721404804902918, + "rewards/margins": 6.344993060242896, + "rewards/rejected": -5.372852579752604, + "step": 698 + }, + { + "epoch": 0.2580406995523972, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.573818028995129e-06, + "logits/chosen": 1502296960.0, + "logits/rejected": 1491697152.0, + "logps/chosen": -232.6980438232422, + "logps/rejected": -424.576904296875, + "loss": 0.197, + "rewards/chosen": 0.7530190944671631, + "rewards/margins": 7.363127946853638, + "rewards/rejected": -6.610108852386475, + "step": 699 + }, + { + "epoch": 0.2584098564902404, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.56969954913795e-06, + "logits/chosen": 1301741056.0, + "logits/rejected": 2037355904.0, + "logps/chosen": -178.26678466796875, + "logps/rejected": -490.1294860839844, + "loss": 0.1841, + "rewards/chosen": 0.9701017141342163, + "rewards/margins": 7.9521883726119995, + "rewards/rejected": -6.982086658477783, + "step": 700 + }, + { + "epoch": 0.2587790134280836, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 8.565576123507398e-06, + "logits/chosen": 1816361756.4444444, + "logits/rejected": 2888043475.478261, + "logps/chosen": -233.67787000868054, + "logps/rejected": -474.1322180706522, + "loss": 0.1049, + "rewards/chosen": 0.8033172289530436, + "rewards/margins": 8.390401791835176, + "rewards/rejected": -7.587084562882133, + "step": 701 + }, + { + "epoch": 0.2591481703659268, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.561447757816428e-06, + "logits/chosen": 3558549367.4666667, + "logits/rejected": 2512792636.2352943, + "logps/chosen": -348.08619791666666, + "logps/rejected": -412.71607881433823, + "loss": 0.1281, + "rewards/chosen": 1.9977976481119792, + "rewards/margins": 9.586786905924479, + "rewards/rejected": -7.5889892578125, + "step": 702 + }, + { + "epoch": 0.25951732730377003, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.557314457784838e-06, + "logits/chosen": 2499396039.111111, + "logits/rejected": 1837727597.7142856, + "logps/chosen": -230.98814561631946, + "logps/rejected": -402.01307896205356, + "loss": 0.1969, + "rewards/chosen": 1.1478605270385742, + "rewards/margins": 8.448952402387347, + "rewards/rejected": -7.301091875348773, + "step": 703 + }, + { + "epoch": 0.2598864842416132, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.553176229139262e-06, + "logits/chosen": 1905652224.0, + "logits/rejected": 2688879104.0, + "logps/chosen": -339.8748779296875, + "logps/rejected": -469.4305419921875, + "loss": 0.1593, + "rewards/chosen": 1.2111269235610962, + "rewards/margins": 7.6917964220047, + "rewards/rejected": -6.4806694984436035, + "step": 704 + }, + { + "epoch": 0.26025564117945643, + "grad_norm": 12.0, + "kl": 1.0053119659423828, + "learning_rate": 8.54903307761316e-06, + "logits/chosen": 1752171466.1052632, + "logits/rejected": 2005397504.0, + "logps/chosen": -301.2323961759868, + "logps/rejected": -369.26509915865387, + "loss": 0.1785, + "rewards/chosen": 1.498931884765625, + "rewards/margins": 7.979768606332632, + "rewards/rejected": -6.480836721567007, + "step": 705 + }, + { + "epoch": 0.2606247981172996, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.544885008946822e-06, + "logits/chosen": 1824079499.6363637, + "logits/rejected": 2094868382.4761906, + "logps/chosen": -216.76420454545453, + "logps/rejected": -661.0634300595239, + "loss": 0.1338, + "rewards/chosen": 1.1554926091974431, + "rewards/margins": 9.873177400399081, + "rewards/rejected": -8.717684791201638, + "step": 706 + }, + { + "epoch": 0.26099395505514283, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 8.54073202888734e-06, + "logits/chosen": 1374426453.3333333, + "logits/rejected": 1765861286.9565217, + "logps/chosen": -200.953125, + "logps/rejected": -346.5176842730978, + "loss": 0.1295, + "rewards/chosen": 0.5691349771287706, + "rewards/margins": 7.099452200719124, + "rewards/rejected": -6.5303172235903535, + "step": 707 + }, + { + "epoch": 0.261363111992986, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 8.536574143188619e-06, + "logits/chosen": 1763637248.0, + "logits/rejected": 1876370298.4347825, + "logps/chosen": -297.84312608506946, + "logps/rejected": -432.2619735054348, + "loss": 0.1234, + "rewards/chosen": 0.6439764764573839, + "rewards/margins": 8.457567811588158, + "rewards/rejected": -7.813591335130774, + "step": 708 + }, + { + "epoch": 0.26173226893082924, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.532411357611352e-06, + "logits/chosen": 1690759040.0, + "logits/rejected": 1620271616.0, + "logps/chosen": -281.0442199707031, + "logps/rejected": -496.8271484375, + "loss": 0.1358, + "rewards/chosen": 2.1863605976104736, + "rewards/margins": 8.861633539199829, + "rewards/rejected": -6.6752729415893555, + "step": 709 + }, + { + "epoch": 0.2621014258686724, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 8.528243677923028e-06, + "logits/chosen": 1590008685.7142856, + "logits/rejected": 1620823040.0, + "logps/chosen": -274.89578683035717, + "logps/rejected": -507.4638671875, + "loss": 0.1437, + "rewards/chosen": 1.1951545987810408, + "rewards/margins": 8.421536536443801, + "rewards/rejected": -7.226381937662761, + "step": 710 + }, + { + "epoch": 0.26247058280651564, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 8.524071109897915e-06, + "logits/chosen": 1575872512.0, + "logits/rejected": 1668048603.4285715, + "logps/chosen": -339.5906032986111, + "logps/rejected": -570.0492117745536, + "loss": 0.1921, + "rewards/chosen": 1.1478169759114583, + "rewards/margins": 11.254784356980098, + "rewards/rejected": -10.106967381068639, + "step": 711 + }, + { + "epoch": 0.2628397397443588, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 8.519893659317052e-06, + "logits/chosen": 1795548774.4, + "logits/rejected": 1825740458.6666667, + "logps/chosen": -289.83134765625, + "logps/rejected": -453.2017415364583, + "loss": 0.2297, + "rewards/chosen": 0.9489949226379395, + "rewards/margins": 7.313629754384358, + "rewards/rejected": -6.364634831746419, + "step": 712 + }, + { + "epoch": 0.26320889668220204, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 8.515711331968242e-06, + "logits/chosen": 2133171792.8421052, + "logits/rejected": 1882783113.8461537, + "logps/chosen": -277.95962685032896, + "logps/rejected": -415.3552433894231, + "loss": 0.1652, + "rewards/chosen": 1.6074349252801192, + "rewards/margins": 7.973598912659927, + "rewards/rejected": -6.3661639873798075, + "step": 713 + }, + { + "epoch": 0.2635780536200452, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.51152413364605e-06, + "logits/chosen": 1332163925.3333333, + "logits/rejected": 1867391337.4117646, + "logps/chosen": -222.390087890625, + "logps/rejected": -393.5350988051471, + "loss": 0.1325, + "rewards/chosen": 1.3882767995198568, + "rewards/margins": 7.548514728920132, + "rewards/rejected": -6.160237929400275, + "step": 714 + }, + { + "epoch": 0.26394721055788845, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.507332070151784e-06, + "logits/chosen": 2360593976.888889, + "logits/rejected": 1631233755.4285715, + "logps/chosen": -300.18126085069446, + "logps/rejected": -477.26991489955356, + "loss": 0.1997, + "rewards/chosen": 0.9707688225640191, + "rewards/margins": 8.314943343874008, + "rewards/rejected": -7.344174521309989, + "step": 715 + }, + { + "epoch": 0.2643163674957316, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 8.503135147293496e-06, + "logits/chosen": 1733139328.0, + "logits/rejected": 1562306304.0, + "logps/chosen": -262.3669128417969, + "logps/rejected": -496.5869140625, + "loss": 0.166, + "rewards/chosen": 1.1070632934570312, + "rewards/margins": 8.657933235168457, + "rewards/rejected": -7.550869941711426, + "step": 716 + }, + { + "epoch": 0.26468552443357485, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.498933370885967e-06, + "logits/chosen": 1360120346.9473684, + "logits/rejected": 1728848817.2307692, + "logps/chosen": -301.32090357730266, + "logps/rejected": -439.55431189903845, + "loss": 0.1702, + "rewards/chosen": 1.5223317397268195, + "rewards/margins": 8.47398954847081, + "rewards/rejected": -6.95165780874399, + "step": 717 + }, + { + "epoch": 0.265054681371418, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 8.494726746750705e-06, + "logits/chosen": 1330706295.4666667, + "logits/rejected": 2060726031.0588236, + "logps/chosen": -191.76277669270834, + "logps/rejected": -470.759765625, + "loss": 0.1443, + "rewards/chosen": 1.42223269144694, + "rewards/margins": 8.829470024856867, + "rewards/rejected": -7.407237333409927, + "step": 718 + }, + { + "epoch": 0.26542383830926125, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.490515280715937e-06, + "logits/chosen": 2364996186.352941, + "logits/rejected": 1840331980.8, + "logps/chosen": -285.1833065257353, + "logps/rejected": -549.5393229166667, + "loss": 0.1742, + "rewards/chosen": 1.0194485608269186, + "rewards/margins": 10.152565219355564, + "rewards/rejected": -9.133116658528646, + "step": 719 + }, + { + "epoch": 0.2657929952471044, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.486298978616593e-06, + "logits/chosen": 1410933097.4117646, + "logits/rejected": 1456810257.0666666, + "logps/chosen": -291.14318129595586, + "logps/rejected": -452.30983072916666, + "loss": 0.1742, + "rewards/chosen": 1.2349590974695541, + "rewards/margins": 9.11880614336799, + "rewards/rejected": -7.883847045898437, + "step": 720 + }, + { + "epoch": 0.26616215218494765, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 8.48207784629431e-06, + "logits/chosen": 1883839744.0, + "logits/rejected": 1376344064.0, + "logps/chosen": -354.7296142578125, + "logps/rejected": -386.4733581542969, + "loss": 0.1805, + "rewards/chosen": 0.8549596667289734, + "rewards/margins": 7.068763911724091, + "rewards/rejected": -6.213804244995117, + "step": 721 + }, + { + "epoch": 0.2665313091227908, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.477851889597408e-06, + "logits/chosen": 1465790825.4117646, + "logits/rejected": 1714474188.8, + "logps/chosen": -280.22009995404414, + "logps/rejected": -452.34583333333336, + "loss": 0.163, + "rewards/chosen": 1.5502988029928768, + "rewards/margins": 9.011663040460325, + "rewards/rejected": -7.461364237467448, + "step": 722 + }, + { + "epoch": 0.266900466060634, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 8.473621114380899e-06, + "logits/chosen": 2304839439.0588236, + "logits/rejected": 1537664614.4, + "logps/chosen": -368.36865234375, + "logps/rejected": -573.8295572916667, + "loss": 0.1895, + "rewards/chosen": 0.8097377103917739, + "rewards/margins": 9.270643166934743, + "rewards/rejected": -8.460905456542969, + "step": 723 + }, + { + "epoch": 0.26726962299847723, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.469385526506466e-06, + "logits/chosen": 2105896320.0, + "logits/rejected": 1929952256.0, + "logps/chosen": -342.3580322265625, + "logps/rejected": -517.1311645507812, + "loss": 0.1656, + "rewards/chosen": 1.0522119998931885, + "rewards/margins": 8.553890466690063, + "rewards/rejected": -7.501678466796875, + "step": 724 + }, + { + "epoch": 0.2676387799363204, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 8.465145131842467e-06, + "logits/chosen": 1209644347.0769231, + "logits/rejected": 2355036160.0, + "logps/chosen": -254.0584998497596, + "logps/rejected": -422.6581774259868, + "loss": 0.2067, + "rewards/chosen": 1.1839763934795673, + "rewards/margins": 6.944912929766574, + "rewards/rejected": -5.760936536287007, + "step": 725 + }, + { + "epoch": 0.26800793687416363, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 8.46089993626391e-06, + "logits/chosen": 1980739447.4666667, + "logits/rejected": 2029912304.9411764, + "logps/chosen": -375.4416015625, + "logps/rejected": -397.6094324448529, + "loss": 0.152, + "rewards/chosen": 1.1780044555664062, + "rewards/margins": 8.307536764705882, + "rewards/rejected": -7.129532309139476, + "step": 726 + }, + { + "epoch": 0.2683770938120068, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 8.456649945652463e-06, + "logits/chosen": 1993883921.0666666, + "logits/rejected": 1754258371.764706, + "logps/chosen": -191.17587890625, + "logps/rejected": -515.4721966911765, + "loss": 0.1267, + "rewards/chosen": 1.3963610331217449, + "rewards/margins": 9.150813009224686, + "rewards/rejected": -7.754451976102941, + "step": 727 + }, + { + "epoch": 0.26874625074985004, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.452395165896433e-06, + "logits/chosen": 1892826190.7692308, + "logits/rejected": 1999675823.1578948, + "logps/chosen": -325.47569861778845, + "logps/rejected": -589.1712582236842, + "loss": 0.1215, + "rewards/chosen": 1.2532466008112981, + "rewards/margins": 10.918916092227828, + "rewards/rejected": -9.66566949141653, + "step": 728 + }, + { + "epoch": 0.2691154076876932, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 8.448135602890763e-06, + "logits/chosen": 1941016389.8181818, + "logits/rejected": 1635075072.0, + "logps/chosen": -312.67844460227275, + "logps/rejected": -424.200146484375, + "loss": 0.2712, + "rewards/chosen": 0.6924879334189675, + "rewards/margins": 8.99737379767678, + "rewards/rejected": -8.304885864257812, + "step": 729 + }, + { + "epoch": 0.26948456462553644, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 8.443871262537023e-06, + "logits/chosen": 1789425810.2857144, + "logits/rejected": 1505312540.4444444, + "logps/chosen": -260.25106375558033, + "logps/rejected": -479.05723741319446, + "loss": 0.1432, + "rewards/chosen": 1.5229415893554688, + "rewards/margins": 10.195179409450954, + "rewards/rejected": -8.672237820095486, + "step": 730 + }, + { + "epoch": 0.2698537215633796, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.4396021507434e-06, + "logits/chosen": 2084863219.8095238, + "logits/rejected": 1822718510.5454545, + "logps/chosen": -295.35228329613096, + "logps/rejected": -397.00692471590907, + "loss": 0.1975, + "rewards/chosen": 1.3762493133544922, + "rewards/margins": 7.344661539251154, + "rewards/rejected": -5.968412225896662, + "step": 731 + }, + { + "epoch": 0.27022287850122284, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.4353282734247e-06, + "logits/chosen": 1485335009.8823528, + "logits/rejected": 1535449088.0, + "logps/chosen": -279.3338407628676, + "logps/rejected": -454.4908203125, + "loss": 0.178, + "rewards/chosen": 1.2829552818747127, + "rewards/margins": 8.180408589980182, + "rewards/rejected": -6.897453308105469, + "step": 732 + }, + { + "epoch": 0.270592035439066, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.431049636502322e-06, + "logits/chosen": 1759702747.4285715, + "logits/rejected": 1572019768.8888888, + "logps/chosen": -347.2534877232143, + "logps/rejected": -400.6532389322917, + "loss": 0.1198, + "rewards/chosen": 1.6595938546316964, + "rewards/margins": 9.098846919952877, + "rewards/rejected": -7.43925306532118, + "step": 733 + }, + { + "epoch": 0.27096119237690924, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 8.426766245904263e-06, + "logits/chosen": 2045888188.631579, + "logits/rejected": 1950828701.5384614, + "logps/chosen": -306.00899465460526, + "logps/rejected": -311.11609825721155, + "loss": 0.2255, + "rewards/chosen": 1.1355122014095909, + "rewards/margins": 6.30951110747179, + "rewards/rejected": -5.1739989060622, + "step": 734 + }, + { + "epoch": 0.2713303493147524, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 8.422478107565108e-06, + "logits/chosen": 1954966674.2857144, + "logits/rejected": 1370471082.6666667, + "logps/chosen": -198.89540318080358, + "logps/rejected": -435.7649197048611, + "loss": 0.1129, + "rewards/chosen": 1.7736165182931083, + "rewards/margins": 8.770435484628829, + "rewards/rejected": -6.99681896633572, + "step": 735 + }, + { + "epoch": 0.27169950625259565, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 8.418185227426016e-06, + "logits/chosen": 1740405009.0666666, + "logits/rejected": 1649257773.1764705, + "logps/chosen": -404.66588541666664, + "logps/rejected": -535.7639016544117, + "loss": 0.1833, + "rewards/chosen": 0.9322424570719401, + "rewards/margins": 9.032965723673502, + "rewards/rejected": -8.100723266601562, + "step": 736 + }, + { + "epoch": 0.2720686631904388, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 8.41388761143472e-06, + "logits/chosen": 1791831848.4210527, + "logits/rejected": 1640457137.2307692, + "logps/chosen": -232.26755242598685, + "logps/rejected": -422.34175931490387, + "loss": 0.1456, + "rewards/chosen": 1.6087160612407483, + "rewards/margins": 8.83876519454153, + "rewards/rejected": -7.230049133300781, + "step": 737 + }, + { + "epoch": 0.27243782012828205, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.409585265545509e-06, + "logits/chosen": 1417081514.6666667, + "logits/rejected": 2093712091.4285715, + "logps/chosen": -224.60643174913196, + "logps/rejected": -489.6946498325893, + "loss": 0.1888, + "rewards/chosen": 1.2593495051066081, + "rewards/margins": 8.408879643394833, + "rewards/rejected": -7.149530138288226, + "step": 738 + }, + { + "epoch": 0.2728069770661252, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.405278195719233e-06, + "logits/chosen": 2898794268.4444447, + "logits/rejected": 1607668004.5714285, + "logps/chosen": -299.5699869791667, + "logps/rejected": -415.4795619419643, + "loss": 0.1778, + "rewards/chosen": 1.3134018580118816, + "rewards/margins": 7.878941081819081, + "rewards/rejected": -6.565539223807199, + "step": 739 + }, + { + "epoch": 0.27317613400396845, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.40096640792328e-06, + "logits/chosen": 1600694810.9473684, + "logits/rejected": 1610778624.0, + "logps/chosen": -286.3563168174342, + "logps/rejected": -435.7476337139423, + "loss": 0.1579, + "rewards/chosen": 2.0813379789653577, + "rewards/margins": 8.825593044883327, + "rewards/rejected": -6.744255065917969, + "step": 740 + }, + { + "epoch": 0.2735452909418116, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 8.396649908131578e-06, + "logits/chosen": 992298461.8666667, + "logits/rejected": 1261353923.764706, + "logps/chosen": -222.095947265625, + "logps/rejected": -434.11646943933823, + "loss": 0.122, + "rewards/chosen": 2.016513188680013, + "rewards/margins": 8.744632085164389, + "rewards/rejected": -6.728118896484375, + "step": 741 + }, + { + "epoch": 0.27391444787965485, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.39232870232458e-06, + "logits/chosen": 1883850368.0, + "logits/rejected": 2926250496.0, + "logps/chosen": -327.1271057128906, + "logps/rejected": -513.7808227539062, + "loss": 0.1535, + "rewards/chosen": 1.3816542625427246, + "rewards/margins": 8.368366241455078, + "rewards/rejected": -6.9867119789123535, + "step": 742 + }, + { + "epoch": 0.27428360481749803, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 8.388002796489267e-06, + "logits/chosen": 1672761636.5714285, + "logits/rejected": 1775308913.7777777, + "logps/chosen": -233.78553989955358, + "logps/rejected": -427.9992404513889, + "loss": 0.1439, + "rewards/chosen": 1.558112961905343, + "rewards/margins": 9.18136159200517, + "rewards/rejected": -7.623248630099827, + "step": 743 + }, + { + "epoch": 0.27465276175534126, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.383672196619123e-06, + "logits/chosen": 1935238290.2857144, + "logits/rejected": 2633674379.6363635, + "logps/chosen": -263.06849888392856, + "logps/rejected": -494.3399769176136, + "loss": 0.2092, + "rewards/chosen": 1.2428042093912761, + "rewards/margins": 8.891675891298236, + "rewards/rejected": -7.64887168190696, + "step": 744 + }, + { + "epoch": 0.27502191869318443, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 8.37933690871414e-06, + "logits/chosen": 1874977249.8823528, + "logits/rejected": 1927916339.2, + "logps/chosen": -281.28937844669116, + "logps/rejected": -384.8652018229167, + "loss": 0.1381, + "rewards/chosen": 1.6131117203656364, + "rewards/margins": 7.994609421374751, + "rewards/rejected": -6.381497701009114, + "step": 745 + }, + { + "epoch": 0.27539107563102766, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.374996938780804e-06, + "logits/chosen": 1971765850.3529413, + "logits/rejected": 1536362222.9333334, + "logps/chosen": -354.9510857077206, + "logps/rejected": -447.5161458333333, + "loss": 0.1122, + "rewards/chosen": 2.0395447226131664, + "rewards/margins": 9.545179284787645, + "rewards/rejected": -7.505634562174479, + "step": 746 + }, + { + "epoch": 0.27576023256887083, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 8.370652292832087e-06, + "logits/chosen": 1677098154.6666667, + "logits/rejected": 1403976601.6, + "logps/chosen": -316.8774820963542, + "logps/rejected": -517.842724609375, + "loss": 0.0975, + "rewards/chosen": 1.9518934885660808, + "rewards/margins": 9.733116022745769, + "rewards/rejected": -7.781222534179688, + "step": 747 + }, + { + "epoch": 0.27612938950671406, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.366302976887442e-06, + "logits/chosen": 1577245582.2222223, + "logits/rejected": 1311952749.7142856, + "logps/chosen": -297.1344943576389, + "logps/rejected": -412.52762276785717, + "loss": 0.1604, + "rewards/chosen": 1.7567385567559137, + "rewards/margins": 9.2358122174702, + "rewards/rejected": -7.479073660714286, + "step": 748 + }, + { + "epoch": 0.27649854644455724, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.361948996972792e-06, + "logits/chosen": 2142604065.3913043, + "logits/rejected": 1724719900.4444444, + "logps/chosen": -261.6469089673913, + "logps/rejected": -394.0995822482639, + "loss": 0.2383, + "rewards/chosen": 1.134678550388502, + "rewards/margins": 8.576778227580341, + "rewards/rejected": -7.44209967719184, + "step": 749 + }, + { + "epoch": 0.27686770338240047, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 8.357590359120518e-06, + "logits/chosen": 1716166294.5882354, + "logits/rejected": 1356393676.8, + "logps/chosen": -256.6574276194853, + "logps/rejected": -518.3037109375, + "loss": 0.1659, + "rewards/chosen": 1.1612701416015625, + "rewards/margins": 10.466366577148438, + "rewards/rejected": -9.305096435546876, + "step": 750 + }, + { + "epoch": 0.27723686032024364, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 8.353227069369461e-06, + "logits/chosen": 1748869688.8888888, + "logits/rejected": 1602397622.857143, + "logps/chosen": -335.7532009548611, + "logps/rejected": -393.21888950892856, + "loss": 0.2198, + "rewards/chosen": 0.8964270485772027, + "rewards/margins": 8.122755762130495, + "rewards/rejected": -7.2263287135532925, + "step": 751 + }, + { + "epoch": 0.27760601725808687, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 8.348859133764902e-06, + "logits/chosen": 1615163632.9411764, + "logits/rejected": 1529911432.5333333, + "logps/chosen": -386.5240693933824, + "logps/rejected": -366.5708333333333, + "loss": 0.2168, + "rewards/chosen": 0.690982369815602, + "rewards/margins": 7.027823698754404, + "rewards/rejected": -6.336841328938802, + "step": 752 + }, + { + "epoch": 0.27797517419593004, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 8.34448655835856e-06, + "logits/chosen": 1645199667.2, + "logits/rejected": 1599855616.0, + "logps/chosen": -290.9887939453125, + "logps/rejected": -415.5536702473958, + "loss": 0.2095, + "rewards/chosen": 1.0834282875061034, + "rewards/margins": 7.789382521311442, + "rewards/rejected": -6.705954233805339, + "step": 753 + }, + { + "epoch": 0.27834433113377327, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.34010934920858e-06, + "logits/chosen": 1542402404.173913, + "logits/rejected": 1927808341.3333333, + "logps/chosen": -273.32033372961956, + "logps/rejected": -443.34190538194446, + "loss": 0.2449, + "rewards/chosen": 1.099161148071289, + "rewards/margins": 6.864378823174371, + "rewards/rejected": -5.765217675103082, + "step": 754 + }, + { + "epoch": 0.27871348807161644, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 8.335727512379535e-06, + "logits/chosen": 1669322752.0, + "logits/rejected": 1378362647.2727273, + "logps/chosen": -251.3404296875, + "logps/rejected": -428.9881480823864, + "loss": 0.0852, + "rewards/chosen": 1.3134403228759766, + "rewards/margins": 9.980798721313477, + "rewards/rejected": -8.6673583984375, + "step": 755 + }, + { + "epoch": 0.2790826450094597, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 8.3313410539424e-06, + "logits/chosen": 2006537947.4285715, + "logits/rejected": 1937155413.3333333, + "logps/chosen": -245.164794921875, + "logps/rejected": -386.92778862847223, + "loss": 0.1364, + "rewards/chosen": 1.4845929827008928, + "rewards/margins": 7.347633906773158, + "rewards/rejected": -5.863040924072266, + "step": 756 + }, + { + "epoch": 0.27945180194730285, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 8.32694997997456e-06, + "logits/chosen": 1701811792.8421052, + "logits/rejected": 1718808891.0769231, + "logps/chosen": -274.68716591282896, + "logps/rejected": -411.28064903846155, + "loss": 0.2414, + "rewards/chosen": 0.7195495304308439, + "rewards/margins": 7.341983119485832, + "rewards/rejected": -6.622433589054988, + "step": 757 + }, + { + "epoch": 0.279820958885146, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 8.322554296559792e-06, + "logits/chosen": 2085876736.0, + "logits/rejected": 1581271040.0, + "logps/chosen": -309.3119201660156, + "logps/rejected": -376.4004211425781, + "loss": 0.1942, + "rewards/chosen": 1.4214270114898682, + "rewards/margins": 7.609297037124634, + "rewards/rejected": -6.187870025634766, + "step": 758 + }, + { + "epoch": 0.28019011582298925, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 8.318154009788257e-06, + "logits/chosen": 2207913691.428571, + "logits/rejected": 1648308968.7272727, + "logps/chosen": -308.4857933407738, + "logps/rejected": -463.4357244318182, + "loss": 0.2117, + "rewards/chosen": 1.3291015625, + "rewards/margins": 7.837362809614702, + "rewards/rejected": -6.508261247114702, + "step": 759 + }, + { + "epoch": 0.2805592727608324, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 8.3137491257565e-06, + "logits/chosen": 2180984832.0, + "logits/rejected": 2946864274.285714, + "logps/chosen": -265.23008897569446, + "logps/rejected": -516.3490164620536, + "loss": 0.205, + "rewards/chosen": 1.1903200149536133, + "rewards/margins": 8.279749597821917, + "rewards/rejected": -7.089429582868304, + "step": 760 + }, + { + "epoch": 0.28092842969867565, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 8.30933965056743e-06, + "logits/chosen": 2080200294.4, + "logits/rejected": 1794530605.1764705, + "logps/chosen": -441.62604166666665, + "logps/rejected": -421.8765510110294, + "loss": 0.1558, + "rewards/chosen": 1.3187789916992188, + "rewards/margins": 8.952957602108226, + "rewards/rejected": -7.634178610409007, + "step": 761 + }, + { + "epoch": 0.2812975866365188, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.304925590330318e-06, + "logits/chosen": 2067771776.0, + "logits/rejected": 1599947648.0, + "logps/chosen": -328.9259338378906, + "logps/rejected": -520.1430053710938, + "loss": 0.1249, + "rewards/chosen": 1.9356510639190674, + "rewards/margins": 9.888720273971558, + "rewards/rejected": -7.95306921005249, + "step": 762 + }, + { + "epoch": 0.28166674357436206, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 8.300506951160789e-06, + "logits/chosen": 1608517778.2857144, + "logits/rejected": 1204006684.4444444, + "logps/chosen": -339.0216587611607, + "logps/rejected": -451.3554958767361, + "loss": 0.1558, + "rewards/chosen": 1.1144777025495256, + "rewards/margins": 8.15309939308772, + "rewards/rejected": -7.038621690538195, + "step": 763 + }, + { + "epoch": 0.28203590051220523, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 8.296083739180812e-06, + "logits/chosen": 2149351680.0, + "logits/rejected": 2357044224.0, + "logps/chosen": -316.3919372558594, + "logps/rejected": -444.80926513671875, + "loss": 0.1695, + "rewards/chosen": 1.044415831565857, + "rewards/margins": 9.435024619102478, + "rewards/rejected": -8.390608787536621, + "step": 764 + }, + { + "epoch": 0.28240505745004846, + "grad_norm": 11.5625, + "kl": 0.22266674041748047, + "learning_rate": 8.29165596051869e-06, + "logits/chosen": 1829449728.0, + "logits/rejected": 2631174553.6, + "logps/chosen": -277.0448404947917, + "logps/rejected": -524.19052734375, + "loss": 0.1437, + "rewards/chosen": 0.9586912790934244, + "rewards/margins": 7.74412218729655, + "rewards/rejected": -6.785430908203125, + "step": 765 + }, + { + "epoch": 0.28277421438789163, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.287223621309055e-06, + "logits/chosen": 1858469614.9333334, + "logits/rejected": 1870799570.8235295, + "logps/chosen": -284.45530598958334, + "logps/rejected": -694.7135799632352, + "loss": 0.1766, + "rewards/chosen": 0.7417887369791667, + "rewards/margins": 10.97066608503753, + "rewards/rejected": -10.228877348058363, + "step": 766 + }, + { + "epoch": 0.28314337132573486, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.282786727692856e-06, + "logits/chosen": 1748838184.4210527, + "logits/rejected": 1371550956.3076923, + "logps/chosen": -235.44140625, + "logps/rejected": -445.85990084134613, + "loss": 0.1579, + "rewards/chosen": 1.8203420137104236, + "rewards/margins": 9.876913974159642, + "rewards/rejected": -8.056571960449219, + "step": 767 + }, + { + "epoch": 0.28351252826357803, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.278345285817353e-06, + "logits/chosen": 1767820107.2941177, + "logits/rejected": 1284047530.6666667, + "logps/chosen": -329.79181985294116, + "logps/rejected": -441.61217447916664, + "loss": 0.1603, + "rewards/chosen": 1.191459431367762, + "rewards/margins": 8.286346719779221, + "rewards/rejected": -7.094887288411458, + "step": 768 + }, + { + "epoch": 0.28388168520142126, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 8.273899301836111e-06, + "logits/chosen": 2094867865.6, + "logits/rejected": 2216908458.6666665, + "logps/chosen": -278.3982177734375, + "logps/rejected": -437.493896484375, + "loss": 0.1828, + "rewards/chosen": 1.2558897018432618, + "rewards/margins": 8.385862795511882, + "rewards/rejected": -7.12997309366862, + "step": 769 + }, + { + "epoch": 0.28425084213926444, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 8.269448781908977e-06, + "logits/chosen": 1579163216.8421052, + "logits/rejected": 1355386564.9230769, + "logps/chosen": -310.46767064144734, + "logps/rejected": -474.17886117788464, + "loss": 0.1883, + "rewards/chosen": 1.2862842961361534, + "rewards/margins": 8.333298972743727, + "rewards/rejected": -7.0470146766075725, + "step": 770 + }, + { + "epoch": 0.28461999907710767, + "grad_norm": 8.5625, + "kl": 0.4121088981628418, + "learning_rate": 8.264993732202094e-06, + "logits/chosen": 2757693755.076923, + "logits/rejected": 2028119309.4736843, + "logps/chosen": -253.33710186298077, + "logps/rejected": -377.8335731907895, + "loss": 0.083, + "rewards/chosen": 2.3260924999530497, + "rewards/margins": 8.270744856552557, + "rewards/rejected": -5.944652356599507, + "step": 771 + }, + { + "epoch": 0.28498915601495084, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.260534158887878e-06, + "logits/chosen": 1885598378.6666667, + "logits/rejected": 1636342723.764706, + "logps/chosen": -239.49612630208333, + "logps/rejected": -550.2908432904412, + "loss": 0.1687, + "rewards/chosen": 0.8601183573404948, + "rewards/margins": 9.799769846598307, + "rewards/rejected": -8.939651489257812, + "step": 772 + }, + { + "epoch": 0.28535831295279407, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.256070068145009e-06, + "logits/chosen": 1815894144.0, + "logits/rejected": 2124727552.0, + "logps/chosen": -331.2087707519531, + "logps/rejected": -452.85400390625, + "loss": 0.1587, + "rewards/chosen": 1.2800062894821167, + "rewards/margins": 8.432247996330261, + "rewards/rejected": -7.1522417068481445, + "step": 773 + }, + { + "epoch": 0.28572746989063724, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.251601466158428e-06, + "logits/chosen": 2008872576.0, + "logits/rejected": 1761766784.0, + "logps/chosen": -242.1033172607422, + "logps/rejected": -635.6817626953125, + "loss": 0.106, + "rewards/chosen": 2.5335941314697266, + "rewards/margins": 12.11904525756836, + "rewards/rejected": -9.585451126098633, + "step": 774 + }, + { + "epoch": 0.28609662682848047, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 8.247128359119326e-06, + "logits/chosen": 1794399872.0, + "logits/rejected": 1635362816.0, + "logps/chosen": -331.3878173828125, + "logps/rejected": -544.0890502929688, + "loss": 0.1684, + "rewards/chosen": 1.1966028213500977, + "rewards/margins": 9.600072860717773, + "rewards/rejected": -8.403470039367676, + "step": 775 + }, + { + "epoch": 0.28646578376632365, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 8.242650753225137e-06, + "logits/chosen": 1904461414.4, + "logits/rejected": 1611770709.3333333, + "logps/chosen": -273.80771484375, + "logps/rejected": -443.1193033854167, + "loss": 0.2077, + "rewards/chosen": 1.1953511238098145, + "rewards/margins": 8.147849241892498, + "rewards/rejected": -6.952498118082683, + "step": 776 + }, + { + "epoch": 0.2868349407041669, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 8.238168654679528e-06, + "logits/chosen": 1889235259.0769231, + "logits/rejected": 1862129125.0526316, + "logps/chosen": -281.66562124399036, + "logps/rejected": -474.93251439144734, + "loss": 0.1539, + "rewards/chosen": 0.9561989124004657, + "rewards/margins": 9.507205831859757, + "rewards/rejected": -8.551006919459292, + "step": 777 + }, + { + "epoch": 0.28720409764201005, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 8.233682069692388e-06, + "logits/chosen": 1260089990.7368422, + "logits/rejected": 2550590857.8461537, + "logps/chosen": -241.13666735197367, + "logps/rejected": -581.9029071514423, + "loss": 0.2017, + "rewards/chosen": 0.9727998030813116, + "rewards/margins": 8.633146787944593, + "rewards/rejected": -7.660346984863281, + "step": 778 + }, + { + "epoch": 0.2875732545798533, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.229191004479825e-06, + "logits/chosen": 1650074965.3333333, + "logits/rejected": 1767265718.857143, + "logps/chosen": -238.38064236111111, + "logps/rejected": -570.4975934709821, + "loss": 0.1762, + "rewards/chosen": 1.0989985995822482, + "rewards/margins": 9.829411763993521, + "rewards/rejected": -8.730413164411273, + "step": 779 + }, + { + "epoch": 0.28794241151769645, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 8.22469546526415e-06, + "logits/chosen": 1627609600.0, + "logits/rejected": 1682386560.0, + "logps/chosen": -306.8687744140625, + "logps/rejected": -530.6214599609375, + "loss": 0.1753, + "rewards/chosen": 0.9425001740455627, + "rewards/margins": 9.181773245334625, + "rewards/rejected": -8.239273071289062, + "step": 780 + }, + { + "epoch": 0.2883115684555397, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.220195458273879e-06, + "logits/chosen": 1872500248.3809524, + "logits/rejected": 2260174103.2727275, + "logps/chosen": -189.82042875744048, + "logps/rejected": -349.0494495738636, + "loss": 0.2311, + "rewards/chosen": 0.9272289276123047, + "rewards/margins": 6.977015581997958, + "rewards/rejected": -6.049786654385653, + "step": 781 + }, + { + "epoch": 0.28868072539338285, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.21569098974371e-06, + "logits/chosen": 2099915434.6666667, + "logits/rejected": 2713975868.2352943, + "logps/chosen": -264.8400390625, + "logps/rejected": -515.2446001838235, + "loss": 0.156, + "rewards/chosen": 0.9563973108927409, + "rewards/margins": 8.259683695026473, + "rewards/rejected": -7.303286384133732, + "step": 782 + }, + { + "epoch": 0.2890498823312261, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 8.211182065914531e-06, + "logits/chosen": 1855494912.0, + "logits/rejected": 1574805120.0, + "logps/chosen": -261.7393798828125, + "logps/rejected": -497.7912902832031, + "loss": 0.1779, + "rewards/chosen": 1.088707685470581, + "rewards/margins": 8.731104612350464, + "rewards/rejected": -7.642396926879883, + "step": 783 + }, + { + "epoch": 0.28941903926906926, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 8.206668693033399e-06, + "logits/chosen": 2046985344.0, + "logits/rejected": 1863873536.0, + "logps/chosen": -302.32244873046875, + "logps/rejected": -480.84954833984375, + "loss": 0.1591, + "rewards/chosen": 1.0393097400665283, + "rewards/margins": 9.838927507400513, + "rewards/rejected": -8.799617767333984, + "step": 784 + }, + { + "epoch": 0.2897881962069125, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 8.202150877353533e-06, + "logits/chosen": 1967445760.0, + "logits/rejected": 1730024448.0, + "logps/chosen": -298.3865661621094, + "logps/rejected": -527.1275634765625, + "loss": 0.1651, + "rewards/chosen": 1.0504655838012695, + "rewards/margins": 11.105935096740723, + "rewards/rejected": -10.055469512939453, + "step": 785 + }, + { + "epoch": 0.29015735314475566, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 8.197628625134306e-06, + "logits/chosen": 1962916864.0, + "logits/rejected": 1984057728.0, + "logps/chosen": -297.5548400878906, + "logps/rejected": -582.4058227539062, + "loss": 0.1215, + "rewards/chosen": 1.6408048868179321, + "rewards/margins": 9.789620995521545, + "rewards/rejected": -8.148816108703613, + "step": 786 + }, + { + "epoch": 0.2905265100825989, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 8.193101942641248e-06, + "logits/chosen": 1613820586.6666667, + "logits/rejected": 1536797696.0, + "logps/chosen": -280.4144287109375, + "logps/rejected": -612.351123046875, + "loss": 0.0981, + "rewards/chosen": 1.8199532826741536, + "rewards/margins": 18.205771764119465, + "rewards/rejected": -16.385818481445312, + "step": 787 + }, + { + "epoch": 0.29089566702044206, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 8.188570836146015e-06, + "logits/chosen": 1538041976.4705882, + "logits/rejected": 2477161949.866667, + "logps/chosen": -284.44226792279414, + "logps/rejected": -559.7286458333333, + "loss": 0.1468, + "rewards/chosen": 1.5848420087028952, + "rewards/margins": 8.970545540603936, + "rewards/rejected": -7.385703531901042, + "step": 788 + }, + { + "epoch": 0.2912648239582853, + "grad_norm": 12.8125, + "kl": 0.11930704116821289, + "learning_rate": 8.184035311926397e-06, + "logits/chosen": 2058395921.0666666, + "logits/rejected": 2079244528.9411764, + "logps/chosen": -343.15247395833336, + "logps/rejected": -634.4252642463235, + "loss": 0.162, + "rewards/chosen": 0.9836466471354167, + "rewards/margins": 9.543459305108764, + "rewards/rejected": -8.559812657973346, + "step": 789 + }, + { + "epoch": 0.29163398089612846, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 8.17949537626631e-06, + "logits/chosen": 1745475111.3846154, + "logits/rejected": 1933801579.7894738, + "logps/chosen": -283.62252103365387, + "logps/rejected": -501.8071546052632, + "loss": 0.1124, + "rewards/chosen": 1.8299647111159105, + "rewards/margins": 9.434399075836305, + "rewards/rejected": -7.604434364720395, + "step": 790 + }, + { + "epoch": 0.2920031378339717, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.174951035455772e-06, + "logits/chosen": 1403702980.9230769, + "logits/rejected": 1370635425.6842105, + "logps/chosen": -254.3580603966346, + "logps/rejected": -453.39766652960526, + "loss": 0.1313, + "rewards/chosen": 1.5636182931753306, + "rewards/margins": 8.556041902858718, + "rewards/rejected": -6.992423609683388, + "step": 791 + }, + { + "epoch": 0.29237229477181487, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 8.170402295790913e-06, + "logits/chosen": 1577094940.4444444, + "logits/rejected": 1511763968.0, + "logps/chosen": -272.1066623263889, + "logps/rejected": -474.45717075892856, + "loss": 0.1887, + "rewards/chosen": 1.3187084197998047, + "rewards/margins": 9.258815492902484, + "rewards/rejected": -7.940107073102679, + "step": 792 + }, + { + "epoch": 0.2927414517096581, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.165849163573954e-06, + "logits/chosen": 2107981824.0, + "logits/rejected": 1416503588.5714285, + "logps/chosen": -319.99354383680554, + "logps/rejected": -503.9273158482143, + "loss": 0.2151, + "rewards/chosen": 0.8559544351365831, + "rewards/margins": 8.393319939810132, + "rewards/rejected": -7.5373655046735495, + "step": 793 + }, + { + "epoch": 0.29311060864750127, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.161291645113198e-06, + "logits/chosen": 2130477283.5555556, + "logits/rejected": 2515419721.142857, + "logps/chosen": -324.08124457465277, + "logps/rejected": -680.0563616071429, + "loss": 0.1846, + "rewards/chosen": 1.0262871848212347, + "rewards/margins": 10.007030048067609, + "rewards/rejected": -8.980742863246373, + "step": 794 + }, + { + "epoch": 0.29347976558534444, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 8.156729746723034e-06, + "logits/chosen": 1590843776.0, + "logits/rejected": 1275528064.0, + "logps/chosen": -307.7215270996094, + "logps/rejected": -440.2825012207031, + "loss": 0.2022, + "rewards/chosen": 0.7474891543388367, + "rewards/margins": 6.7448737025260925, + "rewards/rejected": -5.997384548187256, + "step": 795 + }, + { + "epoch": 0.2938489225231877, + "grad_norm": 11.1875, + "kl": 0.2413802146911621, + "learning_rate": 8.15216347472391e-06, + "logits/chosen": 1804501307.0769231, + "logits/rejected": 2096511946.1052632, + "logps/chosen": -246.74275090144232, + "logps/rejected": -427.66737767269734, + "loss": 0.1506, + "rewards/chosen": 0.8671428240262545, + "rewards/margins": 8.057019982743359, + "rewards/rejected": -7.189877158717105, + "step": 796 + }, + { + "epoch": 0.29421807946103085, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 8.14759283544234e-06, + "logits/chosen": 1894426477.7142856, + "logits/rejected": 2007614350.2222223, + "logps/chosen": -335.78982979910717, + "logps/rejected": -490.38368055555554, + "loss": 0.0905, + "rewards/chosen": 2.523394993373326, + "rewards/margins": 10.30474841405475, + "rewards/rejected": -7.781353420681423, + "step": 797 + }, + { + "epoch": 0.2945872363988741, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 8.143017835210886e-06, + "logits/chosen": 1871879246.7692308, + "logits/rejected": 1869968869.0526316, + "logps/chosen": -294.62950721153845, + "logps/rejected": -540.0745785361842, + "loss": 0.1127, + "rewards/chosen": 1.684869619516226, + "rewards/margins": 9.496207542264992, + "rewards/rejected": -7.811337922748766, + "step": 798 + }, + { + "epoch": 0.29495639333671725, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 8.138438480368153e-06, + "logits/chosen": 1932779038.1176472, + "logits/rejected": 1321326182.4, + "logps/chosen": -311.02478745404414, + "logps/rejected": -550.4051432291667, + "loss": 0.1587, + "rewards/chosen": 1.3772319344913257, + "rewards/margins": 10.33872994067622, + "rewards/rejected": -8.961498006184895, + "step": 799 + }, + { + "epoch": 0.2953255502745605, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 8.133854777258779e-06, + "logits/chosen": 1410485589.3333333, + "logits/rejected": 2007218778.3529413, + "logps/chosen": -315.62060546875, + "logps/rejected": -498.88493795955884, + "loss": 0.14, + "rewards/chosen": 1.6581541697184246, + "rewards/margins": 10.358503865260703, + "rewards/rejected": -8.700349695542279, + "step": 800 + }, + { + "epoch": 0.29569470721240365, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.129266732233427e-06, + "logits/chosen": 1715517560.4705882, + "logits/rejected": 1721836202.6666667, + "logps/chosen": -329.07146139705884, + "logps/rejected": -525.2569661458333, + "loss": 0.1602, + "rewards/chosen": 1.6880555994370405, + "rewards/margins": 8.610727616852405, + "rewards/rejected": -6.922672017415365, + "step": 801 + }, + { + "epoch": 0.2960638641502469, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 8.124674351648773e-06, + "logits/chosen": 1967045339.4285715, + "logits/rejected": 2015654343.1111112, + "logps/chosen": -285.47964913504467, + "logps/rejected": -555.0106336805555, + "loss": 0.1641, + "rewards/chosen": 0.7254183632986886, + "rewards/margins": 9.31349718003046, + "rewards/rejected": -8.588078816731771, + "step": 802 + }, + { + "epoch": 0.29643302108809005, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.120077641867506e-06, + "logits/chosen": 2209987515.733333, + "logits/rejected": 2046963952.9411764, + "logps/chosen": -266.27252604166665, + "logps/rejected": -375.4507697610294, + "loss": 0.1433, + "rewards/chosen": 1.7118882497151693, + "rewards/margins": 7.565687508676566, + "rewards/rejected": -5.853799258961397, + "step": 803 + }, + { + "epoch": 0.2968021780259333, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.115476609258303e-06, + "logits/chosen": 1672854949.6470587, + "logits/rejected": 2294377676.8, + "logps/chosen": -279.0005744485294, + "logps/rejected": -457.2577799479167, + "loss": 0.1472, + "rewards/chosen": 1.9660868925206803, + "rewards/margins": 8.950609394148284, + "rewards/rejected": -6.9845225016276045, + "step": 804 + }, + { + "epoch": 0.29717133496377646, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 8.110871260195843e-06, + "logits/chosen": 1355694923.2941177, + "logits/rejected": 1886667161.6, + "logps/chosen": -262.04664522058823, + "logps/rejected": -330.3739908854167, + "loss": 0.1819, + "rewards/chosen": 1.5579574809354895, + "rewards/margins": 6.109295168109969, + "rewards/rejected": -4.551337687174479, + "step": 805 + }, + { + "epoch": 0.2975404919016197, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 8.106261601060773e-06, + "logits/chosen": 1960393581.7142856, + "logits/rejected": 1478986638.2222223, + "logps/chosen": -376.36094447544644, + "logps/rejected": -393.79554578993054, + "loss": 0.1636, + "rewards/chosen": 0.8501204081944057, + "rewards/margins": 7.2051483487326005, + "rewards/rejected": -6.355027940538195, + "step": 806 + }, + { + "epoch": 0.29790964883946286, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.10164763823972e-06, + "logits/chosen": 1776711559.5294118, + "logits/rejected": 2229499767.4666667, + "logps/chosen": -144.30712890625, + "logps/rejected": -537.4246419270834, + "loss": 0.1197, + "rewards/chosen": 1.7147829392377067, + "rewards/margins": 9.895915446561926, + "rewards/rejected": -8.181132507324218, + "step": 807 + }, + { + "epoch": 0.2982788057773061, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 8.097029378125269e-06, + "logits/chosen": 1782969344.0, + "logits/rejected": 1916704085.3333333, + "logps/chosen": -345.97011021205356, + "logps/rejected": -546.0418836805555, + "loss": 0.1453, + "rewards/chosen": 1.2972239085606165, + "rewards/margins": 7.867170197623117, + "rewards/rejected": -6.5699462890625, + "step": 808 + }, + { + "epoch": 0.29864796271514926, + "grad_norm": 14.5625, + "kl": 0.4725308418273926, + "learning_rate": 8.092406827115964e-06, + "logits/chosen": 1916424045.7142856, + "logits/rejected": 2491431822.2222223, + "logps/chosen": -312.70263671875, + "logps/rejected": -383.3122829861111, + "loss": 0.1935, + "rewards/chosen": 0.5758002826145717, + "rewards/margins": 6.934301716940744, + "rewards/rejected": -6.358501434326172, + "step": 809 + }, + { + "epoch": 0.2990171196529925, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 8.087779991616287e-06, + "logits/chosen": 2305710834.5263157, + "logits/rejected": 1833789597.5384614, + "logps/chosen": -352.52518503289474, + "logps/rejected": -574.2116887019231, + "loss": 0.2213, + "rewards/chosen": 0.8366149099249589, + "rewards/margins": 7.839409028952904, + "rewards/rejected": -7.002794119027945, + "step": 810 + }, + { + "epoch": 0.29938627659083566, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 8.083148878036662e-06, + "logits/chosen": 2242518812.4444447, + "logits/rejected": 1754433097.142857, + "logps/chosen": -306.30088975694446, + "logps/rejected": -525.7042759486607, + "loss": 0.2172, + "rewards/chosen": 1.0050924089219835, + "rewards/margins": 7.88835558815608, + "rewards/rejected": -6.883263179234096, + "step": 811 + }, + { + "epoch": 0.2997554335286789, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 8.078513492793438e-06, + "logits/chosen": 2229952785.0666666, + "logits/rejected": 1723739678.1176472, + "logps/chosen": -256.3374837239583, + "logps/rejected": -398.6739717371324, + "loss": 0.1617, + "rewards/chosen": 1.688958994547526, + "rewards/margins": 7.8996254266477095, + "rewards/rejected": -6.210666432100184, + "step": 812 + }, + { + "epoch": 0.30012459046652207, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.073873842308882e-06, + "logits/chosen": 1441944507.7333333, + "logits/rejected": 1432007378.8235295, + "logps/chosen": -244.43362630208333, + "logps/rejected": -509.1650965073529, + "loss": 0.1445, + "rewards/chosen": 1.368047332763672, + "rewards/margins": 11.12289513980641, + "rewards/rejected": -9.754847807042738, + "step": 813 + }, + { + "epoch": 0.3004937474043653, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 8.06922993301117e-06, + "logits/chosen": 1783088696.8888888, + "logits/rejected": 1600717385.142857, + "logps/chosen": -287.3686794704861, + "logps/rejected": -470.32212611607144, + "loss": 0.1665, + "rewards/chosen": 1.4139023886786566, + "rewards/margins": 7.958555599999806, + "rewards/rejected": -6.544653211321149, + "step": 814 + }, + { + "epoch": 0.30086290434220847, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 8.06458177133438e-06, + "logits/chosen": 1862189686.1538463, + "logits/rejected": 1446147772.631579, + "logps/chosen": -336.5436448317308, + "logps/rejected": -431.9814453125, + "loss": 0.1496, + "rewards/chosen": 0.794594251192533, + "rewards/margins": 7.943976189926085, + "rewards/rejected": -7.149381938733552, + "step": 815 + }, + { + "epoch": 0.3012320612800517, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 8.059929363718482e-06, + "logits/chosen": 1840019797.3333333, + "logits/rejected": 1429795693.7142856, + "logps/chosen": -299.02704535590277, + "logps/rejected": -445.1821986607143, + "loss": 0.1649, + "rewards/chosen": 1.6504359775119357, + "rewards/margins": 8.557581341455853, + "rewards/rejected": -6.9071453639439175, + "step": 816 + }, + { + "epoch": 0.3016012182178949, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.055272716609325e-06, + "logits/chosen": 1079479247.2380953, + "logits/rejected": 1137328128.0, + "logps/chosen": -266.5591750372024, + "logps/rejected": -350.0929509943182, + "loss": 0.1932, + "rewards/chosen": 1.4339239029657274, + "rewards/margins": 7.15955943153018, + "rewards/rejected": -5.725635528564453, + "step": 817 + }, + { + "epoch": 0.3019703751557381, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.050611836458638e-06, + "logits/chosen": 1612715918.2222223, + "logits/rejected": 1363407579.4285715, + "logps/chosen": -285.65643988715277, + "logps/rejected": -465.4964076450893, + "loss": 0.1736, + "rewards/chosen": 1.238120608859592, + "rewards/margins": 8.374668242439391, + "rewards/rejected": -7.1365476335797995, + "step": 818 + }, + { + "epoch": 0.3023395320935813, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 8.045946729724006e-06, + "logits/chosen": 1880778428.631579, + "logits/rejected": 2447008059.076923, + "logps/chosen": -343.3036852384868, + "logps/rejected": -408.2204026442308, + "loss": 0.2144, + "rewards/chosen": 1.2783741198087994, + "rewards/margins": 7.553862166308199, + "rewards/rejected": -6.275488046499399, + "step": 819 + }, + { + "epoch": 0.3027086890314245, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 8.041277402868881e-06, + "logits/chosen": 2307380565.3333335, + "logits/rejected": 2018702998.5882354, + "logps/chosen": -284.2231770833333, + "logps/rejected": -661.1852022058823, + "loss": 0.1459, + "rewards/chosen": 1.295455805460612, + "rewards/margins": 9.093158370373295, + "rewards/rejected": -7.797702564912684, + "step": 820 + }, + { + "epoch": 0.3030778459692677, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 8.036603862362553e-06, + "logits/chosen": 1566930944.0, + "logits/rejected": 1995055826.8235295, + "logps/chosen": -259.1030598958333, + "logps/rejected": -517.7414981617648, + "loss": 0.186, + "rewards/chosen": 0.8430644989013671, + "rewards/margins": 9.743824027566347, + "rewards/rejected": -8.900759528664981, + "step": 821 + }, + { + "epoch": 0.3034470029071109, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 8.031926114680153e-06, + "logits/chosen": 1599134866.2857144, + "logits/rejected": 1836644238.2222223, + "logps/chosen": -302.35414341517856, + "logps/rejected": -446.29462348090277, + "loss": 0.16, + "rewards/chosen": 1.1045301301138741, + "rewards/margins": 7.215257228366912, + "rewards/rejected": -6.110727098253038, + "step": 822 + }, + { + "epoch": 0.3038161598449541, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 8.027244166302641e-06, + "logits/chosen": 1522726754.4615386, + "logits/rejected": 1644102602.1052632, + "logps/chosen": -306.7819260817308, + "logps/rejected": -423.27073910361844, + "loss": 0.1398, + "rewards/chosen": 1.3881590916560247, + "rewards/margins": 8.87909617404706, + "rewards/rejected": -7.490937082391036, + "step": 823 + }, + { + "epoch": 0.3041853167827973, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 8.022558023716799e-06, + "logits/chosen": 1832794955.2941177, + "logits/rejected": 2229187379.2, + "logps/chosen": -296.2141544117647, + "logps/rejected": -545.9464192708333, + "loss": 0.1632, + "rewards/chosen": 1.3795812270220589, + "rewards/margins": 9.726521839815028, + "rewards/rejected": -8.346940612792968, + "step": 824 + }, + { + "epoch": 0.3045544737206405, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 8.017867693415214e-06, + "logits/chosen": 1346095206.4, + "logits/rejected": 1339008093.090909, + "logps/chosen": -194.44639892578124, + "logps/rejected": -450.3898259943182, + "loss": 0.1156, + "rewards/chosen": 1.4361581802368164, + "rewards/margins": 8.218214121731844, + "rewards/rejected": -6.782055941495028, + "step": 825 + }, + { + "epoch": 0.3049236306584837, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 8.013173181896283e-06, + "logits/chosen": 1727281971.2, + "logits/rejected": 2156776448.0, + "logps/chosen": -269.996630859375, + "logps/rejected": -474.599609375, + "loss": 0.1871, + "rewards/chosen": 1.3075427055358886, + "rewards/margins": 8.960203711191813, + "rewards/rejected": -7.652661005655925, + "step": 826 + }, + { + "epoch": 0.3052927875963269, + "grad_norm": 10.4375, + "kl": 0.501070499420166, + "learning_rate": 8.008474495664189e-06, + "logits/chosen": 1383627161.6, + "logits/rejected": 2093605345.8823528, + "logps/chosen": -239.06378580729168, + "logps/rejected": -470.8046300551471, + "loss": 0.1523, + "rewards/chosen": 1.244427490234375, + "rewards/margins": 8.324596719180837, + "rewards/rejected": -7.0801692289464615, + "step": 827 + }, + { + "epoch": 0.3056619445341701, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 8.003771641228905e-06, + "logits/chosen": 1466680506.1818182, + "logits/rejected": 1519476540.952381, + "logps/chosen": -263.35595703125, + "logps/rejected": -413.21805245535717, + "loss": 0.0956, + "rewards/chosen": 1.7336137945001775, + "rewards/margins": 8.510508500136337, + "rewards/rejected": -6.776894705636161, + "step": 828 + }, + { + "epoch": 0.3060311014720133, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 7.999064625106174e-06, + "logits/chosen": 1780798464.0, + "logits/rejected": 1442287820.8, + "logps/chosen": -311.51849365234375, + "logps/rejected": -468.090087890625, + "loss": 0.1556, + "rewards/chosen": 0.6209770838419596, + "rewards/margins": 7.846309725443523, + "rewards/rejected": -7.225332641601563, + "step": 829 + }, + { + "epoch": 0.30640025840985646, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 7.994353453817508e-06, + "logits/chosen": 1567426304.0, + "logits/rejected": 1793480192.0, + "logps/chosen": -272.9121398925781, + "logps/rejected": -472.41534423828125, + "loss": 0.1434, + "rewards/chosen": 1.765087604522705, + "rewards/margins": 8.643160343170166, + "rewards/rejected": -6.878072738647461, + "step": 830 + }, + { + "epoch": 0.3067694153476997, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.989638133890174e-06, + "logits/chosen": 1846025830.4, + "logits/rejected": 1813832797.090909, + "logps/chosen": -278.8658203125, + "logps/rejected": -466.7374378551136, + "loss": 0.1349, + "rewards/chosen": 0.49846348762512205, + "rewards/margins": 7.225576691194013, + "rewards/rejected": -6.727113203568892, + "step": 831 + }, + { + "epoch": 0.30713857228554287, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.984918671857189e-06, + "logits/chosen": 1748837841.4545455, + "logits/rejected": 1713967104.0, + "logps/chosen": -343.68430397727275, + "logps/rejected": -489.34351748511904, + "loss": 0.1482, + "rewards/chosen": 0.45304398103193805, + "rewards/margins": 9.26828620340917, + "rewards/rejected": -8.815242222377233, + "step": 832 + }, + { + "epoch": 0.3075077292233861, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 7.980195074257307e-06, + "logits/chosen": 1777100913.7777777, + "logits/rejected": 1371332900.5714285, + "logps/chosen": -311.48670789930554, + "logps/rejected": -501.6962890625, + "loss": 0.2414, + "rewards/chosen": 0.5426774024963379, + "rewards/margins": 7.749609470367432, + "rewards/rejected": -7.206932067871094, + "step": 833 + }, + { + "epoch": 0.30787688616122927, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 7.975467347635012e-06, + "logits/chosen": 2009840896.0, + "logits/rejected": 2519991040.0, + "logps/chosen": -385.7723083496094, + "logps/rejected": -499.6424560546875, + "loss": 0.2121, + "rewards/chosen": 0.44561702013015747, + "rewards/margins": 9.27849131822586, + "rewards/rejected": -8.832874298095703, + "step": 834 + }, + { + "epoch": 0.3082460430990725, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 7.97073549854051e-06, + "logits/chosen": 1729585044.2105262, + "logits/rejected": 2396147396.923077, + "logps/chosen": -279.35570004111844, + "logps/rejected": -504.0295973557692, + "loss": 0.1944, + "rewards/chosen": 1.1528539155658923, + "rewards/margins": 9.089363268029835, + "rewards/rejected": -7.9365093524639425, + "step": 835 + }, + { + "epoch": 0.30861520003691567, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 7.965999533529718e-06, + "logits/chosen": 1602250752.0, + "logits/rejected": 1625722880.0, + "logps/chosen": -312.43860677083336, + "logps/rejected": -445.0725528492647, + "loss": 0.1557, + "rewards/chosen": 0.9785322825113932, + "rewards/margins": 8.30315673678529, + "rewards/rejected": -7.324624454273897, + "step": 836 + }, + { + "epoch": 0.3089843569747589, + "grad_norm": 13.5, + "kl": 0.8632402420043945, + "learning_rate": 7.961259459164254e-06, + "logits/chosen": 2449136298.6666665, + "logits/rejected": 1711760530.2857144, + "logps/chosen": -302.5050455729167, + "logps/rejected": -364.52064732142856, + "loss": 0.1564, + "rewards/chosen": 1.5189595752292209, + "rewards/margins": 7.422852092319065, + "rewards/rejected": -5.903892517089844, + "step": 837 + }, + { + "epoch": 0.3093535139126021, + "grad_norm": 8.625, + "kl": 0.1173563003540039, + "learning_rate": 7.956515282011434e-06, + "logits/chosen": 1912410726.4, + "logits/rejected": 1991198161.4545455, + "logps/chosen": -261.2768310546875, + "logps/rejected": -506.15598366477275, + "loss": 0.1034, + "rewards/chosen": 1.3286043167114259, + "rewards/margins": 9.454141148653896, + "rewards/rejected": -8.12553683194247, + "step": 838 + }, + { + "epoch": 0.3097226708504453, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.951767008644251e-06, + "logits/chosen": 1904186853.0526316, + "logits/rejected": 2634485760.0, + "logps/chosen": -304.5488024259868, + "logps/rejected": -420.5614483173077, + "loss": 0.1538, + "rewards/chosen": 1.6742320813630756, + "rewards/margins": 8.151954249331826, + "rewards/rejected": -6.47772216796875, + "step": 839 + }, + { + "epoch": 0.3100918277882885, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.94701464564138e-06, + "logits/chosen": 1815226624.0, + "logits/rejected": 1394356096.0, + "logps/chosen": -254.62367248535156, + "logps/rejected": -513.35986328125, + "loss": 0.1686, + "rewards/chosen": 1.2715548276901245, + "rewards/margins": 8.247064471244812, + "rewards/rejected": -6.9755096435546875, + "step": 840 + }, + { + "epoch": 0.3104609847261317, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.942258199587158e-06, + "logits/chosen": 1590144682.6666667, + "logits/rejected": 2149934811.428571, + "logps/chosen": -234.60926649305554, + "logps/rejected": -454.6900111607143, + "loss": 0.1761, + "rewards/chosen": 1.2197287877400715, + "rewards/margins": 8.611499286833263, + "rewards/rejected": -7.391770499093192, + "step": 841 + }, + { + "epoch": 0.3108301416639749, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 7.937497677071583e-06, + "logits/chosen": 1759271680.0, + "logits/rejected": 1243496704.0, + "logps/chosen": -346.13360595703125, + "logps/rejected": -441.9634704589844, + "loss": 0.1619, + "rewards/chosen": 1.4510995149612427, + "rewards/margins": 9.325560688972473, + "rewards/rejected": -7.8744611740112305, + "step": 842 + }, + { + "epoch": 0.3111992986018181, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 7.932733084690296e-06, + "logits/chosen": 2695615283.2, + "logits/rejected": 2047527273.4117646, + "logps/chosen": -242.93645833333332, + "logps/rejected": -499.9755284926471, + "loss": 0.154, + "rewards/chosen": 1.401041030883789, + "rewards/margins": 9.020968829884248, + "rewards/rejected": -7.619927799000459, + "step": 843 + }, + { + "epoch": 0.3115684555396613, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.92796442904458e-06, + "logits/chosen": 1695778816.0, + "logits/rejected": 2190669824.0, + "logps/chosen": -221.9394287109375, + "logps/rejected": -522.3164469401041, + "loss": 0.2014, + "rewards/chosen": 1.383891487121582, + "rewards/margins": 9.715642992655436, + "rewards/rejected": -8.331751505533854, + "step": 844 + }, + { + "epoch": 0.3119376124775045, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.923191716741348e-06, + "logits/chosen": 1731405092.5714285, + "logits/rejected": 1420649472.0, + "logps/chosen": -256.72764369419644, + "logps/rejected": -493.5515407986111, + "loss": 0.125, + "rewards/chosen": 1.5650439943586076, + "rewards/margins": 9.874180279080829, + "rewards/rejected": -8.309136284722221, + "step": 845 + }, + { + "epoch": 0.3123067694153477, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.91841495439313e-06, + "logits/chosen": 2037930752.0, + "logits/rejected": 1181810432.0, + "logps/chosen": -313.9730224609375, + "logps/rejected": -426.9949951171875, + "loss": 0.117, + "rewards/chosen": 1.6640669107437134, + "rewards/margins": 9.747782349586487, + "rewards/rejected": -8.083715438842773, + "step": 846 + }, + { + "epoch": 0.3126759263531909, + "grad_norm": 12.0625, + "kl": 1.4579029083251953, + "learning_rate": 7.913634148618073e-06, + "logits/chosen": 2648354575.0588236, + "logits/rejected": 2590585651.2, + "logps/chosen": -299.03733915441177, + "logps/rejected": -597.1313151041667, + "loss": 0.1463, + "rewards/chosen": 1.845749350155101, + "rewards/margins": 10.176253703996247, + "rewards/rejected": -8.330504353841146, + "step": 847 + }, + { + "epoch": 0.3130450832910341, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.908849306039918e-06, + "logits/chosen": 1793597224.4210527, + "logits/rejected": 1987188578.4615386, + "logps/chosen": -211.62826377467104, + "logps/rejected": -452.23888221153845, + "loss": 0.1447, + "rewards/chosen": 1.88387037578382, + "rewards/margins": 9.43273019211495, + "rewards/rejected": -7.54885981633113, + "step": 848 + }, + { + "epoch": 0.3134142402288773, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.904060433288007e-06, + "logits/chosen": 1583498368.0, + "logits/rejected": 1967017984.0, + "logps/chosen": -345.40704345703125, + "logps/rejected": -496.48956298828125, + "loss": 0.153, + "rewards/chosen": 1.4391528367996216, + "rewards/margins": 8.345987915992737, + "rewards/rejected": -6.906835079193115, + "step": 849 + }, + { + "epoch": 0.3137833971667205, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 7.899267536997261e-06, + "logits/chosen": 1980651520.0, + "logits/rejected": 1958105088.0, + "logps/chosen": -283.6163330078125, + "logps/rejected": -461.9182400173611, + "loss": 0.1873, + "rewards/chosen": 0.6477373668125698, + "rewards/margins": 10.040383308652848, + "rewards/rejected": -9.392645941840279, + "step": 850 + }, + { + "epoch": 0.3141525541045637, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 7.894470623808176e-06, + "logits/chosen": 1569617646.9333334, + "logits/rejected": 1806889321.4117646, + "logps/chosen": -289.48268229166666, + "logps/rejected": -545.7815946691177, + "loss": 0.1845, + "rewards/chosen": 0.7846832911173502, + "rewards/margins": 8.969135123608158, + "rewards/rejected": -8.184451832490808, + "step": 851 + }, + { + "epoch": 0.3145217110424069, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 7.889669700366818e-06, + "logits/chosen": 2559108096.0, + "logits/rejected": 1608796672.0, + "logps/chosen": -310.5573425292969, + "logps/rejected": -481.6836853027344, + "loss": 0.1616, + "rewards/chosen": 1.2778233289718628, + "rewards/margins": 8.877211928367615, + "rewards/rejected": -7.599388599395752, + "step": 852 + }, + { + "epoch": 0.3148908679802501, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.884864773324802e-06, + "logits/chosen": 1995534882.1333334, + "logits/rejected": 1424070053.6470587, + "logps/chosen": -300.98522135416664, + "logps/rejected": -409.033203125, + "loss": 0.1691, + "rewards/chosen": 0.9390234629313151, + "rewards/margins": 7.236901503918218, + "rewards/rejected": -6.297878040986903, + "step": 853 + }, + { + "epoch": 0.3152600249180933, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.880055849339294e-06, + "logits/chosen": 1757543966.1176472, + "logits/rejected": 1802464324.2666667, + "logps/chosen": -269.2121151194853, + "logps/rejected": -487.03040364583336, + "loss": 0.2151, + "rewards/chosen": 0.9384696062873391, + "rewards/margins": 8.705585898605047, + "rewards/rejected": -7.767116292317708, + "step": 854 + }, + { + "epoch": 0.3156291818559365, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.875242935073e-06, + "logits/chosen": 1987477744.9411764, + "logits/rejected": 2268258577.0666666, + "logps/chosen": -234.45444623161765, + "logps/rejected": -569.7572916666667, + "loss": 0.1736, + "rewards/chosen": 1.1210244122673483, + "rewards/margins": 9.779857216629328, + "rewards/rejected": -8.65883280436198, + "step": 855 + }, + { + "epoch": 0.3159983387937797, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.870426037194146e-06, + "logits/chosen": 2192152439.4666667, + "logits/rejected": 1715654535.5294118, + "logps/chosen": -282.333203125, + "logps/rejected": -556.7888901654412, + "loss": 0.1325, + "rewards/chosen": 1.3768541971842447, + "rewards/margins": 8.7076467925427, + "rewards/rejected": -7.330792595358456, + "step": 856 + }, + { + "epoch": 0.3163674957316229, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.865605162376485e-06, + "logits/chosen": 2648439125.3333335, + "logits/rejected": 2109074432.0, + "logps/chosen": -255.0826416015625, + "logps/rejected": -610.97919921875, + "loss": 0.1604, + "rewards/chosen": 0.5252934296925863, + "rewards/margins": 9.420483096440634, + "rewards/rejected": -8.895189666748047, + "step": 857 + }, + { + "epoch": 0.3167366526694661, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 7.860780317299282e-06, + "logits/chosen": 1570806000.9411764, + "logits/rejected": 2079949482.6666667, + "logps/chosen": -317.6905158547794, + "logps/rejected": -551.1836588541667, + "loss": 0.2373, + "rewards/chosen": 0.40599890316233916, + "rewards/margins": 8.36161922380036, + "rewards/rejected": -7.955620320638021, + "step": 858 + }, + { + "epoch": 0.31710580960730933, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.855951508647295e-06, + "logits/chosen": 1765538884.2666667, + "logits/rejected": 1681512086.5882354, + "logps/chosen": -293.0960286458333, + "logps/rejected": -579.9281939338235, + "loss": 0.1577, + "rewards/chosen": 1.0512412389119465, + "rewards/margins": 10.966892448126101, + "rewards/rejected": -9.915651209214154, + "step": 859 + }, + { + "epoch": 0.3174749665451525, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.851118743110774e-06, + "logits/chosen": 1880508825.6, + "logits/rejected": 1363168798.1176472, + "logps/chosen": -259.2115234375, + "logps/rejected": -414.41096047794116, + "loss": 0.1456, + "rewards/chosen": 1.181839116414388, + "rewards/margins": 7.924150033090628, + "rewards/rejected": -6.7423109166762405, + "step": 860 + }, + { + "epoch": 0.31784412348299573, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 7.846282027385462e-06, + "logits/chosen": 1934284361.142857, + "logits/rejected": 1841744099.5555556, + "logps/chosen": -325.43289620535717, + "logps/rejected": -557.28515625, + "loss": 0.1751, + "rewards/chosen": 0.7499315398080009, + "rewards/margins": 9.445112644679963, + "rewards/rejected": -8.695181104871962, + "step": 861 + }, + { + "epoch": 0.3182132804208389, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.841441368172559e-06, + "logits/chosen": 1720026794.6666667, + "logits/rejected": 2204854476.8, + "logps/chosen": -285.39902750651044, + "logps/rejected": -625.31611328125, + "loss": 0.1504, + "rewards/chosen": 0.6516085465749105, + "rewards/margins": 9.230434687932332, + "rewards/rejected": -8.578826141357421, + "step": 862 + }, + { + "epoch": 0.31858243735868214, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 7.836596772178741e-06, + "logits/chosen": 1264582440.4210527, + "logits/rejected": 1422821218.4615386, + "logps/chosen": -264.1763466282895, + "logps/rejected": -508.7086838942308, + "loss": 0.1399, + "rewards/chosen": 1.973711515727796, + "rewards/margins": 9.603615455781883, + "rewards/rejected": -7.629903940054087, + "step": 863 + }, + { + "epoch": 0.3189515942965253, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.831748246116136e-06, + "logits/chosen": 1554434867.2, + "logits/rejected": 1591946541.1764705, + "logps/chosen": -308.05983072916666, + "logps/rejected": -436.9375, + "loss": 0.1631, + "rewards/chosen": 1.367718505859375, + "rewards/margins": 8.272011341768152, + "rewards/rejected": -6.904292835908778, + "step": 864 + }, + { + "epoch": 0.31932075123436854, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.826895796702311e-06, + "logits/chosen": 1430278451.2, + "logits/rejected": 1330676821.3333333, + "logps/chosen": -269.13447265625, + "logps/rejected": -535.0544026692709, + "loss": 0.1515, + "rewards/chosen": 1.753652000427246, + "rewards/margins": 12.64719778696696, + "rewards/rejected": -10.893545786539713, + "step": 865 + }, + { + "epoch": 0.3196899081722117, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.822039430660276e-06, + "logits/chosen": 1734446464.0, + "logits/rejected": 1881682688.0, + "logps/chosen": -254.1585693359375, + "logps/rejected": -429.14788818359375, + "loss": 0.175, + "rewards/chosen": 1.269014835357666, + "rewards/margins": 8.75364065170288, + "rewards/rejected": -7.484625816345215, + "step": 866 + }, + { + "epoch": 0.3200590651100549, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 7.817179154718463e-06, + "logits/chosen": 1960170496.0, + "logits/rejected": 1939534233.6, + "logps/chosen": -273.4120279947917, + "logps/rejected": -593.1083984375, + "loss": 0.136, + "rewards/chosen": 1.4268786112467449, + "rewards/margins": 9.494739786783853, + "rewards/rejected": -8.067861175537109, + "step": 867 + }, + { + "epoch": 0.3204282220478981, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.812314975610722e-06, + "logits/chosen": 1603350698.6666667, + "logits/rejected": 2280404992.0, + "logps/chosen": -323.5094807942708, + "logps/rejected": -501.25146484375, + "loss": 0.1175, + "rewards/chosen": 1.5535982449849446, + "rewards/margins": 10.187495644887289, + "rewards/rejected": -8.633897399902343, + "step": 868 + }, + { + "epoch": 0.3207973789857413, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.807446900076314e-06, + "logits/chosen": 1788359065.6, + "logits/rejected": 1520271872.0, + "logps/chosen": -290.7920166015625, + "logps/rejected": -464.9145914713542, + "loss": 0.2118, + "rewards/chosen": 1.0742314338684082, + "rewards/margins": 8.150565433502198, + "rewards/rejected": -7.076333999633789, + "step": 869 + }, + { + "epoch": 0.3211665359235845, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 7.802574934859894e-06, + "logits/chosen": 2170383291.733333, + "logits/rejected": 2172929927.529412, + "logps/chosen": -351.991015625, + "logps/rejected": -413.9040958180147, + "loss": 0.1894, + "rewards/chosen": 0.7881627400716146, + "rewards/margins": 6.570350706811045, + "rewards/rejected": -5.78218796673943, + "step": 870 + }, + { + "epoch": 0.3215356928614277, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.797699086711507e-06, + "logits/chosen": 1279808365.7142856, + "logits/rejected": 1880706730.6666667, + "logps/chosen": -274.23985072544644, + "logps/rejected": -457.2399088541667, + "loss": 0.1414, + "rewards/chosen": 1.2010801860264368, + "rewards/margins": 9.530923495216975, + "rewards/rejected": -8.329843309190538, + "step": 871 + }, + { + "epoch": 0.3219048497992709, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 7.792819362386581e-06, + "logits/chosen": 2444352275.6923075, + "logits/rejected": 2515995378.5263157, + "logps/chosen": -285.77723106971155, + "logps/rejected": -499.4429481907895, + "loss": 0.1614, + "rewards/chosen": 0.6998635071974534, + "rewards/margins": 8.135114040451976, + "rewards/rejected": -7.435250533254523, + "step": 872 + }, + { + "epoch": 0.3222740067371141, + "grad_norm": 13.75, + "kl": 0.2538723945617676, + "learning_rate": 7.78793576864591e-06, + "logits/chosen": 2022321212.235294, + "logits/rejected": 2103823701.3333333, + "logps/chosen": -361.94921875, + "logps/rejected": -485.2930013020833, + "loss": 0.1823, + "rewards/chosen": 1.061662898344152, + "rewards/margins": 8.366700333239987, + "rewards/rejected": -7.305037434895834, + "step": 873 + }, + { + "epoch": 0.3226431636749573, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.783048312255653e-06, + "logits/chosen": 1785292344.8888888, + "logits/rejected": 1668517595.4285715, + "logps/chosen": -360.0162760416667, + "logps/rejected": -456.99107142857144, + "loss": 0.1947, + "rewards/chosen": 1.303312725490994, + "rewards/margins": 8.372472641960023, + "rewards/rejected": -7.069159916469029, + "step": 874 + }, + { + "epoch": 0.3230123206128005, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.778156999987317e-06, + "logits/chosen": 2182021734.4, + "logits/rejected": 1793286485.3333333, + "logps/chosen": -290.0043212890625, + "logps/rejected": -436.28271484375, + "loss": 0.1927, + "rewards/chosen": 1.5033204078674316, + "rewards/margins": 8.524445565541585, + "rewards/rejected": -7.021125157674153, + "step": 875 + }, + { + "epoch": 0.3233814775506437, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.773261838617753e-06, + "logits/chosen": 1628498602.6666667, + "logits/rejected": 2125614381.1764705, + "logps/chosen": -260.090625, + "logps/rejected": -474.8009823069853, + "loss": 0.1342, + "rewards/chosen": 1.50275510152181, + "rewards/margins": 9.636059473075118, + "rewards/rejected": -8.133304371553308, + "step": 876 + }, + { + "epoch": 0.3237506344884869, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 7.768362834929146e-06, + "logits/chosen": 1198572885.3333333, + "logits/rejected": 1701367326.1176472, + "logps/chosen": -266.14222005208336, + "logps/rejected": -409.30612362132354, + "loss": 0.1193, + "rewards/chosen": 1.7067952473958334, + "rewards/margins": 8.17200341318168, + "rewards/rejected": -6.465208165785846, + "step": 877 + }, + { + "epoch": 0.32411979142633013, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 7.763459995709004e-06, + "logits/chosen": 2087007573.3333333, + "logits/rejected": 1960215347.2, + "logps/chosen": -333.076171875, + "logps/rejected": -416.950634765625, + "loss": 0.1421, + "rewards/chosen": 1.0731836954752605, + "rewards/margins": 8.53884531656901, + "rewards/rejected": -7.46566162109375, + "step": 878 + }, + { + "epoch": 0.3244889483641733, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.758553327750146e-06, + "logits/chosen": 2049599146.6666667, + "logits/rejected": 1973374566.4, + "logps/chosen": -306.4269612630208, + "logps/rejected": -515.5177734375, + "loss": 0.118, + "rewards/chosen": 1.1513862609863281, + "rewards/margins": 8.792795562744141, + "rewards/rejected": -7.641409301757813, + "step": 879 + }, + { + "epoch": 0.32485810530201653, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.753642837850698e-06, + "logits/chosen": 2004762168.8888888, + "logits/rejected": 1707413796.5714285, + "logps/chosen": -308.2451443142361, + "logps/rejected": -402.8251953125, + "loss": 0.1414, + "rewards/chosen": 1.7647307713826497, + "rewards/margins": 8.60278270358131, + "rewards/rejected": -6.838051932198661, + "step": 880 + }, + { + "epoch": 0.3252272622398597, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.748728532814087e-06, + "logits/chosen": 1987456236.3076923, + "logits/rejected": 1560770128.8421052, + "logps/chosen": -292.4107196514423, + "logps/rejected": -505.78818873355266, + "loss": 0.1189, + "rewards/chosen": 1.4480360471285307, + "rewards/margins": 9.257310820977215, + "rewards/rejected": -7.809274773848684, + "step": 881 + }, + { + "epoch": 0.32559641917770293, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 7.743810419449014e-06, + "logits/chosen": 1880472234.6666667, + "logits/rejected": 1317931287.2727273, + "logps/chosen": -306.9309895833333, + "logps/rejected": -313.5305841619318, + "loss": 0.2248, + "rewards/chosen": 1.0016085306803386, + "rewards/margins": 6.9947531729033505, + "rewards/rejected": -5.993144642223012, + "step": 882 + }, + { + "epoch": 0.3259655761155461, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.738888504569463e-06, + "logits/chosen": 1755457280.0, + "logits/rejected": 1859082624.0, + "logps/chosen": -282.9569091796875, + "logps/rejected": -484.5386962890625, + "loss": 0.1622, + "rewards/chosen": 1.0370687246322632, + "rewards/margins": 8.921853423118591, + "rewards/rejected": -7.884784698486328, + "step": 883 + }, + { + "epoch": 0.32633473305338934, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 7.733962794994689e-06, + "logits/chosen": 1581197507.047619, + "logits/rejected": 1897428061.090909, + "logps/chosen": -310.4190383184524, + "logps/rejected": -431.15163352272725, + "loss": 0.2255, + "rewards/chosen": 1.6414188203357516, + "rewards/margins": 8.294922238304501, + "rewards/rejected": -6.65350341796875, + "step": 884 + }, + { + "epoch": 0.3267038899912325, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 7.729033297549195e-06, + "logits/chosen": 1954659942.4, + "logits/rejected": 1719486277.8181818, + "logps/chosen": -291.13603515625, + "logps/rejected": -496.1344549005682, + "loss": 0.0977, + "rewards/chosen": 1.359393310546875, + "rewards/margins": 9.567505160245029, + "rewards/rejected": -8.208111849698154, + "step": 885 + }, + { + "epoch": 0.32707304692907574, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.724100019062739e-06, + "logits/chosen": 1463232619.7894738, + "logits/rejected": 2030309376.0, + "logps/chosen": -318.97977487664474, + "logps/rejected": -437.8596379206731, + "loss": 0.1802, + "rewards/chosen": 1.2761028691342002, + "rewards/margins": 8.760137480762806, + "rewards/rejected": -7.484034611628606, + "step": 886 + }, + { + "epoch": 0.3274422038669189, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 7.719162966370318e-06, + "logits/chosen": 1773986048.0, + "logits/rejected": 2529993472.0, + "logps/chosen": -309.1400146484375, + "logps/rejected": -520.2398071289062, + "loss": 0.1693, + "rewards/chosen": 0.8205697536468506, + "rewards/margins": 9.172093152999878, + "rewards/rejected": -8.351523399353027, + "step": 887 + }, + { + "epoch": 0.32781136080476214, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 7.714222146312151e-06, + "logits/chosen": 1725058141.090909, + "logits/rejected": 1580771523.047619, + "logps/chosen": -266.48970170454544, + "logps/rejected": -410.1572730654762, + "loss": 0.0769, + "rewards/chosen": 1.9124239141290837, + "rewards/margins": 8.947156551080349, + "rewards/rejected": -7.034732636951265, + "step": 888 + }, + { + "epoch": 0.3281805177426053, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.709277565733686e-06, + "logits/chosen": 2168832945.230769, + "logits/rejected": 1874427041.6842105, + "logps/chosen": -333.0720402644231, + "logps/rejected": -440.02451685855266, + "loss": 0.1308, + "rewards/chosen": 1.128370578472431, + "rewards/margins": 8.275280593377857, + "rewards/rejected": -7.146910014905427, + "step": 889 + }, + { + "epoch": 0.32854967468044854, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.704329231485576e-06, + "logits/chosen": 1901655040.0, + "logits/rejected": 2078494976.0, + "logps/chosen": -250.25067138671875, + "logps/rejected": -620.6270141601562, + "loss": 0.1702, + "rewards/chosen": 0.9859291911125183, + "rewards/margins": 10.087418258190155, + "rewards/rejected": -9.101489067077637, + "step": 890 + }, + { + "epoch": 0.3289188316182917, + "grad_norm": 12.625, + "kl": 0.6705756187438965, + "learning_rate": 7.699377150423673e-06, + "logits/chosen": 1901219960.4705882, + "logits/rejected": 1919831381.3333333, + "logps/chosen": -271.7267635569853, + "logps/rejected": -413.4860026041667, + "loss": 0.176, + "rewards/chosen": 1.210100061753217, + "rewards/margins": 7.839382695216759, + "rewards/rejected": -6.629282633463542, + "step": 891 + }, + { + "epoch": 0.32928798855613495, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 7.69442132940902e-06, + "logits/chosen": 1952762013.5384614, + "logits/rejected": 1612072421.0526316, + "logps/chosen": -256.74569936899036, + "logps/rejected": -579.6266961348684, + "loss": 0.1164, + "rewards/chosen": 1.5467969454251802, + "rewards/margins": 10.161844199485625, + "rewards/rejected": -8.615047254060444, + "step": 892 + }, + { + "epoch": 0.3296571454939781, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 7.689461775307852e-06, + "logits/chosen": 1823993514.6666667, + "logits/rejected": 1844002816.0, + "logps/chosen": -370.154541015625, + "logps/rejected": -476.57841796875, + "loss": 0.0902, + "rewards/chosen": 2.6579599380493164, + "rewards/margins": 9.21244831085205, + "rewards/rejected": -6.5544883728027346, + "step": 893 + }, + { + "epoch": 0.33002630243182135, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 7.684498494991562e-06, + "logits/chosen": 1833489261.7142856, + "logits/rejected": 2251695217.7777777, + "logps/chosen": -282.82137625558033, + "logps/rejected": -568.4927300347222, + "loss": 0.1838, + "rewards/chosen": 0.5352809088570731, + "rewards/margins": 8.542147363935198, + "rewards/rejected": -8.006866455078125, + "step": 894 + }, + { + "epoch": 0.3303954593696645, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.679531495336712e-06, + "logits/chosen": 1223900452.5714285, + "logits/rejected": 1317417756.4444444, + "logps/chosen": -280.4718540736607, + "logps/rejected": -508.79839409722223, + "loss": 0.1245, + "rewards/chosen": 1.3638677597045898, + "rewards/margins": 10.956251250372993, + "rewards/rejected": -9.592383490668404, + "step": 895 + }, + { + "epoch": 0.33076461630750775, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 7.674560783225018e-06, + "logits/chosen": 1593461930.6666667, + "logits/rejected": 1526610432.0, + "logps/chosen": -330.1383870442708, + "logps/rejected": -491.8966796875, + "loss": 0.1423, + "rewards/chosen": 0.7845905621846517, + "rewards/margins": 7.885630448659261, + "rewards/rejected": -7.101039886474609, + "step": 896 + }, + { + "epoch": 0.3311337732453509, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.669586365543342e-06, + "logits/chosen": 1434181511.5294118, + "logits/rejected": 1812982715.7333333, + "logps/chosen": -265.90593405330884, + "logps/rejected": -376.2314778645833, + "loss": 0.1248, + "rewards/chosen": 1.6857174144071692, + "rewards/margins": 8.931303046731388, + "rewards/rejected": -7.245585632324219, + "step": 897 + }, + { + "epoch": 0.33150293018319416, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.66460824918367e-06, + "logits/chosen": 1352985031.1111112, + "logits/rejected": 1459286747.4285715, + "logps/chosen": -284.2196994357639, + "logps/rejected": -536.5718819754464, + "loss": 0.1559, + "rewards/chosen": 1.8418197631835938, + "rewards/margins": 10.688632420131139, + "rewards/rejected": -8.846812656947545, + "step": 898 + }, + { + "epoch": 0.33187208712103733, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 7.659626441043125e-06, + "logits/chosen": 1665285558.857143, + "logits/rejected": 1617629184.0, + "logps/chosen": -248.09040178571428, + "logps/rejected": -539.4910333806819, + "loss": 0.2342, + "rewards/chosen": 0.9018145061674572, + "rewards/margins": 9.706793310322286, + "rewards/rejected": -8.80497880415483, + "step": 899 + }, + { + "epoch": 0.33224124405888056, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 7.654640948023934e-06, + "logits/chosen": 1655850325.3333333, + "logits/rejected": 1784091238.4, + "logps/chosen": -344.3448486328125, + "logps/rejected": -526.92978515625, + "loss": 0.0933, + "rewards/chosen": 1.5509324073791504, + "rewards/margins": 10.57110071182251, + "rewards/rejected": -9.02016830444336, + "step": 900 + }, + { + "epoch": 0.33261040099672373, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 7.649651777033438e-06, + "logits/chosen": 2249472819.2, + "logits/rejected": 2201947306.6666665, + "logps/chosen": -343.315673828125, + "logps/rejected": -493.31298828125, + "loss": 0.246, + "rewards/chosen": 1.0787864685058595, + "rewards/margins": 6.9987528483072925, + "rewards/rejected": -5.919966379801433, + "step": 901 + }, + { + "epoch": 0.3329795579345669, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.644658934984066e-06, + "logits/chosen": 2456276992.0, + "logits/rejected": 1881527523.5555556, + "logps/chosen": -244.56180245535714, + "logps/rejected": -487.82329644097223, + "loss": 0.1489, + "rewards/chosen": 0.9881204196384975, + "rewards/margins": 9.73927584905473, + "rewards/rejected": -8.751155429416233, + "step": 902 + }, + { + "epoch": 0.33334871487241013, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 7.639662428793342e-06, + "logits/chosen": 1487158091.2941177, + "logits/rejected": 1611816140.8, + "logps/chosen": -278.10572725183823, + "logps/rejected": -420.52623697916664, + "loss": 0.2118, + "rewards/chosen": 1.27820385203642, + "rewards/margins": 8.86293285594267, + "rewards/rejected": -7.58472900390625, + "step": 903 + }, + { + "epoch": 0.3337178718102533, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 7.634662265383858e-06, + "logits/chosen": 1662423235.047619, + "logits/rejected": 1726442402.909091, + "logps/chosen": -291.7034505208333, + "logps/rejected": -438.85129616477275, + "loss": 0.2083, + "rewards/chosen": 1.4665750776018416, + "rewards/margins": 8.466131185556387, + "rewards/rejected": -6.999556107954546, + "step": 904 + }, + { + "epoch": 0.33408702874809654, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.62965845168328e-06, + "logits/chosen": 1726058868.3636363, + "logits/rejected": 2354392268.8, + "logps/chosen": -215.2125799005682, + "logps/rejected": -506.12451171875, + "loss": 0.2494, + "rewards/chosen": 0.8305882540616122, + "rewards/margins": 7.739098080721768, + "rewards/rejected": -6.908509826660156, + "step": 905 + }, + { + "epoch": 0.3344561856859397, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 7.624650994624325e-06, + "logits/chosen": 1767791138.1333334, + "logits/rejected": 2375940216.470588, + "logps/chosen": -293.40514322916664, + "logps/rejected": -447.08375459558823, + "loss": 0.1658, + "rewards/chosen": 0.9302012125651041, + "rewards/margins": 8.82211243872549, + "rewards/rejected": -7.891911226160386, + "step": 906 + }, + { + "epoch": 0.33482534262378294, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 7.619639901144764e-06, + "logits/chosen": 1255263865.9047618, + "logits/rejected": 2391726266.181818, + "logps/chosen": -249.60291108630952, + "logps/rejected": -515.1480823863636, + "loss": 0.16, + "rewards/chosen": 1.883026849655878, + "rewards/margins": 8.997358181775907, + "rewards/rejected": -7.114331332120028, + "step": 907 + }, + { + "epoch": 0.3351944995616261, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.614625178187402e-06, + "logits/chosen": 2859116905.4117646, + "logits/rejected": 2164651076.266667, + "logps/chosen": -262.6431525735294, + "logps/rejected": -525.9163736979167, + "loss": 0.1595, + "rewards/chosen": 1.163559408748851, + "rewards/margins": 8.732710206274891, + "rewards/rejected": -7.569150797526041, + "step": 908 + }, + { + "epoch": 0.33556365649946934, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.609606832700074e-06, + "logits/chosen": 1687007744.0, + "logits/rejected": 1782651904.0, + "logps/chosen": -276.0148010253906, + "logps/rejected": -471.23260498046875, + "loss": 0.2142, + "rewards/chosen": 0.4478931128978729, + "rewards/margins": 8.311402767896652, + "rewards/rejected": -7.863509654998779, + "step": 909 + }, + { + "epoch": 0.3359328134373125, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.604584871635634e-06, + "logits/chosen": 2069287040.0, + "logits/rejected": 1681955328.0, + "logps/chosen": -333.0334777832031, + "logps/rejected": -435.6813659667969, + "loss": 0.1716, + "rewards/chosen": 0.8769656419754028, + "rewards/margins": 8.55918037891388, + "rewards/rejected": -7.682214736938477, + "step": 910 + }, + { + "epoch": 0.33630197037515575, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.5995593019519444e-06, + "logits/chosen": 1722787367.3846154, + "logits/rejected": 1900985290.1052632, + "logps/chosen": -298.0274000901442, + "logps/rejected": -489.00956003289474, + "loss": 0.1875, + "rewards/chosen": 0.6020212173461914, + "rewards/margins": 9.469590187072754, + "rewards/rejected": -8.867568969726562, + "step": 911 + }, + { + "epoch": 0.3366711273129989, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 7.59453013061187e-06, + "logits/chosen": 1690889637.6470587, + "logits/rejected": 1660935918.9333334, + "logps/chosen": -298.0385167738971, + "logps/rejected": -337.87568359375, + "loss": 0.1988, + "rewards/chosen": 1.154226639691521, + "rewards/margins": 8.371024531944125, + "rewards/rejected": -7.216797892252604, + "step": 912 + }, + { + "epoch": 0.33704028425084215, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.589497364583263e-06, + "logits/chosen": 1773031664.9411764, + "logits/rejected": 1574099763.2, + "logps/chosen": -264.55506089154414, + "logps/rejected": -505.86266276041664, + "loss": 0.1831, + "rewards/chosen": 0.8982828925637638, + "rewards/margins": 9.883475763657513, + "rewards/rejected": -8.98519287109375, + "step": 913 + }, + { + "epoch": 0.3374094411886853, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 7.5844610108389546e-06, + "logits/chosen": 1447904460.8, + "logits/rejected": 1668153514.6666667, + "logps/chosen": -292.718017578125, + "logps/rejected": -363.1314697265625, + "loss": 0.2237, + "rewards/chosen": 0.7272062301635742, + "rewards/margins": 7.903041521708171, + "rewards/rejected": -7.175835291544597, + "step": 914 + }, + { + "epoch": 0.33777859812652855, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 7.579421076356753e-06, + "logits/chosen": 1018841838.9333333, + "logits/rejected": 1410410134.5882354, + "logps/chosen": -218.51357421875, + "logps/rejected": -461.0711454503676, + "loss": 0.1083, + "rewards/chosen": 1.9160456339518228, + "rewards/margins": 9.014958041321997, + "rewards/rejected": -7.098912407370174, + "step": 915 + }, + { + "epoch": 0.3381477550643717, + "grad_norm": 17.125, + "kl": 0.13433599472045898, + "learning_rate": 7.574377568119421e-06, + "logits/chosen": 1930412590.5454545, + "logits/rejected": 1891858432.0, + "logps/chosen": -358.59028764204544, + "logps/rejected": -470.4443359375, + "loss": 0.2489, + "rewards/chosen": 0.7783184918490323, + "rewards/margins": 9.669926157864658, + "rewards/rejected": -8.891607666015625, + "step": 916 + }, + { + "epoch": 0.33851691200221495, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.569330493114675e-06, + "logits/chosen": 1960144151.2727273, + "logits/rejected": 2162470521.904762, + "logps/chosen": -375.31125710227275, + "logps/rejected": -513.3297061011905, + "loss": 0.1091, + "rewards/chosen": 1.2385460246693005, + "rewards/margins": 8.746607024948318, + "rewards/rejected": -7.508061000279018, + "step": 917 + }, + { + "epoch": 0.3388860689400581, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 7.564279858335174e-06, + "logits/chosen": 1869662549.3333333, + "logits/rejected": 2316551606.857143, + "logps/chosen": -338.86610243055554, + "logps/rejected": -413.498046875, + "loss": 0.1876, + "rewards/chosen": 1.2881849077012804, + "rewards/margins": 7.598282738337441, + "rewards/rejected": -6.310097830636161, + "step": 918 + }, + { + "epoch": 0.33925522587790136, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.5592256707785085e-06, + "logits/chosen": 1960458870.1538463, + "logits/rejected": 1602352397.4736843, + "logps/chosen": -339.49954927884613, + "logps/rejected": -491.3453947368421, + "loss": 0.1199, + "rewards/chosen": 1.098565174983098, + "rewards/margins": 8.301733542067801, + "rewards/rejected": -7.203168367084704, + "step": 919 + }, + { + "epoch": 0.33962438281574453, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 7.55416793744719e-06, + "logits/chosen": 2056471250.8235295, + "logits/rejected": 2146412407.4666667, + "logps/chosen": -270.07080078125, + "logps/rejected": -555.6276041666666, + "loss": 0.1706, + "rewards/chosen": 1.395520154167624, + "rewards/margins": 10.307916394402, + "rewards/rejected": -8.912396240234376, + "step": 920 + }, + { + "epoch": 0.33999353975358776, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 7.549106665348644e-06, + "logits/chosen": 1905739093.3333333, + "logits/rejected": 1687318287.0588236, + "logps/chosen": -300.5034505208333, + "logps/rejected": -439.69192325367646, + "loss": 0.1365, + "rewards/chosen": 1.4077505747477213, + "rewards/margins": 8.567379050161325, + "rewards/rejected": -7.159628475413603, + "step": 921 + }, + { + "epoch": 0.34036269669143093, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 7.544041861495202e-06, + "logits/chosen": 1344253470.1176472, + "logits/rejected": 1642606182.4, + "logps/chosen": -225.7610294117647, + "logps/rejected": -457.58203125, + "loss": 0.1943, + "rewards/chosen": 0.7626733218922335, + "rewards/margins": 8.16777280919692, + "rewards/rejected": -7.405099487304687, + "step": 922 + }, + { + "epoch": 0.34073185362927416, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 7.53897353290408e-06, + "logits/chosen": 1800802889.142857, + "logits/rejected": 2673995403.6363635, + "logps/chosen": -249.97484188988096, + "logps/rejected": -537.8727361505681, + "loss": 0.1836, + "rewards/chosen": 1.3472404479980469, + "rewards/margins": 10.790766282515092, + "rewards/rejected": -9.443525834517045, + "step": 923 + }, + { + "epoch": 0.34110101056711734, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 7.5339016865973865e-06, + "logits/chosen": 1795347251.2, + "logits/rejected": 1636105557.3333333, + "logps/chosen": -298.932861328125, + "logps/rejected": -551.721435546875, + "loss": 0.2117, + "rewards/chosen": 1.0166474342346192, + "rewards/margins": 9.860231494903564, + "rewards/rejected": -8.843584060668945, + "step": 924 + }, + { + "epoch": 0.34147016750496056, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 7.528826329602099e-06, + "logits/chosen": 2177455360.0, + "logits/rejected": 2134380928.0, + "logps/chosen": -285.8106994628906, + "logps/rejected": -420.74798583984375, + "loss": 0.1701, + "rewards/chosen": 1.2023110389709473, + "rewards/margins": 8.19617748260498, + "rewards/rejected": -6.993866443634033, + "step": 925 + }, + { + "epoch": 0.34183932444280374, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 7.523747468950061e-06, + "logits/chosen": 1323178257.0666666, + "logits/rejected": 1444000707.764706, + "logps/chosen": -245.53020833333332, + "logps/rejected": -403.16946231617646, + "loss": 0.1866, + "rewards/chosen": 0.753152338663737, + "rewards/margins": 8.40325573939903, + "rewards/rejected": -7.650103400735294, + "step": 926 + }, + { + "epoch": 0.34220848138064697, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 7.518665111677968e-06, + "logits/chosen": 2290167076.571429, + "logits/rejected": 2018201827.5555556, + "logps/chosen": -291.4937220982143, + "logps/rejected": -431.3548177083333, + "loss": 0.1155, + "rewards/chosen": 1.6446316582815987, + "rewards/margins": 7.973628104679168, + "rewards/rejected": -6.32899644639757, + "step": 927 + }, + { + "epoch": 0.34257763831849014, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.513579264827362e-06, + "logits/chosen": 1385023715.5555556, + "logits/rejected": 1243799552.0, + "logps/chosen": -271.28968641493054, + "logps/rejected": -429.49428013392856, + "loss": 0.166, + "rewards/chosen": 1.675567838880751, + "rewards/margins": 8.763795247153631, + "rewards/rejected": -7.088227408272879, + "step": 928 + }, + { + "epoch": 0.34294679525633337, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.508489935444618e-06, + "logits/chosen": 2002825671.1111112, + "logits/rejected": 1914287981.7142856, + "logps/chosen": -285.6776529947917, + "logps/rejected": -538.4484165736607, + "loss": 0.193, + "rewards/chosen": 1.1649994320339627, + "rewards/margins": 10.725618241325257, + "rewards/rejected": -9.560618809291295, + "step": 929 + }, + { + "epoch": 0.34331595219417654, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 7.5033971305809405e-06, + "logits/chosen": 2330751939.7647057, + "logits/rejected": 2037251003.7333333, + "logps/chosen": -295.12293198529414, + "logps/rejected": -350.74462890625, + "loss": 0.1766, + "rewards/chosen": 1.469827315386604, + "rewards/margins": 7.661250350054573, + "rewards/rejected": -6.191423034667968, + "step": 930 + }, + { + "epoch": 0.3436851091320198, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 7.498300857292342e-06, + "logits/chosen": 1719453696.0, + "logits/rejected": 2068799715.5555556, + "logps/chosen": -321.3524693080357, + "logps/rejected": -447.4122721354167, + "loss": 0.1707, + "rewards/chosen": 0.7054557800292969, + "rewards/margins": 7.708563910590278, + "rewards/rejected": -7.003108130560981, + "step": 931 + }, + { + "epoch": 0.34405426606986295, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 7.493201122639648e-06, + "logits/chosen": 2340573184.0, + "logits/rejected": 1927329189.6470587, + "logps/chosen": -314.91253255208335, + "logps/rejected": -533.3768382352941, + "loss": 0.1049, + "rewards/chosen": 2.145867919921875, + "rewards/margins": 10.510021613625918, + "rewards/rejected": -8.364153693704043, + "step": 932 + }, + { + "epoch": 0.3444234230077062, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.488097933688474e-06, + "logits/chosen": 1447946496.0, + "logits/rejected": 1541616640.0, + "logps/chosen": -259.95098876953125, + "logps/rejected": -495.1393127441406, + "loss": 0.1307, + "rewards/chosen": 1.893912672996521, + "rewards/margins": 8.915956854820251, + "rewards/rejected": -7.0220441818237305, + "step": 933 + }, + { + "epoch": 0.34479257994554935, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 7.482991297509225e-06, + "logits/chosen": 2224604306.285714, + "logits/rejected": 1424257024.0, + "logps/chosen": -221.37976655505952, + "logps/rejected": -401.4173029119318, + "loss": 0.2513, + "rewards/chosen": 1.1251788366408575, + "rewards/margins": 8.36860923436813, + "rewards/rejected": -7.2434303977272725, + "step": 934 + }, + { + "epoch": 0.3451617368833926, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.477881221177077e-06, + "logits/chosen": 1761141191.1111112, + "logits/rejected": 2132146322.2857144, + "logps/chosen": -263.58873155381946, + "logps/rejected": -593.7700892857143, + "loss": 0.1433, + "rewards/chosen": 1.5794934166802301, + "rewards/margins": 11.216820338415722, + "rewards/rejected": -9.637326921735491, + "step": 935 + }, + { + "epoch": 0.34553089382123575, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 7.472767711771979e-06, + "logits/chosen": 1821833362.2857144, + "logits/rejected": 1630131086.2222223, + "logps/chosen": -297.4893275669643, + "logps/rejected": -479.9275716145833, + "loss": 0.146, + "rewards/chosen": 1.212001119341169, + "rewards/margins": 7.0146333452255005, + "rewards/rejected": -5.802632225884332, + "step": 936 + }, + { + "epoch": 0.345900050759079, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.467650776378633e-06, + "logits/chosen": 3512381440.0, + "logits/rejected": 1458853730.4615386, + "logps/chosen": -338.2206217447917, + "logps/rejected": -422.85171274038464, + "loss": 0.0864, + "rewards/chosen": 0.9173627694447836, + "rewards/margins": 7.412650738006983, + "rewards/rejected": -6.4952879685622, + "step": 937 + }, + { + "epoch": 0.34626920769692215, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.462530422086487e-06, + "logits/chosen": 1397631385.6, + "logits/rejected": 1547091245.1764705, + "logps/chosen": -329.06634114583335, + "logps/rejected": -407.38536879595586, + "loss": 0.1634, + "rewards/chosen": 1.2566670735677083, + "rewards/margins": 8.51182409548292, + "rewards/rejected": -7.2551570219152115, + "step": 938 + }, + { + "epoch": 0.34663836463476533, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 7.4574066559897276e-06, + "logits/chosen": 2377773251.047619, + "logits/rejected": 1754632378.1818182, + "logps/chosen": -432.77901785714283, + "logps/rejected": -614.4805131392045, + "loss": 0.2243, + "rewards/chosen": 0.8133530389694941, + "rewards/margins": 11.139354986545843, + "rewards/rejected": -10.32600194757635, + "step": 939 + }, + { + "epoch": 0.34700752157260856, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.452279485187268e-06, + "logits/chosen": 1447949653.3333333, + "logits/rejected": 2150024285.090909, + "logps/chosen": -262.5908900669643, + "logps/rejected": -357.42502663352275, + "loss": 0.1509, + "rewards/chosen": 2.028566451299758, + "rewards/margins": 8.210210544206364, + "rewards/rejected": -6.181644092906605, + "step": 940 + }, + { + "epoch": 0.34737667851045173, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.4471489167827374e-06, + "logits/chosen": 1611676525.7142856, + "logits/rejected": 1414036707.5555556, + "logps/chosen": -265.53526088169644, + "logps/rejected": -587.0643446180555, + "loss": 0.1662, + "rewards/chosen": 0.8271936689104352, + "rewards/margins": 9.588599613734655, + "rewards/rejected": -8.761405944824219, + "step": 941 + }, + { + "epoch": 0.34774583544829496, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 7.442014957884473e-06, + "logits/chosen": 1233277064.5333333, + "logits/rejected": 1708645918.1176472, + "logps/chosen": -283.3255208333333, + "logps/rejected": -518.5861098345588, + "loss": 0.0978, + "rewards/chosen": 1.9671727498372396, + "rewards/margins": 8.511437150543811, + "rewards/rejected": -6.544264400706572, + "step": 942 + }, + { + "epoch": 0.34811499238613813, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 7.43687761560551e-06, + "logits/chosen": 1316014518.857143, + "logits/rejected": 1534406997.3333333, + "logps/chosen": -277.56649344308033, + "logps/rejected": -457.04893663194446, + "loss": 0.1335, + "rewards/chosen": 1.7897484643118722, + "rewards/margins": 9.203452382768903, + "rewards/rejected": -7.413703918457031, + "step": 943 + }, + { + "epoch": 0.34848414932398136, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.43173689706357e-06, + "logits/chosen": 1710671744.0, + "logits/rejected": 2123786240.0, + "logps/chosen": -256.97344970703125, + "logps/rejected": -378.7826843261719, + "loss": 0.1972, + "rewards/chosen": 0.7358039021492004, + "rewards/margins": 8.032732784748077, + "rewards/rejected": -7.296928882598877, + "step": 944 + }, + { + "epoch": 0.34885330626182454, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 7.4265928093810545e-06, + "logits/chosen": 1805941760.0, + "logits/rejected": 1353394029.7142856, + "logps/chosen": -315.53607855902777, + "logps/rejected": -467.92599051339283, + "loss": 0.1995, + "rewards/chosen": 1.014037874009874, + "rewards/margins": 8.545091144622319, + "rewards/rejected": -7.5310532706124445, + "step": 945 + }, + { + "epoch": 0.34922246319966777, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 7.421445359685031e-06, + "logits/chosen": 2025756672.0, + "logits/rejected": 2363623594.6666665, + "logps/chosen": -260.0708740234375, + "logps/rejected": -463.7125651041667, + "loss": 0.1523, + "rewards/chosen": 1.8779289245605468, + "rewards/margins": 9.56402130126953, + "rewards/rejected": -7.686092376708984, + "step": 946 + }, + { + "epoch": 0.34959162013751094, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 7.416294555107226e-06, + "logits/chosen": 1616024462.2222223, + "logits/rejected": 2003059273.142857, + "logps/chosen": -229.84727647569446, + "logps/rejected": -626.2214704241071, + "loss": 0.1279, + "rewards/chosen": 1.7303761376274958, + "rewards/margins": 11.969911878071134, + "rewards/rejected": -10.239535740443639, + "step": 947 + }, + { + "epoch": 0.34996077707535417, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 7.411140402784014e-06, + "logits/chosen": 1490518256.9411764, + "logits/rejected": 1578971409.0666666, + "logps/chosen": -258.0514705882353, + "logps/rejected": -527.38232421875, + "loss": 0.1893, + "rewards/chosen": 1.2784929836497587, + "rewards/margins": 9.739565778246114, + "rewards/rejected": -8.461072794596355, + "step": 948 + }, + { + "epoch": 0.35032993401319734, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 7.4059829098564075e-06, + "logits/chosen": 1558872519.1111112, + "logits/rejected": 1927073206.857143, + "logps/chosen": -316.08827039930554, + "logps/rejected": -447.53780691964283, + "loss": 0.2301, + "rewards/chosen": 0.4418847295973036, + "rewards/margins": 8.732850218576099, + "rewards/rejected": -8.290965488978795, + "step": 949 + }, + { + "epoch": 0.35069909095104057, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 7.400822083470046e-06, + "logits/chosen": 1879575250.8235295, + "logits/rejected": 1586992469.3333333, + "logps/chosen": -302.0782111672794, + "logps/rejected": -451.1384765625, + "loss": 0.2205, + "rewards/chosen": 0.42042970657348633, + "rewards/margins": 6.872364012400309, + "rewards/rejected": -6.451934305826823, + "step": 950 + }, + { + "epoch": 0.35106824788888374, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 7.395657930775191e-06, + "logits/chosen": 2129165789.8666666, + "logits/rejected": 1816884645.6470587, + "logps/chosen": -250.94674479166667, + "logps/rejected": -426.09329044117646, + "loss": 0.169, + "rewards/chosen": 1.108864720662435, + "rewards/margins": 7.327608886419558, + "rewards/rejected": -6.218744165757123, + "step": 951 + }, + { + "epoch": 0.351437404826727, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 7.390490458926708e-06, + "logits/chosen": 1667485952.0, + "logits/rejected": 1894405760.0, + "logps/chosen": -303.7418518066406, + "logps/rejected": -434.12652587890625, + "loss": 0.1836, + "rewards/chosen": 0.9207848906517029, + "rewards/margins": 8.116925656795502, + "rewards/rejected": -7.196140766143799, + "step": 952 + }, + { + "epoch": 0.35180656176457015, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 7.385319675084066e-06, + "logits/chosen": 1373067400.5333333, + "logits/rejected": 2307482563.7647057, + "logps/chosen": -220.604052734375, + "logps/rejected": -401.45447495404414, + "loss": 0.1471, + "rewards/chosen": 1.1077101389567057, + "rewards/margins": 8.315524972653856, + "rewards/rejected": -7.20781483369715, + "step": 953 + }, + { + "epoch": 0.3521757187024134, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.38014558641132e-06, + "logits/chosen": 1684706450.2857144, + "logits/rejected": 2076216661.3333333, + "logps/chosen": -301.26346261160717, + "logps/rejected": -408.6073404947917, + "loss": 0.1348, + "rewards/chosen": 1.1618796757289342, + "rewards/margins": 7.90938104901995, + "rewards/rejected": -6.747501373291016, + "step": 954 + }, + { + "epoch": 0.35254487564025655, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 7.3749682000771016e-06, + "logits/chosen": 1645678045.8666666, + "logits/rejected": 1413347809.8823528, + "logps/chosen": -287.57607421875, + "logps/rejected": -477.34791475183823, + "loss": 0.1031, + "rewards/chosen": 2.4843419392903647, + "rewards/margins": 10.2081210566502, + "rewards/rejected": -7.723779117359834, + "step": 955 + }, + { + "epoch": 0.3529140325780998, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.369787523254617e-06, + "logits/chosen": 1606024669.8666666, + "logits/rejected": 1487674428.235294, + "logps/chosen": -303.75478515625, + "logps/rejected": -648.9426700367648, + "loss": 0.1396, + "rewards/chosen": 1.4781453450520834, + "rewards/margins": 9.526215018478094, + "rewards/rejected": -8.048069673426012, + "step": 956 + }, + { + "epoch": 0.35328318951594295, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.364603563121627e-06, + "logits/chosen": 1678673578.6666667, + "logits/rejected": 1662014902.857143, + "logps/chosen": -294.0096842447917, + "logps/rejected": -526.1534598214286, + "loss": 0.175, + "rewards/chosen": 1.0924949645996094, + "rewards/margins": 8.929591587611608, + "rewards/rejected": -7.837096623011997, + "step": 957 + }, + { + "epoch": 0.3536523464537862, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 7.359416326860443e-06, + "logits/chosen": 1438470609.4545455, + "logits/rejected": 1270316339.2, + "logps/chosen": -299.47713955965907, + "logps/rejected": -499.35068359375, + "loss": 0.2181, + "rewards/chosen": 1.450064485723322, + "rewards/margins": 10.619850557500666, + "rewards/rejected": -9.169786071777343, + "step": 958 + }, + { + "epoch": 0.35402150339162936, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 7.3542258216579136e-06, + "logits/chosen": 1652226048.0, + "logits/rejected": 1550886195.2, + "logps/chosen": -234.97845458984375, + "logps/rejected": -515.172998046875, + "loss": 0.1079, + "rewards/chosen": 1.8943514823913574, + "rewards/margins": 9.456859111785889, + "rewards/rejected": -7.562507629394531, + "step": 959 + }, + { + "epoch": 0.3543906603294726, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 7.349032054705417e-06, + "logits/chosen": 1516410042.1818182, + "logits/rejected": 1574237427.8095238, + "logps/chosen": -317.68148526278407, + "logps/rejected": -417.3616536458333, + "loss": 0.066, + "rewards/chosen": 2.162708802656694, + "rewards/margins": 9.034151324978122, + "rewards/rejected": -6.871442522321429, + "step": 960 + }, + { + "epoch": 0.35475981726731576, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.343835033198854e-06, + "logits/chosen": 1409585561.6, + "logits/rejected": 1751366826.6666667, + "logps/chosen": -245.317529296875, + "logps/rejected": -461.6015625, + "loss": 0.2072, + "rewards/chosen": 0.9638049125671386, + "rewards/margins": 9.06177905400594, + "rewards/rejected": -8.097974141438803, + "step": 961 + }, + { + "epoch": 0.355128974205159, + "grad_norm": 13.4375, + "kl": 0.24583673477172852, + "learning_rate": 7.33863476433863e-06, + "logits/chosen": 2062272102.4, + "logits/rejected": 2364149037.1764708, + "logps/chosen": -293.49261067708335, + "logps/rejected": -568.8755744485294, + "loss": 0.1889, + "rewards/chosen": 0.7752403895060221, + "rewards/margins": 8.824402412713743, + "rewards/rejected": -8.049162023207721, + "step": 962 + }, + { + "epoch": 0.35549813114300216, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 7.333431255329653e-06, + "logits/chosen": 1834292932.9230769, + "logits/rejected": 1764339173.0526316, + "logps/chosen": -216.812255859375, + "logps/rejected": -476.7093441611842, + "loss": 0.1674, + "rewards/chosen": 0.5153369169968826, + "rewards/margins": 7.828008779147376, + "rewards/rejected": -7.312671862150493, + "step": 963 + }, + { + "epoch": 0.3558672880808454, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 7.3282245133813155e-06, + "logits/chosen": 2321588224.0, + "logits/rejected": 2192022483.478261, + "logps/chosen": -255.67643229166666, + "logps/rejected": -489.8873131793478, + "loss": 0.0913, + "rewards/chosen": 2.4182039896647134, + "rewards/margins": 10.7249136109283, + "rewards/rejected": -8.306709621263588, + "step": 964 + }, + { + "epoch": 0.35623644501868856, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.323014545707497e-06, + "logits/chosen": 1908292539.7333333, + "logits/rejected": 1798498304.0, + "logps/chosen": -264.74534505208334, + "logps/rejected": -429.3397863051471, + "loss": 0.1298, + "rewards/chosen": 1.4637151082356772, + "rewards/margins": 9.023291703766468, + "rewards/rejected": -7.559576595530791, + "step": 965 + }, + { + "epoch": 0.3566056019565318, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.317801359526538e-06, + "logits/chosen": 2345181184.0, + "logits/rejected": 2348881237.3333335, + "logps/chosen": -266.5017520680147, + "logps/rejected": -469.2376953125, + "loss": 0.2089, + "rewards/chosen": 0.7775572608498966, + "rewards/margins": 8.016799814560834, + "rewards/rejected": -7.2392425537109375, + "step": 966 + }, + { + "epoch": 0.35697475889437497, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.312584962061243e-06, + "logits/chosen": 1393080758.857143, + "logits/rejected": 1729947648.0, + "logps/chosen": -320.68233816964283, + "logps/rejected": -473.22422960069446, + "loss": 0.1185, + "rewards/chosen": 1.7503790174211775, + "rewards/margins": 8.686952469840882, + "rewards/rejected": -6.9365734524197045, + "step": 967 + }, + { + "epoch": 0.3573439158322182, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 7.307365360538865e-06, + "logits/chosen": 1636368930.1333334, + "logits/rejected": 1213020400.9411764, + "logps/chosen": -260.43097330729165, + "logps/rejected": -562.6962316176471, + "loss": 0.1051, + "rewards/chosen": 2.345916239420573, + "rewards/margins": 10.971485302495022, + "rewards/rejected": -8.62556906307445, + "step": 968 + }, + { + "epoch": 0.35771307277006137, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 7.302142562191092e-06, + "logits/chosen": 1700714064.8421052, + "logits/rejected": 1755746461.5384614, + "logps/chosen": -236.75647615131578, + "logps/rejected": -352.7238581730769, + "loss": 0.1956, + "rewards/chosen": 1.1716317628559314, + "rewards/margins": 8.01102361794908, + "rewards/rejected": -6.839391855093149, + "step": 969 + }, + { + "epoch": 0.3580822297079046, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.2969165742540495e-06, + "logits/chosen": 2663314578.285714, + "logits/rejected": 1753791829.3333333, + "logps/chosen": -252.30189732142858, + "logps/rejected": -431.94151475694446, + "loss": 0.1416, + "rewards/chosen": 1.6858813422066825, + "rewards/margins": 7.935684249514625, + "rewards/rejected": -6.249802907307942, + "step": 970 + }, + { + "epoch": 0.35845138664574777, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 7.2916874039682765e-06, + "logits/chosen": 2230933504.0, + "logits/rejected": 1695539785.142857, + "logps/chosen": -326.95871803977275, + "logps/rejected": -461.6517857142857, + "loss": 0.1146, + "rewards/chosen": 1.3776614449240945, + "rewards/margins": 8.49801187391405, + "rewards/rejected": -7.120350428989956, + "step": 971 + }, + { + "epoch": 0.358820543583591, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 7.286455058578719e-06, + "logits/chosen": 1716515726.2222223, + "logits/rejected": 1816069997.7142856, + "logps/chosen": -266.0740559895833, + "logps/rejected": -526.49658203125, + "loss": 0.1466, + "rewards/chosen": 1.6248890558878581, + "rewards/margins": 9.270361991155715, + "rewards/rejected": -7.645472935267857, + "step": 972 + }, + { + "epoch": 0.3591897005214342, + "grad_norm": 15.125, + "kl": 0.0, + "learning_rate": 7.281219545334727e-06, + "logits/chosen": 2064604364.8, + "logits/rejected": 2393996800.0, + "logps/chosen": -286.6337158203125, + "logps/rejected": -608.9473470052084, + "loss": 0.2436, + "rewards/chosen": 0.623737096786499, + "rewards/margins": 9.939618825912476, + "rewards/rejected": -9.315881729125977, + "step": 973 + }, + { + "epoch": 0.35955885745927735, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 7.275980871490038e-06, + "logits/chosen": 1813209088.0, + "logits/rejected": 2076497289.8461537, + "logps/chosen": -221.1667351973684, + "logps/rejected": -374.6608698918269, + "loss": 0.2224, + "rewards/chosen": 0.785782864219264, + "rewards/margins": 7.862442433592762, + "rewards/rejected": -7.076659569373498, + "step": 974 + }, + { + "epoch": 0.3599280143971206, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.270739044302767e-06, + "logits/chosen": 1592355717.12, + "logits/rejected": 1840764489.142857, + "logps/chosen": -254.803359375, + "logps/rejected": -565.531982421875, + "loss": 0.232, + "rewards/chosen": 1.3555564880371094, + "rewards/margins": 10.002648489815849, + "rewards/rejected": -8.64709200177874, + "step": 975 + }, + { + "epoch": 0.36029717133496375, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 7.265494071035401e-06, + "logits/chosen": 1381356416.0, + "logits/rejected": 1673970176.0, + "logps/chosen": -305.58660888671875, + "logps/rejected": -544.9133911132812, + "loss": 0.1208, + "rewards/chosen": 1.9210807085037231, + "rewards/margins": 9.764545559883118, + "rewards/rejected": -7.8434648513793945, + "step": 976 + }, + { + "epoch": 0.360666328272807, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 7.260245958954783e-06, + "logits/chosen": 2319021149.090909, + "logits/rejected": 2420463001.6, + "logps/chosen": -257.863037109375, + "logps/rejected": -594.554443359375, + "loss": 0.1359, + "rewards/chosen": 2.1475452076305044, + "rewards/margins": 10.656732524525035, + "rewards/rejected": -8.509187316894531, + "step": 977 + }, + { + "epoch": 0.36103548521065015, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 7.254994715332102e-06, + "logits/chosen": 1574477653.3333333, + "logits/rejected": 1777807360.0, + "logps/chosen": -296.8848876953125, + "logps/rejected": -472.112646484375, + "loss": 0.1124, + "rewards/chosen": 1.5844793319702148, + "rewards/margins": 8.958944511413574, + "rewards/rejected": -7.3744651794433596, + "step": 978 + }, + { + "epoch": 0.3614046421484934, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.249740347442895e-06, + "logits/chosen": 1846079360.0, + "logits/rejected": 1587787008.0, + "logps/chosen": -286.3741455078125, + "logps/rejected": -449.79815673828125, + "loss": 0.1571, + "rewards/chosen": 1.2616990804672241, + "rewards/margins": 8.543264746665955, + "rewards/rejected": -7.2815656661987305, + "step": 979 + }, + { + "epoch": 0.36177379908633656, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 7.244482862567018e-06, + "logits/chosen": 2190936221.5384617, + "logits/rejected": 1932879117.4736843, + "logps/chosen": -280.38521634615387, + "logps/rejected": -513.8957134046053, + "loss": 0.1384, + "rewards/chosen": 0.987820551945613, + "rewards/margins": 9.474336600979331, + "rewards/rejected": -8.486516049033717, + "step": 980 + }, + { + "epoch": 0.3621429560241798, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.2392222679886506e-06, + "logits/chosen": 2757159273.4117646, + "logits/rejected": 2392661469.866667, + "logps/chosen": -283.2100183823529, + "logps/rejected": -430.2301432291667, + "loss": 0.1369, + "rewards/chosen": 1.8558273315429688, + "rewards/margins": 7.846414693196615, + "rewards/rejected": -5.990587361653646, + "step": 981 + }, + { + "epoch": 0.36251211296202296, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.23395857099628e-06, + "logits/chosen": 1959237159.3846154, + "logits/rejected": 1727010923.7894738, + "logps/chosen": -279.84420072115387, + "logps/rejected": -449.20579769736844, + "loss": 0.1268, + "rewards/chosen": 1.8084634634164662, + "rewards/margins": 8.901284283471977, + "rewards/rejected": -7.09282082005551, + "step": 982 + }, + { + "epoch": 0.3628812698998662, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 7.2286917788826926e-06, + "logits/chosen": 1503039744.0, + "logits/rejected": 2163049813.3333335, + "logps/chosen": -307.04766845703125, + "logps/rejected": -438.4186604817708, + "loss": 0.0863, + "rewards/chosen": 1.2653594017028809, + "rewards/margins": 8.401666800181072, + "rewards/rejected": -7.13630739847819, + "step": 983 + }, + { + "epoch": 0.36325042683770936, + "grad_norm": 17.625, + "kl": 0.23502063751220703, + "learning_rate": 7.22342189894496e-06, + "logits/chosen": 1752616550.4, + "logits/rejected": 1625619626.6666667, + "logps/chosen": -362.8557861328125, + "logps/rejected": -364.2164713541667, + "loss": 0.2617, + "rewards/chosen": 0.5863108158111572, + "rewards/margins": 6.7914690494537355, + "rewards/rejected": -6.205158233642578, + "step": 984 + }, + { + "epoch": 0.3636195837755526, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.218148938484435e-06, + "logits/chosen": 1652110267.7333333, + "logits/rejected": 1324196562.8235295, + "logps/chosen": -288.89147135416664, + "logps/rejected": -462.12971047794116, + "loss": 0.1594, + "rewards/chosen": 1.224811808268229, + "rewards/margins": 9.152072263231464, + "rewards/rejected": -7.9272604549632355, + "step": 985 + }, + { + "epoch": 0.36398874071339576, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 7.212872904806736e-06, + "logits/chosen": 1901161728.0, + "logits/rejected": 1616806400.0, + "logps/chosen": -228.57269287109375, + "logps/rejected": -400.9443054199219, + "loss": 0.1549, + "rewards/chosen": 1.6516929864883423, + "rewards/margins": 8.632474541664124, + "rewards/rejected": -6.980781555175781, + "step": 986 + }, + { + "epoch": 0.364357897651239, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.207593805221742e-06, + "logits/chosen": 1408388681.142857, + "logits/rejected": 1415150478.2222223, + "logps/chosen": -236.67501395089286, + "logps/rejected": -489.34950086805554, + "loss": 0.1353, + "rewards/chosen": 1.3888410840715681, + "rewards/margins": 9.922443480718703, + "rewards/rejected": -8.533602396647135, + "step": 987 + }, + { + "epoch": 0.36472705458908217, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.202311647043579e-06, + "logits/chosen": 1497909394.2857144, + "logits/rejected": 1511184611.5555556, + "logps/chosen": -319.49204799107144, + "logps/rejected": -533.3566623263889, + "loss": 0.1143, + "rewards/chosen": 1.4950159617832728, + "rewards/margins": 14.413478927006796, + "rewards/rejected": -12.918462965223524, + "step": 988 + }, + { + "epoch": 0.3650962115269254, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.197026437590608e-06, + "logits/chosen": 2296422400.0, + "logits/rejected": 1475995648.0, + "logps/chosen": -328.4907740542763, + "logps/rejected": -608.2450796274038, + "loss": 0.176, + "rewards/chosen": 1.2631664276123047, + "rewards/margins": 9.48023326580341, + "rewards/rejected": -8.217066838191105, + "step": 989 + }, + { + "epoch": 0.36546536846476857, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 7.191738184185422e-06, + "logits/chosen": 1630315520.0, + "logits/rejected": 1560410496.0, + "logps/chosen": -331.495361328125, + "logps/rejected": -477.4812316894531, + "loss": 0.1674, + "rewards/chosen": 1.070287823677063, + "rewards/margins": 9.238373875617981, + "rewards/rejected": -8.168086051940918, + "step": 990 + }, + { + "epoch": 0.3658345254026118, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 7.186446894154826e-06, + "logits/chosen": 1444898508.8, + "logits/rejected": 1346974720.0, + "logps/chosen": -242.1988037109375, + "logps/rejected": -366.4676106770833, + "loss": 0.2152, + "rewards/chosen": 1.2893174171447754, + "rewards/margins": 7.068068790435791, + "rewards/rejected": -5.778751373291016, + "step": 991 + }, + { + "epoch": 0.366203682340455, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 7.181152574829837e-06, + "logits/chosen": 1762307510.857143, + "logits/rejected": 1726475377.7777777, + "logps/chosen": -281.13462611607144, + "logps/rejected": -441.1068522135417, + "loss": 0.1504, + "rewards/chosen": 1.120798110961914, + "rewards/margins": 7.686441421508789, + "rewards/rejected": -6.565643310546875, + "step": 992 + }, + { + "epoch": 0.3665728392782982, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 7.175855233545669e-06, + "logits/chosen": 1077204480.0, + "logits/rejected": 1931332096.0, + "logps/chosen": -262.81243896484375, + "logps/rejected": -427.8982747395833, + "loss": 0.089, + "rewards/chosen": 1.3157358169555664, + "rewards/margins": 8.350486437479656, + "rewards/rejected": -7.034750620524089, + "step": 993 + }, + { + "epoch": 0.3669419962161414, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 7.1705548776417165e-06, + "logits/chosen": 1722438314.6666667, + "logits/rejected": 1947669913.6, + "logps/chosen": -222.4847208658854, + "logps/rejected": -423.852978515625, + "loss": 0.1279, + "rewards/chosen": 1.4284906387329102, + "rewards/margins": 7.967957496643066, + "rewards/rejected": -6.539466857910156, + "step": 994 + }, + { + "epoch": 0.3673111531539846, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.1652515144615575e-06, + "logits/chosen": 2405808583.111111, + "logits/rejected": 2353978953.142857, + "logps/chosen": -337.2381998697917, + "logps/rejected": -482.495849609375, + "loss": 0.1692, + "rewards/chosen": 1.3000715043809679, + "rewards/margins": 9.607193326193189, + "rewards/rejected": -8.30712182181222, + "step": 995 + }, + { + "epoch": 0.3676803100918278, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.1599451513529364e-06, + "logits/chosen": 1827178359.4666667, + "logits/rejected": 2424532269.1764708, + "logps/chosen": -272.7055338541667, + "logps/rejected": -425.91061580882354, + "loss": 0.1273, + "rewards/chosen": 1.6635528564453126, + "rewards/margins": 8.568769746668199, + "rewards/rejected": -6.905216890222886, + "step": 996 + }, + { + "epoch": 0.368049467029671, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 7.154635795667748e-06, + "logits/chosen": 2276263799.4666667, + "logits/rejected": 1883278878.1176472, + "logps/chosen": -183.10221354166666, + "logps/rejected": -445.306640625, + "loss": 0.1339, + "rewards/chosen": 1.7934338887532553, + "rewards/margins": 9.271901942234415, + "rewards/rejected": -7.4784680534811585, + "step": 997 + }, + { + "epoch": 0.3684186239675142, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.149323454762039e-06, + "logits/chosen": 2160585458.5263157, + "logits/rejected": 1944447763.6923077, + "logps/chosen": -271.9268092105263, + "logps/rejected": -583.3894981971154, + "loss": 0.1788, + "rewards/chosen": 1.4040230198910362, + "rewards/margins": 11.444025696047888, + "rewards/rejected": -10.04000267615685, + "step": 998 + }, + { + "epoch": 0.3687877809053574, + "grad_norm": 11.5625, + "kl": 0.23833942413330078, + "learning_rate": 7.144008135995992e-06, + "logits/chosen": 1906898156.3076923, + "logits/rejected": 2135348924.631579, + "logps/chosen": -269.36632361778845, + "logps/rejected": -509.2290296052632, + "loss": 0.1521, + "rewards/chosen": 1.0306680385883038, + "rewards/margins": 10.820670961851052, + "rewards/rejected": -9.790002923262747, + "step": 999 + }, + { + "epoch": 0.3691569378432006, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 7.1386898467339114e-06, + "logits/chosen": 2097580032.0, + "logits/rejected": 1917192704.0, + "logps/chosen": -361.8123291015625, + "logps/rejected": -551.8881022135416, + "loss": 0.2031, + "rewards/chosen": 1.446066188812256, + "rewards/margins": 8.684330272674561, + "rewards/rejected": -7.238264083862305, + "step": 1000 + }, + { + "epoch": 0.3695260947810438, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 7.13336859434422e-06, + "logits/chosen": 1305741635.368421, + "logits/rejected": 1609012145.2307692, + "logps/chosen": -295.3207750822368, + "logps/rejected": -361.2751277043269, + "loss": 0.1953, + "rewards/chosen": 1.1566213306627775, + "rewards/margins": 6.857979832390542, + "rewards/rejected": -5.701358501727764, + "step": 1001 + }, + { + "epoch": 0.369895251718887, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.128044386199445e-06, + "logits/chosen": 1775492534.857143, + "logits/rejected": 1439385372.4444444, + "logps/chosen": -294.73228236607144, + "logps/rejected": -417.39453125, + "loss": 0.1552, + "rewards/chosen": 1.1806485312325614, + "rewards/margins": 8.779596692039853, + "rewards/rejected": -7.598948160807292, + "step": 1002 + }, + { + "epoch": 0.3702644086567302, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 7.1227172296762086e-06, + "logits/chosen": 1571582554.3529413, + "logits/rejected": 1307812113.0666666, + "logps/chosen": -340.0009765625, + "logps/rejected": -421.11712239583335, + "loss": 0.2024, + "rewards/chosen": 0.9526125963996438, + "rewards/margins": 7.747683498906154, + "rewards/rejected": -6.79507090250651, + "step": 1003 + }, + { + "epoch": 0.3706335655945734, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.11738713215522e-06, + "logits/chosen": 1451334528.0, + "logits/rejected": 2082155264.0, + "logps/chosen": -281.0663146972656, + "logps/rejected": -549.1507568359375, + "loss": 0.1469, + "rewards/chosen": 1.365386724472046, + "rewards/margins": 10.51999831199646, + "rewards/rejected": -9.154611587524414, + "step": 1004 + }, + { + "epoch": 0.3710027225324166, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.112054101021262e-06, + "logits/chosen": 2051193651.2, + "logits/rejected": 2593418782.117647, + "logps/chosen": -269.0847493489583, + "logps/rejected": -389.28435202205884, + "loss": 0.1824, + "rewards/chosen": 0.7525379180908203, + "rewards/margins": 7.855279832727769, + "rewards/rejected": -7.102741914636948, + "step": 1005 + }, + { + "epoch": 0.3713718794702598, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 7.106718143663178e-06, + "logits/chosen": 1515413383.5294118, + "logits/rejected": 1759716966.4, + "logps/chosen": -249.42681525735293, + "logps/rejected": -430.48896484375, + "loss": 0.1241, + "rewards/chosen": 2.167745029225069, + "rewards/margins": 9.018424553964653, + "rewards/rejected": -6.850679524739584, + "step": 1006 + }, + { + "epoch": 0.371741036408103, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 7.101379267473873e-06, + "logits/chosen": 1397140736.0, + "logits/rejected": 1566905216.0, + "logps/chosen": -212.9573974609375, + "logps/rejected": -484.15118408203125, + "loss": 0.1198, + "rewards/chosen": 1.9922702312469482, + "rewards/margins": 9.459661722183228, + "rewards/rejected": -7.467391490936279, + "step": 1007 + }, + { + "epoch": 0.3721101933459462, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 7.096037479850292e-06, + "logits/chosen": 1703364096.0, + "logits/rejected": 1684157952.0, + "logps/chosen": -287.6410827636719, + "logps/rejected": -498.546875, + "loss": 0.176, + "rewards/chosen": 1.269957423210144, + "rewards/margins": 11.07624614238739, + "rewards/rejected": -9.806288719177246, + "step": 1008 + }, + { + "epoch": 0.3724793502837894, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 7.090692788193409e-06, + "logits/chosen": 1348441460.3636363, + "logits/rejected": 1387642489.9047618, + "logps/chosen": -226.32741477272728, + "logps/rejected": -432.0068359375, + "loss": 0.0945, + "rewards/chosen": 1.7215891751376065, + "rewards/margins": 10.125743634773023, + "rewards/rejected": -8.404154459635416, + "step": 1009 + }, + { + "epoch": 0.3728485072216326, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 7.085345199908234e-06, + "logits/chosen": 1900947132.631579, + "logits/rejected": 2362239448.6153846, + "logps/chosen": -320.6041837993421, + "logps/rejected": -469.6843449519231, + "loss": 0.1808, + "rewards/chosen": 1.0752721083791632, + "rewards/margins": 9.314012295804043, + "rewards/rejected": -8.23874018742488, + "step": 1010 + }, + { + "epoch": 0.37321766415947577, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 7.0799947224037765e-06, + "logits/chosen": 2052017298.2857144, + "logits/rejected": 1794142435.5555556, + "logps/chosen": -302.06689453125, + "logps/rejected": -408.7010091145833, + "loss": 0.1213, + "rewards/chosen": 1.681612423488072, + "rewards/margins": 9.354882754976787, + "rewards/rejected": -7.673270331488715, + "step": 1011 + }, + { + "epoch": 0.373586821097319, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 7.074641363093058e-06, + "logits/chosen": 1759558957.1764705, + "logits/rejected": 1738430464.0, + "logps/chosen": -284.1734834558824, + "logps/rejected": -574.5643880208333, + "loss": 0.1831, + "rewards/chosen": 0.7744261797736672, + "rewards/margins": 8.885008658614813, + "rewards/rejected": -8.110582478841145, + "step": 1012 + }, + { + "epoch": 0.3739559780351622, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.0692851293930885e-06, + "logits/chosen": 1928637644.8, + "logits/rejected": 1967263930.1818182, + "logps/chosen": -287.18701171875, + "logps/rejected": -535.2310901988636, + "loss": 0.105, + "rewards/chosen": 1.008524513244629, + "rewards/margins": 10.19213976426558, + "rewards/rejected": -9.183615251020951, + "step": 1013 + }, + { + "epoch": 0.3743251349730054, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 7.063926028724861e-06, + "logits/chosen": 2877248365.714286, + "logits/rejected": 2187818325.3333335, + "logps/chosen": -200.97377232142858, + "logps/rejected": -422.8129069010417, + "loss": 0.151, + "rewards/chosen": 0.9688583782741002, + "rewards/margins": 7.786101318541027, + "rewards/rejected": -6.817242940266927, + "step": 1014 + }, + { + "epoch": 0.3746942919108486, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 7.058564068513344e-06, + "logits/chosen": 2151062323.2, + "logits/rejected": 1565646607.0588236, + "logps/chosen": -362.2584635416667, + "logps/rejected": -562.8606962316177, + "loss": 0.1693, + "rewards/chosen": 0.9718924204508463, + "rewards/margins": 10.957364258111692, + "rewards/rejected": -9.985471837660846, + "step": 1015 + }, + { + "epoch": 0.3750634488486918, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 7.053199256187464e-06, + "logits/chosen": 1247994441.142857, + "logits/rejected": 1363786296.8888888, + "logps/chosen": -288.19775390625, + "logps/rejected": -493.1687825520833, + "loss": 0.126, + "rewards/chosen": 1.473339217049735, + "rewards/margins": 9.890036401294527, + "rewards/rejected": -8.416697184244791, + "step": 1016 + }, + { + "epoch": 0.375432605786535, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 7.047831599180099e-06, + "logits/chosen": 1764913906.5263157, + "logits/rejected": 1760204642.4615386, + "logps/chosen": -271.7682462993421, + "logps/rejected": -491.1436298076923, + "loss": 0.1656, + "rewards/chosen": 1.477714036640368, + "rewards/margins": 8.784272498930031, + "rewards/rejected": -7.306558462289663, + "step": 1017 + }, + { + "epoch": 0.375432605786535, + "eval_kl": 0.0, + "eval_logits/chosen": 3611505208.3444977, + "eval_logits/rejected": 3641732818.5627704, + "eval_logps/chosen": -292.8368906997608, + "eval_logps/rejected": -479.27627840909093, + "eval_loss": 0.1404857188463211, + "eval_rewards/chosen": 1.4177916312331789, + "eval_rewards/margins": 9.572374922291079, + "eval_rewards/rejected": -8.1545832910579, + "eval_runtime": 109.497, + "eval_samples_per_second": 8.0, + "eval_steps_per_second": 0.502, + "step": 1017 + }, + { + "epoch": 0.3758017627243782, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 7.042461104928072e-06, + "logits/chosen": 2308291840.0, + "logits/rejected": 2235788800.0, + "logps/chosen": -277.43316650390625, + "logps/rejected": -451.8218994140625, + "loss": 0.1494, + "rewards/chosen": 1.4298688173294067, + "rewards/margins": 9.353927254676819, + "rewards/rejected": -7.924058437347412, + "step": 1018 + }, + { + "epoch": 0.3761709196622214, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 7.037087780872134e-06, + "logits/chosen": 1762543802.1818182, + "logits/rejected": 1822557135.2380953, + "logps/chosen": -230.17649147727272, + "logps/rejected": -531.8423549107143, + "loss": 0.0798, + "rewards/chosen": 1.6482153805819424, + "rewards/margins": 9.225164727215127, + "rewards/rejected": -7.576949346633184, + "step": 1019 + }, + { + "epoch": 0.3765400766000646, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 7.031711634456954e-06, + "logits/chosen": 1740394222.9333334, + "logits/rejected": 1744674695.5294118, + "logps/chosen": -280.4731119791667, + "logps/rejected": -521.5599724264706, + "loss": 0.1718, + "rewards/chosen": 0.8565059026082357, + "rewards/margins": 10.729071676964853, + "rewards/rejected": -9.872565774356618, + "step": 1020 + }, + { + "epoch": 0.3769092335379078, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 7.02633267313112e-06, + "logits/chosen": 1977146669.1764705, + "logits/rejected": 1873325397.3333333, + "logps/chosen": -350.4914981617647, + "logps/rejected": -631.047265625, + "loss": 0.1623, + "rewards/chosen": 1.2001571655273438, + "rewards/margins": 20.739378356933592, + "rewards/rejected": -19.53922119140625, + "step": 1021 + }, + { + "epoch": 0.377278390475751, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 7.02095090434711e-06, + "logits/chosen": 1923323611.4285715, + "logits/rejected": 1513963292.4444444, + "logps/chosen": -205.41287667410714, + "logps/rejected": -464.9054361979167, + "loss": 0.1326, + "rewards/chosen": 1.5992021560668945, + "rewards/margins": 8.810722033182781, + "rewards/rejected": -7.211519877115886, + "step": 1022 + }, + { + "epoch": 0.3776475474135942, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 7.015566335561297e-06, + "logits/chosen": 1366490538.6666667, + "logits/rejected": 1401238937.6, + "logps/chosen": -274.3295491536458, + "logps/rejected": -413.497119140625, + "loss": 0.1514, + "rewards/chosen": 0.5374922752380371, + "rewards/margins": 7.9245329856872555, + "rewards/rejected": -7.387040710449218, + "step": 1023 + }, + { + "epoch": 0.3780167043514374, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 7.010178974233936e-06, + "logits/chosen": 1271549520.8421052, + "logits/rejected": 1499073142.1538463, + "logps/chosen": -288.74043996710526, + "logps/rejected": -354.60006009615387, + "loss": 0.2372, + "rewards/chosen": 1.1480709879021895, + "rewards/margins": 6.171163991395279, + "rewards/rejected": -5.023093003493089, + "step": 1024 + }, + { + "epoch": 0.3783858612892806, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 7.004788827829143e-06, + "logits/chosen": 1838691766.857143, + "logits/rejected": 2093924807.1111112, + "logps/chosen": -237.09024483816964, + "logps/rejected": -469.8660481770833, + "loss": 0.1701, + "rewards/chosen": 0.9613872255597796, + "rewards/margins": 8.38368014683799, + "rewards/rejected": -7.422292921278212, + "step": 1025 + }, + { + "epoch": 0.3787550182271238, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 6.9993959038149e-06, + "logits/chosen": 1867516635.4285715, + "logits/rejected": 1856136988.4444444, + "logps/chosen": -284.51790945870533, + "logps/rejected": -479.26719835069446, + "loss": 0.1954, + "rewards/chosen": 0.5043069635118756, + "rewards/margins": 8.398740726803977, + "rewards/rejected": -7.894433763292101, + "step": 1026 + }, + { + "epoch": 0.379124175164967, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 6.994000209663037e-06, + "logits/chosen": 2062204723.2, + "logits/rejected": 2233994240.0, + "logps/chosen": -256.8276611328125, + "logps/rejected": -490.1243489583333, + "loss": 0.2158, + "rewards/chosen": 0.913912582397461, + "rewards/margins": 8.102145640055339, + "rewards/rejected": -7.188233057657878, + "step": 1027 + }, + { + "epoch": 0.3794933321028102, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.988601752849213e-06, + "logits/chosen": 1167296585.142857, + "logits/rejected": 1973685589.3333333, + "logps/chosen": -240.83367047991072, + "logps/rejected": -483.53038194444446, + "loss": 0.1569, + "rewards/chosen": 1.602776391165597, + "rewards/margins": 8.96336785573808, + "rewards/rejected": -7.360591464572483, + "step": 1028 + }, + { + "epoch": 0.3798624890406534, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.983200540852928e-06, + "logits/chosen": 1465306663.3846154, + "logits/rejected": 2209368602.9473686, + "logps/chosen": -253.55885667067307, + "logps/rejected": -618.7204975328947, + "loss": 0.1161, + "rewards/chosen": 1.612320533165565, + "rewards/margins": 12.05262413488226, + "rewards/rejected": -10.440303601716694, + "step": 1029 + }, + { + "epoch": 0.3802316459784966, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.97779658115749e-06, + "logits/chosen": 1983917624.8888888, + "logits/rejected": 2281856146.285714, + "logps/chosen": -245.47466362847223, + "logps/rejected": -456.01175362723217, + "loss": 0.1951, + "rewards/chosen": 1.3230375713772244, + "rewards/margins": 7.345101780361599, + "rewards/rejected": -6.022064208984375, + "step": 1030 + }, + { + "epoch": 0.3806008029163398, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 6.972389881250015e-06, + "logits/chosen": 1558298781.5384614, + "logits/rejected": 2149740328.4210525, + "logps/chosen": -351.7357647235577, + "logps/rejected": -441.39828330592104, + "loss": 0.1155, + "rewards/chosen": 1.8236981905423677, + "rewards/margins": 8.840064446453141, + "rewards/rejected": -7.016366255910773, + "step": 1031 + }, + { + "epoch": 0.380969959854183, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 6.9669804486214196e-06, + "logits/chosen": 2142004077.7142856, + "logits/rejected": 3051310193.7777777, + "logps/chosen": -287.0013427734375, + "logps/rejected": -575.4581705729166, + "loss": 0.1923, + "rewards/chosen": 0.3978395462036133, + "rewards/margins": 9.325613763597277, + "rewards/rejected": -8.927774217393663, + "step": 1032 + }, + { + "epoch": 0.3813391167920262, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.9615682907664025e-06, + "logits/chosen": 1474775771.4285715, + "logits/rejected": 1737034865.7777777, + "logps/chosen": -281.27083914620533, + "logps/rejected": -519.3575303819445, + "loss": 0.1237, + "rewards/chosen": 1.2917416436331612, + "rewards/margins": 9.480021537296356, + "rewards/rejected": -8.188279893663195, + "step": 1033 + }, + { + "epoch": 0.38170827372986943, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 6.95615341518344e-06, + "logits/chosen": 1503608217.6, + "logits/rejected": 1627995477.3333333, + "logps/chosen": -283.79169921875, + "logps/rejected": -479.4219563802083, + "loss": 0.2233, + "rewards/chosen": 0.972289752960205, + "rewards/margins": 8.359754276275634, + "rewards/rejected": -7.38746452331543, + "step": 1034 + }, + { + "epoch": 0.3820774306677126, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.950735829374773e-06, + "logits/chosen": 1944830313.4117646, + "logits/rejected": 1608161416.5333333, + "logps/chosen": -276.91906020220586, + "logps/rejected": -425.09485677083336, + "loss": 0.1686, + "rewards/chosen": 1.1956363004796646, + "rewards/margins": 8.467504179711437, + "rewards/rejected": -7.271867879231771, + "step": 1035 + }, + { + "epoch": 0.38244658760555583, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.9453155408464005e-06, + "logits/chosen": 1364251884.3076923, + "logits/rejected": 2067474539.7894738, + "logps/chosen": -307.3048565204327, + "logps/rejected": -421.2173622532895, + "loss": 0.1127, + "rewards/chosen": 1.5592793684739332, + "rewards/margins": 8.88919366226505, + "rewards/rejected": -7.329914293791118, + "step": 1036 + }, + { + "epoch": 0.382815744543399, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 6.939892557108059e-06, + "logits/chosen": 1341420836.5714285, + "logits/rejected": 1569915107.5555556, + "logps/chosen": -217.47148786272322, + "logps/rejected": -457.4826388888889, + "loss": 0.1035, + "rewards/chosen": 2.421404702322824, + "rewards/margins": 9.182301173134455, + "rewards/rejected": -6.760896470811632, + "step": 1037 + }, + { + "epoch": 0.38318490148124223, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 6.9344668856732255e-06, + "logits/chosen": 1595065344.0, + "logits/rejected": 1447868586.6666667, + "logps/chosen": -261.9731201171875, + "logps/rejected": -594.255859375, + "loss": 0.179, + "rewards/chosen": 1.436758041381836, + "rewards/margins": 9.232110977172852, + "rewards/rejected": -7.795352935791016, + "step": 1038 + }, + { + "epoch": 0.3835540584190854, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.9290385340591e-06, + "logits/chosen": 1886173952.0, + "logits/rejected": 1903924992.0, + "logps/chosen": -272.93597412109375, + "logps/rejected": -523.7559814453125, + "loss": 0.1671, + "rewards/chosen": 1.059628963470459, + "rewards/margins": 10.914914608001709, + "rewards/rejected": -9.85528564453125, + "step": 1039 + }, + { + "epoch": 0.38392321535692864, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.923607509786593e-06, + "logits/chosen": 1697210368.0, + "logits/rejected": 1431115414.5882354, + "logps/chosen": -282.9158203125, + "logps/rejected": -464.20978860294116, + "loss": 0.1563, + "rewards/chosen": 1.1723981221516928, + "rewards/margins": 7.068711060168696, + "rewards/rejected": -5.896312938017004, + "step": 1040 + }, + { + "epoch": 0.3842923722947718, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 6.918173820380321e-06, + "logits/chosen": 1955246694.4, + "logits/rejected": 1858172245.3333333, + "logps/chosen": -320.8313232421875, + "logps/rejected": -463.9962158203125, + "loss": 0.2125, + "rewards/chosen": 1.063975715637207, + "rewards/margins": 9.645189984639487, + "rewards/rejected": -8.58121426900228, + "step": 1041 + }, + { + "epoch": 0.38466152923261504, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 6.91273747336859e-06, + "logits/chosen": 1671425675.6363637, + "logits/rejected": 1863190235.4285715, + "logps/chosen": -403.3275035511364, + "logps/rejected": -477.48902529761904, + "loss": 0.1058, + "rewards/chosen": 1.5428071455522017, + "rewards/margins": 8.234510578634419, + "rewards/rejected": -6.691703433082218, + "step": 1042 + }, + { + "epoch": 0.3850306861704582, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.907298476283392e-06, + "logits/chosen": 1792460559.0588236, + "logits/rejected": 1976245452.8, + "logps/chosen": -289.27001953125, + "logps/rejected": -495.2573567708333, + "loss": 0.1556, + "rewards/chosen": 1.134869519401999, + "rewards/margins": 9.29150048050226, + "rewards/rejected": -8.15663096110026, + "step": 1043 + }, + { + "epoch": 0.38539984310830144, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.901856836660386e-06, + "logits/chosen": 1665567597.7142856, + "logits/rejected": 1531277994.6666667, + "logps/chosen": -308.95382254464283, + "logps/rejected": -477.3097330729167, + "loss": 0.1233, + "rewards/chosen": 1.54719420841762, + "rewards/margins": 9.833128656659808, + "rewards/rejected": -8.285934448242188, + "step": 1044 + }, + { + "epoch": 0.3857690000461446, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.896412562038897e-06, + "logits/chosen": 2135352115.2, + "logits/rejected": 1951150592.0, + "logps/chosen": -316.8733154296875, + "logps/rejected": -463.3391927083333, + "loss": 0.1678, + "rewards/chosen": 1.4995895385742188, + "rewards/margins": 7.670197423299154, + "rewards/rejected": -6.170607884724935, + "step": 1045 + }, + { + "epoch": 0.3861381569839878, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.890965659961897e-06, + "logits/chosen": 1855345371.4285715, + "logits/rejected": 1641295872.0, + "logps/chosen": -273.76109095982144, + "logps/rejected": -410.3990071614583, + "loss": 0.1632, + "rewards/chosen": 0.7023242541721889, + "rewards/margins": 7.484789023323665, + "rewards/rejected": -6.782464769151476, + "step": 1046 + }, + { + "epoch": 0.386507313921831, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.885516137975998e-06, + "logits/chosen": 2374526005.894737, + "logits/rejected": 2854545880.6153846, + "logps/chosen": -295.58277652138156, + "logps/rejected": -551.6556865985577, + "loss": 0.2131, + "rewards/chosen": 1.1426232990465666, + "rewards/margins": 10.180677545215438, + "rewards/rejected": -9.03805424616887, + "step": 1047 + }, + { + "epoch": 0.3868764708596742, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.880064003631446e-06, + "logits/chosen": 1734248075.6363637, + "logits/rejected": 1319384576.0, + "logps/chosen": -261.5970348011364, + "logps/rejected": -555.1615234375, + "loss": 0.1929, + "rewards/chosen": 1.5112571716308594, + "rewards/margins": 9.255259704589843, + "rewards/rejected": -7.7440025329589846, + "step": 1048 + }, + { + "epoch": 0.3872456277975174, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 6.874609264482103e-06, + "logits/chosen": 2079796428.8, + "logits/rejected": 2374291456.0, + "logps/chosen": -295.87216796875, + "logps/rejected": -528.7972005208334, + "loss": 0.1862, + "rewards/chosen": 1.1997077941894532, + "rewards/margins": 9.661069361368815, + "rewards/rejected": -8.461361567179361, + "step": 1049 + }, + { + "epoch": 0.3876147847353606, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 6.8691519280854406e-06, + "logits/chosen": 1482780535.4666667, + "logits/rejected": 1877646637.1764705, + "logps/chosen": -258.8557454427083, + "logps/rejected": -474.5862821691176, + "loss": 0.1284, + "rewards/chosen": 1.8471700032552083, + "rewards/margins": 9.155250339882047, + "rewards/rejected": -7.308080336626838, + "step": 1050 + }, + { + "epoch": 0.3879839416732038, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 6.863692002002529e-06, + "logits/chosen": 1874244494.2222223, + "logits/rejected": 1743058505.142857, + "logps/chosen": -264.25013563368054, + "logps/rejected": -465.16074916294644, + "loss": 0.1696, + "rewards/chosen": 1.35261779361301, + "rewards/margins": 8.799388597881983, + "rewards/rejected": -7.446770804268973, + "step": 1051 + }, + { + "epoch": 0.388353098611047, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.858229493798026e-06, + "logits/chosen": 2008383078.4, + "logits/rejected": 1645460901.6470587, + "logps/chosen": -297.67291666666665, + "logps/rejected": -458.19264131433823, + "loss": 0.1667, + "rewards/chosen": 1.3095297495524088, + "rewards/margins": 8.976888147989909, + "rewards/rejected": -7.6673583984375, + "step": 1052 + }, + { + "epoch": 0.3887222555488902, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 6.85276441104017e-06, + "logits/chosen": 1269208632.8888888, + "logits/rejected": 1441612214.857143, + "logps/chosen": -200.60045030381946, + "logps/rejected": -424.145751953125, + "loss": 0.1084, + "rewards/chosen": 2.139698028564453, + "rewards/margins": 9.366568429129465, + "rewards/rejected": -7.226870400565011, + "step": 1053 + }, + { + "epoch": 0.3890914124867334, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 6.84729676130076e-06, + "logits/chosen": 1720410298.1818182, + "logits/rejected": 1806025094.0952382, + "logps/chosen": -294.91455078125, + "logps/rejected": -408.66866629464283, + "loss": 0.1352, + "rewards/chosen": 0.9500854665582831, + "rewards/margins": 8.02888330649504, + "rewards/rejected": -7.078797839936756, + "step": 1054 + }, + { + "epoch": 0.38946056942457663, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 6.841826552155158e-06, + "logits/chosen": 1740043264.0, + "logits/rejected": 1787326634.6666667, + "logps/chosen": -295.114697265625, + "logps/rejected": -522.4771321614584, + "loss": 0.1849, + "rewards/chosen": 1.6446212768554687, + "rewards/margins": 8.710415903727213, + "rewards/rejected": -7.065794626871745, + "step": 1055 + }, + { + "epoch": 0.3898297263624198, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 6.836353791182266e-06, + "logits/chosen": 1634993421.4736843, + "logits/rejected": 1509501085.5384614, + "logps/chosen": -320.8282534950658, + "logps/rejected": -481.29627403846155, + "loss": 0.1987, + "rewards/chosen": 1.2330954702276933, + "rewards/margins": 8.729897580166094, + "rewards/rejected": -7.496802109938401, + "step": 1056 + }, + { + "epoch": 0.39019888330026303, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.830878485964528e-06, + "logits/chosen": 2385185370.352941, + "logits/rejected": 2357308620.8, + "logps/chosen": -290.01447610294116, + "logps/rejected": -517.9828450520833, + "loss": 0.1509, + "rewards/chosen": 1.3951729045194738, + "rewards/margins": 9.56887082118614, + "rewards/rejected": -8.173697916666667, + "step": 1057 + }, + { + "epoch": 0.3905680402381062, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 6.8254006440879094e-06, + "logits/chosen": 1822228187.4285715, + "logits/rejected": 2376029980.4444447, + "logps/chosen": -262.7651890345982, + "logps/rejected": -467.51605902777777, + "loss": 0.1095, + "rewards/chosen": 1.7289127622331892, + "rewards/margins": 9.106157499646384, + "rewards/rejected": -7.377244737413195, + "step": 1058 + }, + { + "epoch": 0.39093719717594944, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 6.81992027314189e-06, + "logits/chosen": 1610216960.0, + "logits/rejected": 1451137433.6, + "logps/chosen": -352.8995768229167, + "logps/rejected": -535.891162109375, + "loss": 0.1011, + "rewards/chosen": 1.7764005661010742, + "rewards/margins": 9.954036521911622, + "rewards/rejected": -8.177635955810548, + "step": 1059 + }, + { + "epoch": 0.3913063541137926, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 6.814437380719453e-06, + "logits/chosen": 1706885376.0, + "logits/rejected": 1430523008.0, + "logps/chosen": -338.99603271484375, + "logps/rejected": -497.7025146484375, + "loss": 0.167, + "rewards/chosen": 1.2853343486785889, + "rewards/margins": 9.400489568710327, + "rewards/rejected": -8.115155220031738, + "step": 1060 + }, + { + "epoch": 0.39167551105163584, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 6.808951974417077e-06, + "logits/chosen": 1654338901.3333333, + "logits/rejected": 2470924288.0, + "logps/chosen": -320.02239583333335, + "logps/rejected": -551.1536649816177, + "loss": 0.1571, + "rewards/chosen": 1.0757904052734375, + "rewards/margins": 8.820557538200827, + "rewards/rejected": -7.7447671329273895, + "step": 1061 + }, + { + "epoch": 0.392044667989479, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.803464061834725e-06, + "logits/chosen": 2183733679.1578946, + "logits/rejected": 2154804775.3846154, + "logps/chosen": -259.61973170230266, + "logps/rejected": -475.3329326923077, + "loss": 0.2198, + "rewards/chosen": 0.9142705013877467, + "rewards/margins": 10.85560867371347, + "rewards/rejected": -9.941338172325722, + "step": 1062 + }, + { + "epoch": 0.39241382492732224, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 6.7979736505758264e-06, + "logits/chosen": 1948117248.0, + "logits/rejected": 2163497472.0, + "logps/chosen": -257.1861267089844, + "logps/rejected": -734.1687622070312, + "loss": 0.1596, + "rewards/chosen": 1.0611233711242676, + "rewards/margins": 12.813782215118408, + "rewards/rejected": -11.75265884399414, + "step": 1063 + }, + { + "epoch": 0.3927829818651654, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.792480748247278e-06, + "logits/chosen": 1448152832.0, + "logits/rejected": 1541000064.0, + "logps/chosen": -203.82208251953125, + "logps/rejected": -502.36737060546875, + "loss": 0.188, + "rewards/chosen": 0.7844272255897522, + "rewards/margins": 7.859462797641754, + "rewards/rejected": -7.075035572052002, + "step": 1064 + }, + { + "epoch": 0.39315213880300864, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.786985362459427e-06, + "logits/chosen": 1916399856.9411764, + "logits/rejected": 1841463022.9333334, + "logps/chosen": -308.50281479779414, + "logps/rejected": -474.8029296875, + "loss": 0.2352, + "rewards/chosen": 0.532837475047392, + "rewards/margins": 7.137090099559111, + "rewards/rejected": -6.604252624511719, + "step": 1065 + }, + { + "epoch": 0.3935212957408518, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 6.78148750082606e-06, + "logits/chosen": 2029342720.0, + "logits/rejected": 1566383274.6666667, + "logps/chosen": -267.5640380859375, + "logps/rejected": -443.1219482421875, + "loss": 0.1779, + "rewards/chosen": 1.6598033905029297, + "rewards/margins": 8.844360987345379, + "rewards/rejected": -7.184557596842448, + "step": 1066 + }, + { + "epoch": 0.39389045267869505, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.7759871709643934e-06, + "logits/chosen": 2633280170.6666665, + "logits/rejected": 1716884684.8, + "logps/chosen": -270.5099690755208, + "logps/rejected": -476.720458984375, + "loss": 0.1225, + "rewards/chosen": 1.2277071475982666, + "rewards/margins": 8.100663995742797, + "rewards/rejected": -6.872956848144531, + "step": 1067 + }, + { + "epoch": 0.3942596096165382, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.770484380495064e-06, + "logits/chosen": 1452035449.2631578, + "logits/rejected": 1376616132.9230769, + "logps/chosen": -297.52467105263156, + "logps/rejected": -344.2648737980769, + "loss": 0.1763, + "rewards/chosen": 1.3346168116519326, + "rewards/margins": 7.819943725338832, + "rewards/rejected": -6.485326913686899, + "step": 1068 + }, + { + "epoch": 0.39462876655438145, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 6.76497913704212e-06, + "logits/chosen": 2628989513.142857, + "logits/rejected": 1922882673.7777777, + "logps/chosen": -251.04506138392858, + "logps/rejected": -419.33685980902777, + "loss": 0.1447, + "rewards/chosen": 1.1268036024911063, + "rewards/margins": 8.774278103359162, + "rewards/rejected": -7.647474500868055, + "step": 1069 + }, + { + "epoch": 0.3949979234922246, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.759471448233008e-06, + "logits/chosen": 2149958354.8235292, + "logits/rejected": 2147723400.5333333, + "logps/chosen": -309.3968864889706, + "logps/rejected": -390.9126953125, + "loss": 0.1881, + "rewards/chosen": 1.1064137851490694, + "rewards/margins": 6.825656135409486, + "rewards/rejected": -5.719242350260417, + "step": 1070 + }, + { + "epoch": 0.39536708043006785, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 6.7539613216985555e-06, + "logits/chosen": 1769832334.2222223, + "logits/rejected": 2350473216.0, + "logps/chosen": -332.0930989583333, + "logps/rejected": -547.8415178571429, + "loss": 0.1829, + "rewards/chosen": 1.2105256186591253, + "rewards/margins": 7.974713567703489, + "rewards/rejected": -6.764187949044364, + "step": 1071 + }, + { + "epoch": 0.395736237367911, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 6.748448765072977e-06, + "logits/chosen": 2189282304.0, + "logits/rejected": 1654270208.0, + "logps/chosen": -322.883544921875, + "logps/rejected": -451.36541748046875, + "loss": 0.1766, + "rewards/chosen": 1.4144606590270996, + "rewards/margins": 8.264976024627686, + "rewards/rejected": -6.850515365600586, + "step": 1072 + }, + { + "epoch": 0.39610539430575425, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.742933785993847e-06, + "logits/chosen": 1477750784.0, + "logits/rejected": 1608406471.1111112, + "logps/chosen": -354.78745814732144, + "logps/rejected": -488.7117513020833, + "loss": 0.153, + "rewards/chosen": 0.7984057835170201, + "rewards/margins": 8.426434653145927, + "rewards/rejected": -7.628028869628906, + "step": 1073 + }, + { + "epoch": 0.39647455124359743, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.737416392102101e-06, + "logits/chosen": 1754761728.0, + "logits/rejected": 3110345472.0, + "logps/chosen": -344.156005859375, + "logps/rejected": -457.4859924316406, + "loss": 0.1588, + "rewards/chosen": 1.0564942359924316, + "rewards/margins": 7.976099491119385, + "rewards/rejected": -6.919605255126953, + "step": 1074 + }, + { + "epoch": 0.39684370818144066, + "grad_norm": 11.5, + "kl": 0.10857439041137695, + "learning_rate": 6.731896591042016e-06, + "logits/chosen": 2848242892.8, + "logits/rejected": 1513277098.6666667, + "logps/chosen": -259.1068603515625, + "logps/rejected": -470.3874104817708, + "loss": 0.1405, + "rewards/chosen": 1.8974687576293945, + "rewards/margins": 10.167544873555501, + "rewards/rejected": -8.270076115926107, + "step": 1075 + }, + { + "epoch": 0.39721286511928383, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.72637439046121e-06, + "logits/chosen": 2200587008.0, + "logits/rejected": 2018576384.0, + "logps/chosen": -231.5769805908203, + "logps/rejected": -546.85107421875, + "loss": 0.1844, + "rewards/chosen": 0.7452334761619568, + "rewards/margins": 8.770974099636078, + "rewards/rejected": -8.025740623474121, + "step": 1076 + }, + { + "epoch": 0.39758202205712706, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 6.720849798010618e-06, + "logits/chosen": 1728916736.0, + "logits/rejected": 1202708736.0, + "logps/chosen": -316.3252868652344, + "logps/rejected": -330.36627197265625, + "loss": 0.1605, + "rewards/chosen": 1.364596962928772, + "rewards/margins": 8.332844853401184, + "rewards/rejected": -6.968247890472412, + "step": 1077 + }, + { + "epoch": 0.39795117899497023, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 6.715322821344495e-06, + "logits/chosen": 1815720960.0, + "logits/rejected": 1506452593.7777777, + "logps/chosen": -261.01661900111606, + "logps/rejected": -412.3838161892361, + "loss": 0.1259, + "rewards/chosen": 1.493248394557408, + "rewards/margins": 7.944241402641175, + "rewards/rejected": -6.450993008083767, + "step": 1078 + }, + { + "epoch": 0.39832033593281346, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 6.709793468120395e-06, + "logits/chosen": 2138135130.3529413, + "logits/rejected": 1652556868.2666667, + "logps/chosen": -280.15062040441177, + "logps/rejected": -414.19469401041664, + "loss": 0.1463, + "rewards/chosen": 1.7847302380730123, + "rewards/margins": 9.132459730260512, + "rewards/rejected": -7.3477294921875, + "step": 1079 + }, + { + "epoch": 0.39868949287065664, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 6.704261745999168e-06, + "logits/chosen": 1866493952.0, + "logits/rejected": 2490449510.4, + "logps/chosen": -253.5846280184659, + "logps/rejected": -545.456640625, + "loss": 0.1757, + "rewards/chosen": 1.9148472872647373, + "rewards/margins": 9.670248898592863, + "rewards/rejected": -7.755401611328125, + "step": 1080 + }, + { + "epoch": 0.39905864980849987, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.698727662644944e-06, + "logits/chosen": 1572910518.857143, + "logits/rejected": 2027579619.5555556, + "logps/chosen": -346.2440708705357, + "logps/rejected": -478.9582248263889, + "loss": 0.1502, + "rewards/chosen": 0.9923221043178013, + "rewards/margins": 9.176119304838636, + "rewards/rejected": -8.183797200520834, + "step": 1081 + }, + { + "epoch": 0.39942780674634304, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 6.693191225725125e-06, + "logits/chosen": 2154774869.3333335, + "logits/rejected": 1602721484.8, + "logps/chosen": -318.2677815755208, + "logps/rejected": -487.480615234375, + "loss": 0.1724, + "rewards/chosen": 0.2615639567375183, + "rewards/margins": 7.575173270702362, + "rewards/rejected": -7.313609313964844, + "step": 1082 + }, + { + "epoch": 0.3997969636841862, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.687652442910375e-06, + "logits/chosen": 2295648665.6, + "logits/rejected": 2079275861.3333333, + "logps/chosen": -308.50146484375, + "logps/rejected": -525.5748291015625, + "loss": 0.1521, + "rewards/chosen": 1.5182968139648438, + "rewards/margins": 8.581021245320638, + "rewards/rejected": -7.062724431355794, + "step": 1083 + }, + { + "epoch": 0.40016612062202944, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.682111321874608e-06, + "logits/chosen": 1779618579.6923077, + "logits/rejected": 2139230854.7368422, + "logps/chosen": -297.64881310096155, + "logps/rejected": -444.95101768092104, + "loss": 0.1467, + "rewards/chosen": 1.1541243333082933, + "rewards/margins": 8.995461838448096, + "rewards/rejected": -7.841337505139802, + "step": 1084 + }, + { + "epoch": 0.4005352775598726, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.6765678702949744e-06, + "logits/chosen": 1722460882.8235295, + "logits/rejected": 2046736520.5333333, + "logps/chosen": -292.67155905330884, + "logps/rejected": -606.8057942708333, + "loss": 0.1613, + "rewards/chosen": 1.4810222176944507, + "rewards/margins": 8.706855856203566, + "rewards/rejected": -7.225833638509115, + "step": 1085 + }, + { + "epoch": 0.40090443449771584, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.671022095851857e-06, + "logits/chosen": 1377177088.0, + "logits/rejected": 1273354880.0, + "logps/chosen": -238.42919921875, + "logps/rejected": -338.3040771484375, + "loss": 0.1703, + "rewards/chosen": 1.0728797912597656, + "rewards/margins": 8.594537734985352, + "rewards/rejected": -7.521657943725586, + "step": 1086 + }, + { + "epoch": 0.401273591435559, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 6.6654740062288555e-06, + "logits/chosen": 1826151243.2941177, + "logits/rejected": 1477414638.9333334, + "logps/chosen": -319.5673828125, + "logps/rejected": -465.1244791666667, + "loss": 0.1747, + "rewards/chosen": 1.2964708664838005, + "rewards/margins": 8.54504264382755, + "rewards/rejected": -7.24857177734375, + "step": 1087 + }, + { + "epoch": 0.40164274837340225, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.65992360911278e-06, + "logits/chosen": 1716000904.5333333, + "logits/rejected": 2282586593.882353, + "logps/chosen": -318.74274088541665, + "logps/rejected": -422.5145048253676, + "loss": 0.2004, + "rewards/chosen": 0.6620027542114257, + "rewards/margins": 7.636162443721996, + "rewards/rejected": -6.97415968951057, + "step": 1088 + }, + { + "epoch": 0.4020119053112454, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.654370912193633e-06, + "logits/chosen": 1798950619.4285715, + "logits/rejected": 1833331598.2222223, + "logps/chosen": -239.75355747767858, + "logps/rejected": -413.25634765625, + "loss": 0.1621, + "rewards/chosen": 1.013793672834124, + "rewards/margins": 8.6400084419856, + "rewards/rejected": -7.626214769151476, + "step": 1089 + }, + { + "epoch": 0.40238106224908865, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.648815923164604e-06, + "logits/chosen": 1368820940.8, + "logits/rejected": 1533021485.1764705, + "logps/chosen": -258.4552734375, + "logps/rejected": -451.1934168198529, + "loss": 0.1343, + "rewards/chosen": 1.4674625396728516, + "rewards/margins": 9.484920389512006, + "rewards/rejected": -8.017457849839154, + "step": 1090 + }, + { + "epoch": 0.4027502191869318, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 6.6432586497220615e-06, + "logits/chosen": 2034860714.6666667, + "logits/rejected": 1664781165.7142856, + "logps/chosen": -329.9111328125, + "logps/rejected": -445.34852818080356, + "loss": 0.1826, + "rewards/chosen": 0.9422001308865018, + "rewards/margins": 8.085133022732204, + "rewards/rejected": -7.142932891845703, + "step": 1091 + }, + { + "epoch": 0.40311937612477505, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.637699099565538e-06, + "logits/chosen": 1102140928.0, + "logits/rejected": 1957234176.0, + "logps/chosen": -234.69979858398438, + "logps/rejected": -398.2297668457031, + "loss": 0.1144, + "rewards/chosen": 2.1419057846069336, + "rewards/margins": 8.355353355407715, + "rewards/rejected": -6.213447570800781, + "step": 1092 + }, + { + "epoch": 0.4034885330626182, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 6.632137280397719e-06, + "logits/chosen": 1345742336.0, + "logits/rejected": 1185961472.0, + "logps/chosen": -221.3385467529297, + "logps/rejected": -418.79638671875, + "loss": 0.1473, + "rewards/chosen": 1.4392707347869873, + "rewards/margins": 9.306103944778442, + "rewards/rejected": -7.866833209991455, + "step": 1093 + }, + { + "epoch": 0.40385769000046146, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 6.626573199924433e-06, + "logits/chosen": 1883031688.5333333, + "logits/rejected": 1826290989.1764705, + "logps/chosen": -256.943408203125, + "logps/rejected": -432.3770392922794, + "loss": 0.2093, + "rewards/chosen": 0.6236879348754882, + "rewards/margins": 8.390810988931095, + "rewards/rejected": -7.767123054055607, + "step": 1094 + }, + { + "epoch": 0.40422684693830463, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 6.621006865854645e-06, + "logits/chosen": 1363621614.9333334, + "logits/rejected": 2099916318.1176472, + "logps/chosen": -264.11180013020834, + "logps/rejected": -482.5666934742647, + "loss": 0.1792, + "rewards/chosen": 1.009035873413086, + "rewards/margins": 9.644103083891029, + "rewards/rejected": -8.635067210477942, + "step": 1095 + }, + { + "epoch": 0.40459600387614786, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 6.6154382859004385e-06, + "logits/chosen": 1671596544.0, + "logits/rejected": 1756755763.2, + "logps/chosen": -300.86431884765625, + "logps/rejected": -456.12861328125, + "loss": 0.0506, + "rewards/chosen": 2.6266072591145835, + "rewards/margins": 11.428383382161458, + "rewards/rejected": -8.801776123046874, + "step": 1096 + }, + { + "epoch": 0.40496516081399103, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.609867467777011e-06, + "logits/chosen": 1575780096.0, + "logits/rejected": 1986036608.0, + "logps/chosen": -247.15362548828125, + "logps/rejected": -409.39166259765625, + "loss": 0.1368, + "rewards/chosen": 1.9149789810180664, + "rewards/margins": 8.450450420379639, + "rewards/rejected": -6.535471439361572, + "step": 1097 + }, + { + "epoch": 0.40533431775183426, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.60429441920266e-06, + "logits/chosen": 1234863581.8666666, + "logits/rejected": 1268639141.6470587, + "logps/chosen": -260.67286783854166, + "logps/rejected": -417.61669921875, + "loss": 0.1488, + "rewards/chosen": 1.2188986460367839, + "rewards/margins": 8.177619582531499, + "rewards/rejected": -6.958720936494715, + "step": 1098 + }, + { + "epoch": 0.40570347468967743, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 6.598719147898773e-06, + "logits/chosen": 2582913570.133333, + "logits/rejected": 2258682578.8235292, + "logps/chosen": -320.8715494791667, + "logps/rejected": -518.5440027573529, + "loss": 0.1175, + "rewards/chosen": 1.4747879028320312, + "rewards/margins": 9.46514793844784, + "rewards/rejected": -7.990360035615809, + "step": 1099 + }, + { + "epoch": 0.40607263162752066, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.593141661589819e-06, + "logits/chosen": 1318332416.0, + "logits/rejected": 1603574272.0, + "logps/chosen": -260.5073547363281, + "logps/rejected": -502.38873291015625, + "loss": 0.155, + "rewards/chosen": 1.172070026397705, + "rewards/margins": 9.341768741607666, + "rewards/rejected": -8.169698715209961, + "step": 1100 + }, + { + "epoch": 0.40644178856536384, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 6.5875619680033334e-06, + "logits/chosen": 1675709312.0, + "logits/rejected": 1772520960.0, + "logps/chosen": -321.2685852050781, + "logps/rejected": -467.0329895019531, + "loss": 0.1751, + "rewards/chosen": 0.9003105163574219, + "rewards/margins": 8.461559295654297, + "rewards/rejected": -7.561248779296875, + "step": 1101 + }, + { + "epoch": 0.40681094550320707, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 6.581980074869911e-06, + "logits/chosen": 1714092544.0, + "logits/rejected": 1687138676.3636363, + "logps/chosen": -249.962890625, + "logps/rejected": -459.38325639204544, + "loss": 0.0833, + "rewards/chosen": 1.6970787048339844, + "rewards/margins": 10.40854887528853, + "rewards/rejected": -8.711470170454545, + "step": 1102 + }, + { + "epoch": 0.40718010244105024, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 6.576395989923193e-06, + "logits/chosen": 2520461702.095238, + "logits/rejected": 3754214865.4545455, + "logps/chosen": -299.0514322916667, + "logps/rejected": -549.5311168323864, + "loss": 0.1738, + "rewards/chosen": 1.3261250995454335, + "rewards/margins": 9.291498745674694, + "rewards/rejected": -7.965373646129262, + "step": 1103 + }, + { + "epoch": 0.40754925937889347, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.57080972089986e-06, + "logits/chosen": 1685218560.0, + "logits/rejected": 2441829376.0, + "logps/chosen": -269.3581237792969, + "logps/rejected": -613.5254516601562, + "loss": 0.1814, + "rewards/chosen": 0.9856081008911133, + "rewards/margins": 9.273841857910156, + "rewards/rejected": -8.288233757019043, + "step": 1104 + }, + { + "epoch": 0.40791841631673664, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.565221275539615e-06, + "logits/chosen": 1720788352.0, + "logits/rejected": 3060720128.0, + "logps/chosen": -287.57244873046875, + "logps/rejected": -524.9459228515625, + "loss": 0.1628, + "rewards/chosen": 1.0667322874069214, + "rewards/margins": 9.874749064445496, + "rewards/rejected": -8.808016777038574, + "step": 1105 + }, + { + "epoch": 0.40828757325457987, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.559630661585179e-06, + "logits/chosen": 2250348544.0, + "logits/rejected": 2399702016.0, + "logps/chosen": -246.15711975097656, + "logps/rejected": -493.6845703125, + "loss": 0.1979, + "rewards/chosen": 0.7520021200180054, + "rewards/margins": 9.300316214561462, + "rewards/rejected": -8.548314094543457, + "step": 1106 + }, + { + "epoch": 0.40865673019242305, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 6.554037886782276e-06, + "logits/chosen": 1768274688.0, + "logits/rejected": 1983123968.0, + "logps/chosen": -282.6739196777344, + "logps/rejected": -639.4232177734375, + "loss": 0.168, + "rewards/chosen": 1.048548936843872, + "rewards/margins": 11.228220224380493, + "rewards/rejected": -10.179671287536621, + "step": 1107 + }, + { + "epoch": 0.4090258871302663, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 6.548442958879624e-06, + "logits/chosen": 1799554779.4285715, + "logits/rejected": 1885458059.6363637, + "logps/chosen": -349.1602492559524, + "logps/rejected": -376.32967862215907, + "loss": 0.1789, + "rewards/chosen": 1.561465127127511, + "rewards/margins": 8.300678550422965, + "rewards/rejected": -6.739213423295454, + "step": 1108 + }, + { + "epoch": 0.40939504406810945, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 6.542845885628926e-06, + "logits/chosen": 2072133427.2, + "logits/rejected": 1951218005.3333333, + "logps/chosen": -334.244482421875, + "logps/rejected": -544.6512858072916, + "loss": 0.2149, + "rewards/chosen": 0.9359611511230469, + "rewards/margins": 7.532897694905599, + "rewards/rejected": -6.596936543782552, + "step": 1109 + }, + { + "epoch": 0.4097642010059527, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.537246674784855e-06, + "logits/chosen": 1867153839.1578948, + "logits/rejected": 2346710567.3846154, + "logps/chosen": -256.1425010279605, + "logps/rejected": -416.83435997596155, + "loss": 0.1947, + "rewards/chosen": 1.641961750231291, + "rewards/margins": 10.085263611334055, + "rewards/rejected": -8.443301861102764, + "step": 1110 + }, + { + "epoch": 0.41013335794379585, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 6.531645334105045e-06, + "logits/chosen": 1257903182.7692308, + "logits/rejected": 1269046864.8421052, + "logps/chosen": -291.4960186298077, + "logps/rejected": -465.91976768092104, + "loss": 0.118, + "rewards/chosen": 1.589037381685697, + "rewards/margins": 8.692923233094003, + "rewards/rejected": -7.103885851408306, + "step": 1111 + }, + { + "epoch": 0.4105025148816391, + "grad_norm": 12.25, + "kl": 0.1269521713256836, + "learning_rate": 6.526041871350086e-06, + "logits/chosen": 2045406617.6, + "logits/rejected": 2009409706.6666667, + "logps/chosen": -269.8741943359375, + "logps/rejected": -413.7782389322917, + "loss": 0.1711, + "rewards/chosen": 1.3508588790893554, + "rewards/margins": 8.308771959940593, + "rewards/rejected": -6.957913080851237, + "step": 1112 + }, + { + "epoch": 0.41087167181948225, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 6.520436294283503e-06, + "logits/chosen": 1492258816.0, + "logits/rejected": 1909271371.2941177, + "logps/chosen": -370.37067057291665, + "logps/rejected": -499.5703699448529, + "loss": 0.1596, + "rewards/chosen": 1.0874961853027343, + "rewards/margins": 9.080233001708985, + "rewards/rejected": -7.99273681640625, + "step": 1113 + }, + { + "epoch": 0.4112408287573255, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 6.514828610671751e-06, + "logits/chosen": 1138387171.5555556, + "logits/rejected": 1366036480.0, + "logps/chosen": -258.1832682291667, + "logps/rejected": -452.3655482700893, + "loss": 0.1647, + "rewards/chosen": 1.5192832946777344, + "rewards/margins": 8.75993183680943, + "rewards/rejected": -7.240648542131696, + "step": 1114 + }, + { + "epoch": 0.41160998569516866, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 6.509218828284203e-06, + "logits/chosen": 1768484224.0, + "logits/rejected": 1837243520.0, + "logps/chosen": -242.58375549316406, + "logps/rejected": -434.974365234375, + "loss": 0.2303, + "rewards/chosen": 0.6882071495056152, + "rewards/margins": 8.065849781036377, + "rewards/rejected": -7.377642631530762, + "step": 1115 + }, + { + "epoch": 0.4119791426330119, + "grad_norm": 12.875, + "kl": 1.3077964782714844, + "learning_rate": 6.503606954893143e-06, + "logits/chosen": 2068589146.3529413, + "logits/rejected": 1327237529.6, + "logps/chosen": -286.3373448988971, + "logps/rejected": -469.93639322916664, + "loss": 0.1656, + "rewards/chosen": 1.3158083523021025, + "rewards/margins": 9.355337262621113, + "rewards/rejected": -8.03952891031901, + "step": 1116 + }, + { + "epoch": 0.41234829957085506, + "grad_norm": 14.625, + "kl": 0.0, + "learning_rate": 6.497992998273751e-06, + "logits/chosen": 1952715008.0, + "logits/rejected": 2775755776.0, + "logps/chosen": -292.9701232910156, + "logps/rejected": -535.5264892578125, + "loss": 0.204, + "rewards/chosen": 0.6841689944267273, + "rewards/margins": 9.049020946025848, + "rewards/rejected": -8.364851951599121, + "step": 1117 + }, + { + "epoch": 0.41271745650869823, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.492376966204092e-06, + "logits/chosen": 2344901578.105263, + "logits/rejected": 1948000886.1538463, + "logps/chosen": -270.2896278782895, + "logps/rejected": -490.9521484375, + "loss": 0.1722, + "rewards/chosen": 1.1184669293855365, + "rewards/margins": 10.533515381909575, + "rewards/rejected": -9.415048452524038, + "step": 1118 + }, + { + "epoch": 0.41308661344654146, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 6.486758866465106e-06, + "logits/chosen": 2102126405.8181818, + "logits/rejected": 1895719731.2, + "logps/chosen": -262.9108220880682, + "logps/rejected": -314.0551025390625, + "loss": 0.2529, + "rewards/chosen": 0.9838359139182351, + "rewards/margins": 8.212189032814718, + "rewards/rejected": -7.228353118896484, + "step": 1119 + }, + { + "epoch": 0.41345577038438464, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 6.4811387068406e-06, + "logits/chosen": 2003318311.3846154, + "logits/rejected": 1924786714.9473684, + "logps/chosen": -337.1915940504808, + "logps/rejected": -437.0299650493421, + "loss": 0.0897, + "rewards/chosen": 2.399751369769757, + "rewards/margins": 9.86688233194081, + "rewards/rejected": -7.467130962171052, + "step": 1120 + }, + { + "epoch": 0.41382492732222786, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 6.475516495117233e-06, + "logits/chosen": 1753316944.8421052, + "logits/rejected": 1924960413.5384614, + "logps/chosen": -261.00318667763156, + "logps/rejected": -538.9020057091346, + "loss": 0.1668, + "rewards/chosen": 1.6847694798519737, + "rewards/margins": 10.288064439287071, + "rewards/rejected": -8.603294959435097, + "step": 1121 + }, + { + "epoch": 0.41419408426007104, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 6.4698922390845085e-06, + "logits/chosen": 1683193719.4666667, + "logits/rejected": 1687166976.0, + "logps/chosen": -337.6322265625, + "logps/rejected": -532.2019186580883, + "loss": 0.1827, + "rewards/chosen": 1.0114428202311199, + "rewards/margins": 9.718021692014208, + "rewards/rejected": -8.706578871783089, + "step": 1122 + }, + { + "epoch": 0.41456324119791427, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.464265946534762e-06, + "logits/chosen": 1628887040.0, + "logits/rejected": 1815603785.142857, + "logps/chosen": -242.24549696180554, + "logps/rejected": -485.71944754464283, + "loss": 0.1966, + "rewards/chosen": 1.2598820792304144, + "rewards/margins": 8.970887698824443, + "rewards/rejected": -7.711005619594029, + "step": 1123 + }, + { + "epoch": 0.41493239813575744, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 6.4586376252631485e-06, + "logits/chosen": 2302734336.0, + "logits/rejected": 1491904835.368421, + "logps/chosen": -267.0373722956731, + "logps/rejected": -527.2330900493421, + "loss": 0.1008, + "rewards/chosen": 1.6559059436504657, + "rewards/margins": 10.085405519616748, + "rewards/rejected": -8.429499575966283, + "step": 1124 + }, + { + "epoch": 0.41530155507360067, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 6.453007283067638e-06, + "logits/chosen": 1544921646.5454545, + "logits/rejected": 1785599317.3333333, + "logps/chosen": -323.65189985795456, + "logps/rejected": -469.6827101934524, + "loss": 0.1003, + "rewards/chosen": 1.6645911823619495, + "rewards/margins": 9.30596095659, + "rewards/rejected": -7.641369774228051, + "step": 1125 + }, + { + "epoch": 0.41567071201144384, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.447374927748997e-06, + "logits/chosen": 1922283648.0, + "logits/rejected": 2328636672.0, + "logps/chosen": -270.9356689453125, + "logps/rejected": -632.645263671875, + "loss": 0.1432, + "rewards/chosen": 1.3607394695281982, + "rewards/margins": 9.40466856956482, + "rewards/rejected": -8.043929100036621, + "step": 1126 + }, + { + "epoch": 0.4160398689492871, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 6.4417405671107826e-06, + "logits/chosen": 1603148686.2222223, + "logits/rejected": 1720604672.0, + "logps/chosen": -242.55327690972223, + "logps/rejected": -405.3080357142857, + "loss": 0.2098, + "rewards/chosen": 0.7402637269761827, + "rewards/margins": 7.627523619031149, + "rewards/rejected": -6.887259892054966, + "step": 1127 + }, + { + "epoch": 0.41640902588713025, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 6.4361042089593285e-06, + "logits/chosen": 1081351533.7142856, + "logits/rejected": 1631940608.0, + "logps/chosen": -161.29227120535714, + "logps/rejected": -487.464375, + "loss": 0.0474, + "rewards/chosen": 2.1523145948137556, + "rewards/margins": 9.27088759286063, + "rewards/rejected": -7.118572998046875, + "step": 1128 + }, + { + "epoch": 0.4167781828249735, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 6.43046586110374e-06, + "logits/chosen": 1291870663.1111112, + "logits/rejected": 1146556342.857143, + "logps/chosen": -195.44893391927084, + "logps/rejected": -457.042236328125, + "loss": 0.124, + "rewards/chosen": 1.9835215674506292, + "rewards/margins": 9.076082471817259, + "rewards/rejected": -7.092560904366629, + "step": 1129 + }, + { + "epoch": 0.41714733976281665, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 6.4248255313558735e-06, + "logits/chosen": 2056995498.6666667, + "logits/rejected": 1691546770.2857144, + "logps/chosen": -234.71310763888889, + "logps/rejected": -717.1671316964286, + "loss": 0.1393, + "rewards/chosen": 1.9324864281548395, + "rewards/margins": 12.466210531809974, + "rewards/rejected": -10.533724103655134, + "step": 1130 + }, + { + "epoch": 0.4175164967006599, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 6.419183227530336e-06, + "logits/chosen": 1991164550.7368422, + "logits/rejected": 2730601865.8461537, + "logps/chosen": -287.17655222039474, + "logps/rejected": -388.02700570913464, + "loss": 0.2119, + "rewards/chosen": 0.8276575991981908, + "rewards/margins": 8.972054454479139, + "rewards/rejected": -8.144396855280949, + "step": 1131 + }, + { + "epoch": 0.41788565363850305, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 6.413538957444468e-06, + "logits/chosen": 1639600600.6153846, + "logits/rejected": 1920802600.4210527, + "logps/chosen": -233.87603290264423, + "logps/rejected": -434.2275904605263, + "loss": 0.0978, + "rewards/chosen": 1.6472176771897535, + "rewards/margins": 8.774054353536382, + "rewards/rejected": -7.126836676346628, + "step": 1132 + }, + { + "epoch": 0.4182548105763463, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.407892728918333e-06, + "logits/chosen": 1551009069.1764705, + "logits/rejected": 1465116672.0, + "logps/chosen": -375.33645450367646, + "logps/rejected": -479.93095703125, + "loss": 0.203, + "rewards/chosen": 1.5076521705178654, + "rewards/margins": 8.756058135687136, + "rewards/rejected": -7.248405965169271, + "step": 1133 + }, + { + "epoch": 0.41862396751418945, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.402244549774707e-06, + "logits/chosen": 2049305429.3333333, + "logits/rejected": 1487737856.0, + "logps/chosen": -364.9890543619792, + "logps/rejected": -395.2612548828125, + "loss": 0.1162, + "rewards/chosen": 1.6487247149149578, + "rewards/margins": 8.443416182200114, + "rewards/rejected": -6.794691467285157, + "step": 1134 + }, + { + "epoch": 0.4189931244520327, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 6.396594427839076e-06, + "logits/chosen": 2071104398.2222223, + "logits/rejected": 2137403830.857143, + "logps/chosen": -371.79161241319446, + "logps/rejected": -409.3422154017857, + "loss": 0.1588, + "rewards/chosen": 1.3216544257269964, + "rewards/margins": 8.581192803761315, + "rewards/rejected": -7.2595383780343195, + "step": 1135 + }, + { + "epoch": 0.41936228138987586, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 6.3909423709396054e-06, + "logits/chosen": 1363735732.7058823, + "logits/rejected": 1484259874.1333334, + "logps/chosen": -237.30710018382354, + "logps/rejected": -466.32610677083335, + "loss": 0.176, + "rewards/chosen": 1.174126568962546, + "rewards/margins": 9.750303530225567, + "rewards/rejected": -8.576176961263021, + "step": 1136 + }, + { + "epoch": 0.4197314383277191, + "grad_norm": 14.0625, + "kl": 0.2199411392211914, + "learning_rate": 6.385288386907155e-06, + "logits/chosen": 2198811828.7058825, + "logits/rejected": 1326079590.4, + "logps/chosen": -334.3848230698529, + "logps/rejected": -417.2638346354167, + "loss": 0.1986, + "rewards/chosen": 0.8786048889160156, + "rewards/margins": 7.615520985921224, + "rewards/rejected": -6.736916097005208, + "step": 1137 + }, + { + "epoch": 0.42010059526556226, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 6.379632483575242e-06, + "logits/chosen": 1504016384.0, + "logits/rejected": 1379714944.0, + "logps/chosen": -230.348388671875, + "logps/rejected": -453.11279296875, + "loss": 0.1131, + "rewards/chosen": 2.285276174545288, + "rewards/margins": 9.45160698890686, + "rewards/rejected": -7.166330814361572, + "step": 1138 + }, + { + "epoch": 0.4204697522034055, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 6.373974668780053e-06, + "logits/chosen": 2059143031.4666667, + "logits/rejected": 1624866816.0, + "logps/chosen": -245.61163736979168, + "logps/rejected": -554.9939682904412, + "loss": 0.1344, + "rewards/chosen": 1.384009552001953, + "rewards/margins": 9.36359598496381, + "rewards/rejected": -7.979586432961857, + "step": 1139 + }, + { + "epoch": 0.42083890914124866, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 6.368314950360416e-06, + "logits/chosen": 1493679360.0, + "logits/rejected": 1566018048.0, + "logps/chosen": -250.75241088867188, + "logps/rejected": -399.3016662597656, + "loss": 0.1291, + "rewards/chosen": 1.5947717428207397, + "rewards/margins": 7.962820410728455, + "rewards/rejected": -6.368048667907715, + "step": 1140 + }, + { + "epoch": 0.4212080660790919, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 6.362653336157798e-06, + "logits/chosen": 1781087310.7692308, + "logits/rejected": 2392368181.894737, + "logps/chosen": -284.09262319711536, + "logps/rejected": -580.8717105263158, + "loss": 0.1431, + "rewards/chosen": 1.3495875138502855, + "rewards/margins": 8.396975691019282, + "rewards/rejected": -7.047388177168997, + "step": 1141 + }, + { + "epoch": 0.42157722301693507, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 6.356989834016296e-06, + "logits/chosen": 1847291041.6842105, + "logits/rejected": 1232314131.6923077, + "logps/chosen": -417.98812705592104, + "logps/rejected": -385.4050856370192, + "loss": 0.1951, + "rewards/chosen": 1.281013287995991, + "rewards/margins": 9.887054242585835, + "rewards/rejected": -8.606040954589844, + "step": 1142 + }, + { + "epoch": 0.4219463799547783, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 6.35132445178262e-06, + "logits/chosen": 2133992684.3076923, + "logits/rejected": 1827002044.631579, + "logps/chosen": -349.7681415264423, + "logps/rejected": -473.3071546052632, + "loss": 0.1165, + "rewards/chosen": 1.5746553861177885, + "rewards/margins": 8.605774481769515, + "rewards/rejected": -7.031119095651727, + "step": 1143 + }, + { + "epoch": 0.42231553689262147, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 6.3456571973060835e-06, + "logits/chosen": 1694877696.0, + "logits/rejected": 1631411053.7142856, + "logps/chosen": -258.93755425347223, + "logps/rejected": -392.53634207589283, + "loss": 0.1326, + "rewards/chosen": 1.6761817932128906, + "rewards/margins": 9.116954258510045, + "rewards/rejected": -7.440772465297154, + "step": 1144 + }, + { + "epoch": 0.4226846938304647, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.339988078438597e-06, + "logits/chosen": 1762287988.3636363, + "logits/rejected": 1539779925.3333333, + "logps/chosen": -252.0506258877841, + "logps/rejected": -419.8271019345238, + "loss": 0.1235, + "rewards/chosen": 1.0255888158624822, + "rewards/margins": 7.33895493379403, + "rewards/rejected": -6.3133661179315474, + "step": 1145 + }, + { + "epoch": 0.42305385076830787, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 6.3343171030346525e-06, + "logits/chosen": 2056606900.7058823, + "logits/rejected": 1524163515.7333333, + "logps/chosen": -380.4685489430147, + "logps/rejected": -541.4267252604167, + "loss": 0.1909, + "rewards/chosen": 0.7522559446447036, + "rewards/margins": 8.904037662580901, + "rewards/rejected": -8.151781717936197, + "step": 1146 + }, + { + "epoch": 0.4234230077061511, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 6.3286442789513135e-06, + "logits/chosen": 1846317933.7142856, + "logits/rejected": 1858965048.8888888, + "logps/chosen": -327.10030691964283, + "logps/rejected": -519.5309787326389, + "loss": 0.1448, + "rewards/chosen": 1.3853645324707031, + "rewards/margins": 9.764601389567057, + "rewards/rejected": -8.379236857096354, + "step": 1147 + }, + { + "epoch": 0.4237921646439943, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.322969614048207e-06, + "logits/chosen": 1845080473.6, + "logits/rejected": 2054415540.7058823, + "logps/chosen": -233.98995768229167, + "logps/rejected": -454.52306410845586, + "loss": 0.1709, + "rewards/chosen": 0.8968348185221354, + "rewards/margins": 8.678135980344285, + "rewards/rejected": -7.78130116182215, + "step": 1148 + }, + { + "epoch": 0.4241613215818375, + "grad_norm": 14.5625, + "kl": 0.6815147399902344, + "learning_rate": 6.317293116187508e-06, + "logits/chosen": 1318118741.3333333, + "logits/rejected": 1146506752.0, + "logps/chosen": -325.28130425347223, + "logps/rejected": -378.51283482142856, + "loss": 0.2119, + "rewards/chosen": 0.845975293053521, + "rewards/margins": 7.358902757129972, + "rewards/rejected": -6.5129274640764505, + "step": 1149 + }, + { + "epoch": 0.4245304785196807, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 6.311614793233932e-06, + "logits/chosen": 2105605120.0, + "logits/rejected": 1956215193.6, + "logps/chosen": -347.881591796875, + "logps/rejected": -528.3490234375, + "loss": 0.1155, + "rewards/chosen": 1.1439013481140137, + "rewards/margins": 9.072905254364013, + "rewards/rejected": -7.92900390625, + "step": 1150 + }, + { + "epoch": 0.4248996354575239, + "grad_norm": 11.5, + "kl": 1.2215385437011719, + "learning_rate": 6.3059346530547245e-06, + "logits/chosen": 1729642837.3333333, + "logits/rejected": 1619941990.4, + "logps/chosen": -352.1038818359375, + "logps/rejected": -416.00341796875, + "loss": 0.124, + "rewards/chosen": 1.104286829630534, + "rewards/margins": 8.16099764506022, + "rewards/rejected": -7.056710815429687, + "step": 1151 + }, + { + "epoch": 0.4252687923953671, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.300252703519647e-06, + "logits/chosen": 1930884573.8666666, + "logits/rejected": 1877269443.764706, + "logps/chosen": -250.41813151041666, + "logps/rejected": -459.25539981617646, + "loss": 0.1654, + "rewards/chosen": 0.9490040461222331, + "rewards/margins": 9.570912794973337, + "rewards/rejected": -8.621908748851103, + "step": 1152 + }, + { + "epoch": 0.4256379493332103, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 6.294568952500968e-06, + "logits/chosen": 2442897863.111111, + "logits/rejected": 1969817600.0, + "logps/chosen": -364.4619954427083, + "logps/rejected": -538.2336077008929, + "loss": 0.206, + "rewards/chosen": 1.1122061411539714, + "rewards/margins": 10.133271989368257, + "rewards/rejected": -9.021065848214286, + "step": 1153 + }, + { + "epoch": 0.4260071062710535, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 6.288883407873452e-06, + "logits/chosen": 1907285248.0, + "logits/rejected": 1914947840.0, + "logps/chosen": -231.92572021484375, + "logps/rejected": -615.7651977539062, + "loss": 0.1436, + "rewards/chosen": 1.3146599531173706, + "rewards/margins": 10.787593722343445, + "rewards/rejected": -9.472933769226074, + "step": 1154 + }, + { + "epoch": 0.42637626320889666, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 6.283196077514351e-06, + "logits/chosen": 1970119296.0, + "logits/rejected": 2374567936.0, + "logps/chosen": -249.85861206054688, + "logps/rejected": -459.8196716308594, + "loss": 0.1746, + "rewards/chosen": 1.0710588693618774, + "rewards/margins": 8.319616436958313, + "rewards/rejected": -7.2485575675964355, + "step": 1155 + }, + { + "epoch": 0.4267454201467399, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 6.277506969303387e-06, + "logits/chosen": 1930484224.0, + "logits/rejected": 1529655808.0, + "logps/chosen": -278.8389587402344, + "logps/rejected": -369.96929931640625, + "loss": 0.1074, + "rewards/chosen": 2.15155029296875, + "rewards/margins": 9.867129802703857, + "rewards/rejected": -7.715579509735107, + "step": 1156 + }, + { + "epoch": 0.42711457708458306, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 6.271816091122748e-06, + "logits/chosen": 1174500273.2307692, + "logits/rejected": 1485946880.0, + "logps/chosen": -267.9584209735577, + "logps/rejected": -403.4807771381579, + "loss": 0.1057, + "rewards/chosen": 1.9342938936673677, + "rewards/margins": 8.65736454025454, + "rewards/rejected": -6.723070646587171, + "step": 1157 + }, + { + "epoch": 0.4274837340224263, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 6.266123450857071e-06, + "logits/chosen": 1549263872.0, + "logits/rejected": 2493089536.0, + "logps/chosen": -362.7276916503906, + "logps/rejected": -615.659912109375, + "loss": 0.1158, + "rewards/chosen": 1.7271019220352173, + "rewards/margins": 8.561207890510559, + "rewards/rejected": -6.834105968475342, + "step": 1158 + }, + { + "epoch": 0.42785289096026946, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.26042905639344e-06, + "logits/chosen": 2171231393.6842103, + "logits/rejected": 1991832182.1538463, + "logps/chosen": -254.29970189144737, + "logps/rejected": -634.875, + "loss": 0.1962, + "rewards/chosen": 1.0531517831902755, + "rewards/margins": 9.369530449994661, + "rewards/rejected": -8.316378666804386, + "step": 1159 + }, + { + "epoch": 0.4282220478981127, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 6.254732915621365e-06, + "logits/chosen": 1854142122.6666667, + "logits/rejected": 2540871680.0, + "logps/chosen": -186.53206380208334, + "logps/rejected": -459.5015625, + "loss": 0.1267, + "rewards/chosen": 1.4769954681396484, + "rewards/margins": 8.428338241577148, + "rewards/rejected": -6.9513427734375, + "step": 1160 + }, + { + "epoch": 0.42859120483595586, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 6.249035036432776e-06, + "logits/chosen": 1165527040.0, + "logits/rejected": 2775265039.0588236, + "logps/chosen": -279.04212239583336, + "logps/rejected": -345.0897863051471, + "loss": 0.0813, + "rewards/chosen": 2.81498285929362, + "rewards/margins": 8.918571187935623, + "rewards/rejected": -6.103588328642004, + "step": 1161 + }, + { + "epoch": 0.4289603617737991, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.243335426722014e-06, + "logits/chosen": 1389212765.090909, + "logits/rejected": 1344073728.0, + "logps/chosen": -260.4420055042614, + "logps/rejected": -401.3618396577381, + "loss": 0.1538, + "rewards/chosen": 0.5857828747142445, + "rewards/margins": 7.569769138897652, + "rewards/rejected": -6.983986264183407, + "step": 1162 + }, + { + "epoch": 0.42932951871164227, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 6.237634094385814e-06, + "logits/chosen": 1910650790.9565217, + "logits/rejected": 2024341959.1111112, + "logps/chosen": -321.93546195652175, + "logps/rejected": -334.76860894097223, + "loss": 0.2052, + "rewards/chosen": 1.425300764000934, + "rewards/margins": 7.059296244008529, + "rewards/rejected": -5.633995480007595, + "step": 1163 + }, + { + "epoch": 0.4296986756494855, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.2319310473233e-06, + "logits/chosen": 1756033843.2, + "logits/rejected": 2160959969.882353, + "logps/chosen": -270.2632161458333, + "logps/rejected": -507.60403262867646, + "loss": 0.1608, + "rewards/chosen": 1.045746421813965, + "rewards/margins": 8.742213204327753, + "rewards/rejected": -7.696466782513787, + "step": 1164 + }, + { + "epoch": 0.43006783258732867, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 6.226226293435973e-06, + "logits/chosen": 1725159581.5384614, + "logits/rejected": 1544387745.6842105, + "logps/chosen": -374.47506009615387, + "logps/rejected": -594.3879009046053, + "loss": 0.1383, + "rewards/chosen": 1.0191868268526518, + "rewards/margins": 10.440421760806188, + "rewards/rejected": -9.421234933953537, + "step": 1165 + }, + { + "epoch": 0.4304369895251719, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 6.2205198406276946e-06, + "logits/chosen": 1835970087.3846154, + "logits/rejected": 2041349820.631579, + "logps/chosen": -229.0101036658654, + "logps/rejected": -564.3002158717105, + "loss": 0.0761, + "rewards/chosen": 2.8253951439490685, + "rewards/margins": 12.381373216266091, + "rewards/rejected": -9.555978072317023, + "step": 1166 + }, + { + "epoch": 0.43080614646301507, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 6.214811696804682e-06, + "logits/chosen": 1432108193.6842105, + "logits/rejected": 1391518326.1538463, + "logps/chosen": -330.2620785361842, + "logps/rejected": -332.88333834134613, + "loss": 0.1739, + "rewards/chosen": 1.5397603888260691, + "rewards/margins": 6.876531840335986, + "rewards/rejected": -5.336771451509916, + "step": 1167 + }, + { + "epoch": 0.4311753034008583, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.2091018698755e-06, + "logits/chosen": 1752609177.6, + "logits/rejected": 1628360192.0, + "logps/chosen": -329.652783203125, + "logps/rejected": -560.4564615885416, + "loss": 0.1757, + "rewards/chosen": 1.4416525840759278, + "rewards/margins": 7.585653591156006, + "rewards/rejected": -6.144001007080078, + "step": 1168 + }, + { + "epoch": 0.4315444603387015, + "grad_norm": 11.5625, + "kl": 0.725778341293335, + "learning_rate": 6.203390367751038e-06, + "logits/chosen": 1748932224.0, + "logits/rejected": 1559972608.0, + "logps/chosen": -222.55104064941406, + "logps/rejected": -576.6209716796875, + "loss": 0.1622, + "rewards/chosen": 1.1698040962219238, + "rewards/margins": 10.918261051177979, + "rewards/rejected": -9.748456954956055, + "step": 1169 + }, + { + "epoch": 0.4319136172765447, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 6.197677198344508e-06, + "logits/chosen": 1562490538.6666667, + "logits/rejected": 1583447478.857143, + "logps/chosen": -336.48046875, + "logps/rejected": -526.939208984375, + "loss": 0.1996, + "rewards/chosen": 1.3562950558132596, + "rewards/margins": 10.80140483190143, + "rewards/rejected": -9.44510977608817, + "step": 1170 + }, + { + "epoch": 0.4322827742143879, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 6.191962369571439e-06, + "logits/chosen": 1714684800.0, + "logits/rejected": 2165829632.0, + "logps/chosen": -200.8190460205078, + "logps/rejected": -616.0302734375, + "loss": 0.1569, + "rewards/chosen": 1.3629053831100464, + "rewards/margins": 10.023276209831238, + "rewards/rejected": -8.660370826721191, + "step": 1171 + }, + { + "epoch": 0.4326519311522311, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 6.18624588934965e-06, + "logits/chosen": 1825007616.0, + "logits/rejected": 1593575082.6666667, + "logps/chosen": -300.746240234375, + "logps/rejected": -549.9100341796875, + "loss": 0.2117, + "rewards/chosen": 1.0870254516601563, + "rewards/margins": 10.471473185221354, + "rewards/rejected": -9.384447733561197, + "step": 1172 + }, + { + "epoch": 0.4330210880900743, + "grad_norm": 13.0625, + "kl": 1.0147876739501953, + "learning_rate": 6.1805277655992514e-06, + "logits/chosen": 1357554609.2307692, + "logits/rejected": 1241522499.368421, + "logps/chosen": -325.1211688701923, + "logps/rejected": -377.4354697779605, + "loss": 0.1663, + "rewards/chosen": 0.9268924272977389, + "rewards/margins": 7.230636291658348, + "rewards/rejected": -6.303743864360609, + "step": 1173 + }, + { + "epoch": 0.4333902450279175, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 6.1748080062426345e-06, + "logits/chosen": 1546373658.9473684, + "logits/rejected": 1704660676.9230769, + "logps/chosen": -260.5208675986842, + "logps/rejected": -413.3268479567308, + "loss": 0.1868, + "rewards/chosen": 1.2665455466822575, + "rewards/margins": 8.056643296832497, + "rewards/rejected": -6.79009775015024, + "step": 1174 + }, + { + "epoch": 0.4337594019657607, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.169086619204447e-06, + "logits/chosen": 1653013390.2222223, + "logits/rejected": 1492339273.142857, + "logps/chosen": -289.9823404947917, + "logps/rejected": -521.7072405133929, + "loss": 0.1516, + "rewards/chosen": 1.5092287063598633, + "rewards/margins": 8.2974944795881, + "rewards/rejected": -6.788265773228237, + "step": 1175 + }, + { + "epoch": 0.4341285589036039, + "grad_norm": 11.375, + "kl": 0.17486953735351562, + "learning_rate": 6.1633636124116045e-06, + "logits/chosen": 2221824000.0, + "logits/rejected": 1360264533.3333333, + "logps/chosen": -238.27641950334822, + "logps/rejected": -437.22216796875, + "loss": 0.1297, + "rewards/chosen": 1.7954562050955636, + "rewards/margins": 8.397061090620737, + "rewards/rejected": -6.601604885525173, + "step": 1176 + }, + { + "epoch": 0.4344977158414471, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 6.157638993793257e-06, + "logits/chosen": 2236897280.0, + "logits/rejected": 2706335914.6666665, + "logps/chosen": -266.4365478515625, + "logps/rejected": -463.990478515625, + "loss": 0.22, + "rewards/chosen": 0.9851810455322265, + "rewards/margins": 8.468461227416991, + "rewards/rejected": -7.483280181884766, + "step": 1177 + }, + { + "epoch": 0.4348668727792903, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 6.15191277128079e-06, + "logits/chosen": 1442499145.142857, + "logits/rejected": 1629059072.0, + "logps/chosen": -277.12874930245533, + "logps/rejected": -392.89252387152777, + "loss": 0.1443, + "rewards/chosen": 1.0982837677001953, + "rewards/margins": 7.683296839396159, + "rewards/rejected": -6.585013071695964, + "step": 1178 + }, + { + "epoch": 0.4352360297171335, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 6.146184952807815e-06, + "logits/chosen": 1754633102.2222223, + "logits/rejected": 1636933778.2857144, + "logps/chosen": -227.54985894097223, + "logps/rejected": -406.6975795200893, + "loss": 0.2225, + "rewards/chosen": 0.820282088385688, + "rewards/margins": 6.5763398276435, + "rewards/rejected": -5.7560577392578125, + "step": 1179 + }, + { + "epoch": 0.4356051866549767, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 6.140455546310149e-06, + "logits/chosen": 2224008011.2941175, + "logits/rejected": 1498087833.6, + "logps/chosen": -214.51319795496323, + "logps/rejected": -478.83811848958334, + "loss": 0.1217, + "rewards/chosen": 1.7925473381491268, + "rewards/margins": 9.45401075774548, + "rewards/rejected": -7.6614634195963545, + "step": 1180 + }, + { + "epoch": 0.4359743435928199, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 6.134724559725812e-06, + "logits/chosen": 1237875858.2857144, + "logits/rejected": 1739870776.8888888, + "logps/chosen": -181.70908900669642, + "logps/rejected": -591.18359375, + "loss": 0.0739, + "rewards/chosen": 3.107684816632952, + "rewards/margins": 10.792802750118195, + "rewards/rejected": -7.685117933485243, + "step": 1181 + }, + { + "epoch": 0.4363435005306631, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 6.128992000995015e-06, + "logits/chosen": 1452509730.1333334, + "logits/rejected": 1326499237.6470587, + "logps/chosen": -254.52776692708332, + "logps/rejected": -488.2190946691176, + "loss": 0.1279, + "rewards/chosen": 1.8877037048339844, + "rewards/margins": 9.98043046839097, + "rewards/rejected": -8.092726763556986, + "step": 1182 + }, + { + "epoch": 0.4367126574685063, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 6.123257878060146e-06, + "logits/chosen": 1854752229.0526316, + "logits/rejected": 1756491776.0, + "logps/chosen": -320.4297388980263, + "logps/rejected": -609.3537785456731, + "loss": 0.1622, + "rewards/chosen": 1.3992160997892682, + "rewards/margins": 10.522290557984881, + "rewards/rejected": -9.123074458195614, + "step": 1183 + }, + { + "epoch": 0.4370818144063495, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 6.1175221988657555e-06, + "logits/chosen": 1513226555.0769231, + "logits/rejected": 2028709780.2105262, + "logps/chosen": -274.0032489483173, + "logps/rejected": -616.7845394736842, + "loss": 0.0994, + "rewards/chosen": 1.8532689901498647, + "rewards/margins": 11.015977473394107, + "rewards/rejected": -9.162708483244243, + "step": 1184 + }, + { + "epoch": 0.4374509713441927, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 6.111784971358556e-06, + "logits/chosen": 1832744487.3846154, + "logits/rejected": 1290767629.4736843, + "logps/chosen": -246.79479041466345, + "logps/rejected": -426.1707185444079, + "loss": 0.1309, + "rewards/chosen": 1.108178212092473, + "rewards/margins": 9.519169680020106, + "rewards/rejected": -8.410991467927632, + "step": 1185 + }, + { + "epoch": 0.4378201282820359, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 6.106046203487406e-06, + "logits/chosen": 1892622982.7368422, + "logits/rejected": 2296552054.1538463, + "logps/chosen": -276.08280222039474, + "logps/rejected": -506.60385366586536, + "loss": 0.1655, + "rewards/chosen": 1.5301130194413035, + "rewards/margins": 9.17572001407021, + "rewards/rejected": -7.645606994628906, + "step": 1186 + }, + { + "epoch": 0.4381892852198791, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 6.100305903203292e-06, + "logits/chosen": 2090680064.0, + "logits/rejected": 1524462720.0, + "logps/chosen": -303.1239318847656, + "logps/rejected": -413.6661071777344, + "loss": 0.1732, + "rewards/chosen": 0.900115430355072, + "rewards/margins": 7.91779226064682, + "rewards/rejected": -7.017676830291748, + "step": 1187 + }, + { + "epoch": 0.43855844215772233, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 6.094564078459329e-06, + "logits/chosen": 1995865916.952381, + "logits/rejected": 1719082077.090909, + "logps/chosen": -327.9398484002976, + "logps/rejected": -439.54909446022725, + "loss": 0.2154, + "rewards/chosen": 1.2101111639113653, + "rewards/margins": 8.796088198046663, + "rewards/rejected": -7.585977034135298, + "step": 1188 + }, + { + "epoch": 0.4389275990955655, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 6.08882073721074e-06, + "logits/chosen": 2015229952.0, + "logits/rejected": 2145516066.1333334, + "logps/chosen": -220.79549632352942, + "logps/rejected": -523.5539713541667, + "loss": 0.1503, + "rewards/chosen": 1.4565528420840992, + "rewards/margins": 10.015811875287223, + "rewards/rejected": -8.559259033203125, + "step": 1189 + }, + { + "epoch": 0.4392967560334087, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 6.083075887414854e-06, + "logits/chosen": 2800536576.0, + "logits/rejected": 2805027072.0, + "logps/chosen": -593.7982177734375, + "logps/rejected": -464.11492919921875, + "loss": 0.2057, + "rewards/chosen": 0.6975958347320557, + "rewards/margins": 7.765184640884399, + "rewards/rejected": -7.067588806152344, + "step": 1190 + }, + { + "epoch": 0.4396659129712519, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 6.077329537031087e-06, + "logits/chosen": 1896302592.0, + "logits/rejected": 1878963404.8, + "logps/chosen": -281.3272298177083, + "logps/rejected": -414.824365234375, + "loss": 0.1053, + "rewards/chosen": 2.070777098337809, + "rewards/margins": 9.892878691355387, + "rewards/rejected": -7.822101593017578, + "step": 1191 + }, + { + "epoch": 0.4400350699090951, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 6.071581694020933e-06, + "logits/chosen": 2042647424.0, + "logits/rejected": 2000086912.0, + "logps/chosen": -241.7680206298828, + "logps/rejected": -453.7569274902344, + "loss": 0.1614, + "rewards/chosen": 1.0097142457962036, + "rewards/margins": 8.50803005695343, + "rewards/rejected": -7.498315811157227, + "step": 1192 + }, + { + "epoch": 0.4404042268469383, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 6.0658323663479555e-06, + "logits/chosen": 1755452928.0, + "logits/rejected": 2008418688.0, + "logps/chosen": -343.9593200683594, + "logps/rejected": -417.07623291015625, + "loss": 0.1973, + "rewards/chosen": 0.8431195020675659, + "rewards/margins": 6.240869879722595, + "rewards/rejected": -5.397750377655029, + "step": 1193 + }, + { + "epoch": 0.4407733837847815, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 6.060081561977778e-06, + "logits/chosen": 2562091264.0, + "logits/rejected": 2177996544.0, + "logps/chosen": -266.28472900390625, + "logps/rejected": -485.1036376953125, + "loss": 0.1919, + "rewards/chosen": 0.9154738783836365, + "rewards/margins": 8.303599774837494, + "rewards/rejected": -7.388125896453857, + "step": 1194 + }, + { + "epoch": 0.4411425407226247, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 6.054329288878062e-06, + "logits/chosen": 1566706619.7333333, + "logits/rejected": 1717107049.4117646, + "logps/chosen": -323.4928385416667, + "logps/rejected": -401.29279641544116, + "loss": 0.1434, + "rewards/chosen": 1.609241739908854, + "rewards/margins": 8.901518458946079, + "rewards/rejected": -7.292276719037225, + "step": 1195 + }, + { + "epoch": 0.4415116976604679, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 6.048575555018512e-06, + "logits/chosen": 1431801582.9333334, + "logits/rejected": 1437397714.8235295, + "logps/chosen": -228.38050130208333, + "logps/rejected": -339.2764246323529, + "loss": 0.1725, + "rewards/chosen": 1.303792953491211, + "rewards/margins": 7.495886297786937, + "rewards/rejected": -6.192093344295726, + "step": 1196 + }, + { + "epoch": 0.4418808545983111, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 6.042820368370854e-06, + "logits/chosen": 2360157735.3846154, + "logits/rejected": 1598738647.5789473, + "logps/chosen": -298.84664212740387, + "logps/rejected": -570.661287006579, + "loss": 0.1197, + "rewards/chosen": 1.3980539762056792, + "rewards/margins": 10.39228374465757, + "rewards/rejected": -8.994229768451891, + "step": 1197 + }, + { + "epoch": 0.4422500115361543, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 6.037063736908822e-06, + "logits/chosen": 2263678603.6363635, + "logits/rejected": 2211574150.095238, + "logps/chosen": -303.0801890980114, + "logps/rejected": -555.9850260416666, + "loss": 0.1034, + "rewards/chosen": 1.2730738032947888, + "rewards/margins": 9.049683678201783, + "rewards/rejected": -7.776609874906994, + "step": 1198 + }, + { + "epoch": 0.4426191684739975, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 6.03130566860816e-06, + "logits/chosen": 1346461696.0, + "logits/rejected": 1491568298.6666667, + "logps/chosen": -231.18781389508928, + "logps/rejected": -500.9045138888889, + "loss": 0.0867, + "rewards/chosen": 2.0051404408046176, + "rewards/margins": 9.647707545568071, + "rewards/rejected": -7.6425671047634545, + "step": 1199 + }, + { + "epoch": 0.4429883254118407, + "grad_norm": 12.125, + "kl": 0.2620217800140381, + "learning_rate": 6.025546171446599e-06, + "logits/chosen": 1541022606.2222223, + "logits/rejected": 2020823186.2857144, + "logps/chosen": -365.2447916666667, + "logps/rejected": -341.2520228794643, + "loss": 0.1454, + "rewards/chosen": 1.557430585225423, + "rewards/margins": 8.61750280289423, + "rewards/rejected": -7.0600722176688055, + "step": 1200 + }, + { + "epoch": 0.4433574823496839, + "grad_norm": 13.1875, + "kl": 0.7655420303344727, + "learning_rate": 6.019785253403843e-06, + "logits/chosen": 1700520448.0, + "logits/rejected": 1358886741.3333333, + "logps/chosen": -285.6713623046875, + "logps/rejected": -472.9306233723958, + "loss": 0.1732, + "rewards/chosen": 1.501136016845703, + "rewards/margins": 10.205138397216796, + "rewards/rejected": -8.704002380371094, + "step": 1201 + }, + { + "epoch": 0.4437266392875271, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 6.0140229224615765e-06, + "logits/chosen": 1855772069.6470587, + "logits/rejected": 1800440490.6666667, + "logps/chosen": -329.4501378676471, + "logps/rejected": -474.51067708333335, + "loss": 0.1685, + "rewards/chosen": 1.2871527952306412, + "rewards/margins": 8.473191003238455, + "rewards/rejected": -7.186038208007813, + "step": 1202 + }, + { + "epoch": 0.4440957962253703, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 6.008259186603434e-06, + "logits/chosen": 2020252672.0, + "logits/rejected": 1473277824.0, + "logps/chosen": -278.72344970703125, + "logps/rejected": -345.509765625, + "loss": 0.1417, + "rewards/chosen": 1.5811870098114014, + "rewards/margins": 7.500243902206421, + "rewards/rejected": -5.9190568923950195, + "step": 1203 + }, + { + "epoch": 0.4444649531632135, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 6.0024940538149965e-06, + "logits/chosen": 1819806254.5454545, + "logits/rejected": 1489909467.4285715, + "logps/chosen": -251.67808948863637, + "logps/rejected": -482.00320870535717, + "loss": 0.1343, + "rewards/chosen": 1.0286538384177468, + "rewards/margins": 10.236009882642076, + "rewards/rejected": -9.20735604422433, + "step": 1204 + }, + { + "epoch": 0.4448341101010567, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5.996727532083786e-06, + "logits/chosen": 2133358796.8, + "logits/rejected": 2853421614.5454545, + "logps/chosen": -275.001953125, + "logps/rejected": -515.6335671164773, + "loss": 0.1006, + "rewards/chosen": 1.579201889038086, + "rewards/margins": 9.978601386330343, + "rewards/rejected": -8.399399497292258, + "step": 1205 + }, + { + "epoch": 0.4452032670388999, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.990959629399242e-06, + "logits/chosen": 2673478997.3333335, + "logits/rejected": 1972337078.857143, + "logps/chosen": -249.64501953125, + "logps/rejected": -447.3184291294643, + "loss": 0.1886, + "rewards/chosen": 1.1158578660753038, + "rewards/margins": 8.043473440503316, + "rewards/rejected": -6.927615574428013, + "step": 1206 + }, + { + "epoch": 0.4455724239767431, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5.9851903537527225e-06, + "logits/chosen": 2054583637.3333333, + "logits/rejected": 3197068008.7272725, + "logps/chosen": -291.3203590029762, + "logps/rejected": -519.6983309659091, + "loss": 0.1963, + "rewards/chosen": 1.041286831810361, + "rewards/margins": 9.012508755638486, + "rewards/rejected": -7.971221923828125, + "step": 1207 + }, + { + "epoch": 0.4459415809145863, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.979419713137484e-06, + "logits/chosen": 2014196736.0, + "logits/rejected": 1931886796.8, + "logps/chosen": -407.2604166666667, + "logps/rejected": -561.581396484375, + "loss": 0.1302, + "rewards/chosen": 1.230411132176717, + "rewards/margins": 9.07372473080953, + "rewards/rejected": -7.843313598632813, + "step": 1208 + }, + { + "epoch": 0.44631073785242953, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.973647715548676e-06, + "logits/chosen": 1875393792.0, + "logits/rejected": 1684791296.0, + "logps/chosen": -256.8343811035156, + "logps/rejected": -413.5158386230469, + "loss": 0.1744, + "rewards/chosen": 1.0216689109802246, + "rewards/margins": 7.533850193023682, + "rewards/rejected": -6.512181282043457, + "step": 1209 + }, + { + "epoch": 0.4466798947902727, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5.9678743689833284e-06, + "logits/chosen": 1364298330.3529413, + "logits/rejected": 1680835652.2666667, + "logps/chosen": -332.80807674632354, + "logps/rejected": -441.7998372395833, + "loss": 0.1681, + "rewards/chosen": 1.2386263679055607, + "rewards/margins": 11.259316268621706, + "rewards/rejected": -10.020689900716146, + "step": 1210 + }, + { + "epoch": 0.44704905172811593, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.962099681440341e-06, + "logits/chosen": 2271398229.3333335, + "logits/rejected": 2435773440.0, + "logps/chosen": -319.8719482421875, + "logps/rejected": -505.247216796875, + "loss": 0.0856, + "rewards/chosen": 1.6468556722005208, + "rewards/margins": 11.313663228352866, + "rewards/rejected": -9.666807556152344, + "step": 1211 + }, + { + "epoch": 0.4474182086659591, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.9563236609204655e-06, + "logits/chosen": 1256605157.0526316, + "logits/rejected": 1234640659.6923077, + "logps/chosen": -254.10264185855263, + "logps/rejected": -389.03331580528845, + "loss": 0.1772, + "rewards/chosen": 1.2331807989823191, + "rewards/margins": 10.453341063217596, + "rewards/rejected": -9.220160264235277, + "step": 1212 + }, + { + "epoch": 0.44778736560380233, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5.950546315426309e-06, + "logits/chosen": 1574508646.4, + "logits/rejected": 1581717333.3333333, + "logps/chosen": -347.7767578125, + "logps/rejected": -580.7117919921875, + "loss": 0.2246, + "rewards/chosen": 0.8461013793945312, + "rewards/margins": 9.856765365600586, + "rewards/rejected": -9.010663986206055, + "step": 1213 + }, + { + "epoch": 0.4481565225416455, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5.944767652962309e-06, + "logits/chosen": 2185421902.769231, + "logits/rejected": 1601608218.9473684, + "logps/chosen": -295.25940880408655, + "logps/rejected": -493.65244654605266, + "loss": 0.1338, + "rewards/chosen": 1.616997792170598, + "rewards/margins": 9.221936500024217, + "rewards/rejected": -7.604938707853618, + "step": 1214 + }, + { + "epoch": 0.44852567947948874, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.938987681534729e-06, + "logits/chosen": 2318612889.6, + "logits/rejected": 2851488466.8235292, + "logps/chosen": -284.07607421875, + "logps/rejected": -543.5099379595588, + "loss": 0.1784, + "rewards/chosen": 0.6491394678751627, + "rewards/margins": 9.000372557546578, + "rewards/rejected": -8.351233089671416, + "step": 1215 + }, + { + "epoch": 0.4488948364173319, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5.933206409151646e-06, + "logits/chosen": 2260613864.7272725, + "logits/rejected": 1400607695.2380953, + "logps/chosen": -260.8787730823864, + "logps/rejected": -366.06719680059524, + "loss": 0.1068, + "rewards/chosen": 1.41098031130704, + "rewards/margins": 8.437018344928692, + "rewards/rejected": -7.026038033621652, + "step": 1216 + }, + { + "epoch": 0.44926399335517514, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.92742384382294e-06, + "logits/chosen": 1553157324.8, + "logits/rejected": 1821359646.1176472, + "logps/chosen": -244.955078125, + "logps/rejected": -444.84593290441177, + "loss": 0.1391, + "rewards/chosen": 1.2986845652262369, + "rewards/margins": 9.09821190179563, + "rewards/rejected": -7.799527336569393, + "step": 1217 + }, + { + "epoch": 0.4496331502930183, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.92163999356028e-06, + "logits/chosen": 1328993673.8461537, + "logits/rejected": 2256255299.368421, + "logps/chosen": -232.06120417668268, + "logps/rejected": -557.3095703125, + "loss": 0.0339, + "rewards/chosen": 3.691144503079928, + "rewards/margins": 13.032181651003448, + "rewards/rejected": -9.34103714792352, + "step": 1218 + }, + { + "epoch": 0.45000230723086154, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5.91585486637712e-06, + "logits/chosen": 1228726994.8235295, + "logits/rejected": 1830348800.0, + "logps/chosen": -292.02780330882354, + "logps/rejected": -344.55579427083336, + "loss": 0.1261, + "rewards/chosen": 1.8638019561767578, + "rewards/margins": 8.369182205200195, + "rewards/rejected": -6.505380249023437, + "step": 1219 + }, + { + "epoch": 0.4503714641687047, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.910068470288677e-06, + "logits/chosen": 1787162624.0, + "logits/rejected": 1427194311.1111112, + "logps/chosen": -252.86805943080358, + "logps/rejected": -470.22157118055554, + "loss": 0.154, + "rewards/chosen": 0.7780662264142718, + "rewards/margins": 8.091146060398646, + "rewards/rejected": -7.313079833984375, + "step": 1220 + }, + { + "epoch": 0.45074062110654795, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5.90428081331193e-06, + "logits/chosen": 1609466948.2666667, + "logits/rejected": 2006268626.8235295, + "logps/chosen": -263.45774739583334, + "logps/rejected": -443.23733340992646, + "loss": 0.1032, + "rewards/chosen": 2.535178629557292, + "rewards/margins": 10.641681386910234, + "rewards/rejected": -8.106502757352942, + "step": 1221 + }, + { + "epoch": 0.4511097780443911, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.898491903465607e-06, + "logits/chosen": 2119557632.0, + "logits/rejected": 2010020864.0, + "logps/chosen": -302.6974182128906, + "logps/rejected": -505.8480224609375, + "loss": 0.1362, + "rewards/chosen": 1.3110463619232178, + "rewards/margins": 9.3225839138031, + "rewards/rejected": -8.011537551879883, + "step": 1222 + }, + { + "epoch": 0.45147893498223435, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 5.892701748770165e-06, + "logits/chosen": 1501518994.2857144, + "logits/rejected": 1429623921.7777777, + "logps/chosen": -336.3925083705357, + "logps/rejected": -448.51752387152777, + "loss": 0.1696, + "rewards/chosen": 1.1946919304983956, + "rewards/margins": 9.690975567651174, + "rewards/rejected": -8.496283637152779, + "step": 1223 + }, + { + "epoch": 0.4518480919200775, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5.886910357247792e-06, + "logits/chosen": 1445423513.6, + "logits/rejected": 1527701504.0, + "logps/chosen": -281.47978515625, + "logps/rejected": -417.58289292279414, + "loss": 0.1531, + "rewards/chosen": 1.0559834798177083, + "rewards/margins": 8.95943124808517, + "rewards/rejected": -7.903447768267463, + "step": 1224 + }, + { + "epoch": 0.45221724885792075, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5.8811177369223895e-06, + "logits/chosen": 1963946456.6153846, + "logits/rejected": 2457097701.0526314, + "logps/chosen": -239.3133826622596, + "logps/rejected": -444.90316611842104, + "loss": 0.1389, + "rewards/chosen": 1.289617685171274, + "rewards/margins": 7.403787960407705, + "rewards/rejected": -6.114170275236431, + "step": 1225 + }, + { + "epoch": 0.4525864057957639, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.875323895819554e-06, + "logits/chosen": 1622376155.4285715, + "logits/rejected": 1542382819.5555556, + "logps/chosen": -342.06675502232144, + "logps/rejected": -461.17632378472223, + "loss": 0.143, + "rewards/chosen": 1.1816917146955217, + "rewards/margins": 8.13701101333376, + "rewards/rejected": -6.955319298638238, + "step": 1226 + }, + { + "epoch": 0.4529555627336071, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.869528841966583e-06, + "logits/chosen": 1349430067.2, + "logits/rejected": 1569057008.9411764, + "logps/chosen": -242.981396484375, + "logps/rejected": -439.3377470128676, + "loss": 0.1575, + "rewards/chosen": 1.3296928405761719, + "rewards/margins": 9.372089834774242, + "rewards/rejected": -8.04239699419807, + "step": 1227 + }, + { + "epoch": 0.4533247196714503, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.8637325833924494e-06, + "logits/chosen": 1174395562.6666667, + "logits/rejected": 1660332393.4117646, + "logps/chosen": -212.13362630208334, + "logps/rejected": -518.6707835477941, + "loss": 0.1155, + "rewards/chosen": 2.1738690694173175, + "rewards/margins": 10.203831945681104, + "rewards/rejected": -8.029962876263786, + "step": 1228 + }, + { + "epoch": 0.4536938766092935, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.857935128127793e-06, + "logits/chosen": 2365410123.2941175, + "logits/rejected": 2166744951.4666667, + "logps/chosen": -343.05193014705884, + "logps/rejected": -352.9173177083333, + "loss": 0.1634, + "rewards/chosen": 1.4876788644229664, + "rewards/margins": 8.633831106447706, + "rewards/rejected": -7.14615224202474, + "step": 1229 + }, + { + "epoch": 0.45406303354713673, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5.852136484204918e-06, + "logits/chosen": 2396326912.0, + "logits/rejected": 2090340352.0, + "logps/chosen": -288.4384765625, + "logps/rejected": -531.9009399414062, + "loss": 0.0932, + "rewards/chosen": 1.9437556266784668, + "rewards/margins": 11.144826412200928, + "rewards/rejected": -9.201070785522461, + "step": 1230 + }, + { + "epoch": 0.4544321904849799, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.8463366596577706e-06, + "logits/chosen": 2382004614.095238, + "logits/rejected": 2396348788.3636365, + "logps/chosen": -180.46914527529762, + "logps/rejected": -475.4094904119318, + "loss": 0.1904, + "rewards/chosen": 1.9556323460170202, + "rewards/margins": 9.508001897242162, + "rewards/rejected": -7.552369551225142, + "step": 1231 + }, + { + "epoch": 0.45480134742282313, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.8405356625219335e-06, + "logits/chosen": 2065018441.142857, + "logits/rejected": 1884366620.4444444, + "logps/chosen": -293.97073800223217, + "logps/rejected": -516.9618055555555, + "loss": 0.1156, + "rewards/chosen": 2.0288102286202565, + "rewards/margins": 9.391472649952721, + "rewards/rejected": -7.362662421332465, + "step": 1232 + }, + { + "epoch": 0.4551705043606663, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5.834733500834615e-06, + "logits/chosen": 1148140701.5384614, + "logits/rejected": 1146724244.2105262, + "logps/chosen": -203.06114783653845, + "logps/rejected": -367.587890625, + "loss": 0.1449, + "rewards/chosen": 1.4231803600604718, + "rewards/margins": 8.270568376610637, + "rewards/rejected": -6.847388016550164, + "step": 1233 + }, + { + "epoch": 0.45553966129850954, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5.8289301826346375e-06, + "logits/chosen": 1455251335.5294118, + "logits/rejected": 1382165708.8, + "logps/chosen": -287.36506204044116, + "logps/rejected": -507.770703125, + "loss": 0.1952, + "rewards/chosen": 0.7624001222498277, + "rewards/margins": 8.872617916032379, + "rewards/rejected": -8.110217793782551, + "step": 1234 + }, + { + "epoch": 0.4559088182363527, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.823125715962421e-06, + "logits/chosen": 2696668081.230769, + "logits/rejected": 2132366389.8947368, + "logps/chosen": -260.568359375, + "logps/rejected": -358.1229954769737, + "loss": 0.1323, + "rewards/chosen": 1.1942630914541392, + "rewards/margins": 7.4920162749193935, + "rewards/rejected": -6.297753183465255, + "step": 1235 + }, + { + "epoch": 0.45627797517419594, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.817320108859984e-06, + "logits/chosen": 1886683750.4, + "logits/rejected": 2053242135.2727273, + "logps/chosen": -309.898876953125, + "logps/rejected": -390.2923029119318, + "loss": 0.1443, + "rewards/chosen": 1.429996395111084, + "rewards/margins": 6.617664241790772, + "rewards/rejected": -5.1876678466796875, + "step": 1236 + }, + { + "epoch": 0.4566471321120391, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.811513369370921e-06, + "logits/chosen": 2279487624.5333333, + "logits/rejected": 2077135811.764706, + "logps/chosen": -285.45192057291666, + "logps/rejected": -455.3132755055147, + "loss": 0.1544, + "rewards/chosen": 1.2030244191487631, + "rewards/margins": 9.618088613771924, + "rewards/rejected": -8.415064194623161, + "step": 1237 + }, + { + "epoch": 0.45701628904988234, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5.805705505540392e-06, + "logits/chosen": 1523144396.8, + "logits/rejected": 1777177258.6666667, + "logps/chosen": -337.6244140625, + "logps/rejected": -541.5087076822916, + "loss": 0.1981, + "rewards/chosen": 0.952875804901123, + "rewards/margins": 10.259911886850992, + "rewards/rejected": -9.30703608194987, + "step": 1238 + }, + { + "epoch": 0.4573854459877255, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.799896525415124e-06, + "logits/chosen": 2006574501.6470587, + "logits/rejected": 2102981973.3333333, + "logps/chosen": -292.20703125, + "logps/rejected": -433.98570963541664, + "loss": 0.195, + "rewards/chosen": 1.1579180324778837, + "rewards/margins": 9.019636273851582, + "rewards/rejected": -7.861718241373698, + "step": 1239 + }, + { + "epoch": 0.45775460292556874, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.7940864370433825e-06, + "logits/chosen": 1898868893.5384614, + "logits/rejected": 1820311336.4210527, + "logps/chosen": -249.96574519230768, + "logps/rejected": -336.3924496299342, + "loss": 0.158, + "rewards/chosen": 0.8757619124192458, + "rewards/margins": 7.235195538293012, + "rewards/rejected": -6.359433625873766, + "step": 1240 + }, + { + "epoch": 0.4581237598634119, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5.78827524847497e-06, + "logits/chosen": 2071726552.6153846, + "logits/rejected": 2597189524.2105265, + "logps/chosen": -244.2867713341346, + "logps/rejected": -572.7776521381579, + "loss": 0.1228, + "rewards/chosen": 1.2823998377873347, + "rewards/margins": 10.17870431969523, + "rewards/rejected": -8.896304481907896, + "step": 1241 + }, + { + "epoch": 0.45849291680125515, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5.782462967761217e-06, + "logits/chosen": 2230663770.352941, + "logits/rejected": 2650438587.733333, + "logps/chosen": -345.81554457720586, + "logps/rejected": -415.34873046875, + "loss": 0.1491, + "rewards/chosen": 1.3631730921128218, + "rewards/margins": 9.286865922516467, + "rewards/rejected": -7.923692830403646, + "step": 1242 + }, + { + "epoch": 0.4588620737390983, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.776649602954963e-06, + "logits/chosen": 1526717952.0, + "logits/rejected": 1852887552.0, + "logps/chosen": -316.0617370605469, + "logps/rejected": -470.9912109375, + "loss": 0.1509, + "rewards/chosen": 1.6883612871170044, + "rewards/margins": 9.559767365455627, + "rewards/rejected": -7.871406078338623, + "step": 1243 + }, + { + "epoch": 0.45923123067694155, + "grad_norm": 11.5, + "kl": 0.3478884696960449, + "learning_rate": 5.770835162110551e-06, + "logits/chosen": 1603625164.8, + "logits/rejected": 1446146730.6666667, + "logps/chosen": -239.4867431640625, + "logps/rejected": -413.8687744140625, + "loss": 0.1515, + "rewards/chosen": 1.8164600372314452, + "rewards/margins": 9.336744689941407, + "rewards/rejected": -7.520284652709961, + "step": 1244 + }, + { + "epoch": 0.4596003876147847, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.765019653283814e-06, + "logits/chosen": 1460917833.142857, + "logits/rejected": 1950867000.8888888, + "logps/chosen": -244.91615513392858, + "logps/rejected": -525.5714518229166, + "loss": 0.1437, + "rewards/chosen": 1.304199491228376, + "rewards/margins": 9.710113797869, + "rewards/rejected": -8.405914306640625, + "step": 1245 + }, + { + "epoch": 0.45996954455262795, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5.759203084532068e-06, + "logits/chosen": 1415099099.4285715, + "logits/rejected": 1731776967.1111112, + "logps/chosen": -253.03949846540178, + "logps/rejected": -424.7610134548611, + "loss": 0.1277, + "rewards/chosen": 1.9732042040143694, + "rewards/margins": 8.159210538107251, + "rewards/rejected": -6.186006334092882, + "step": 1246 + }, + { + "epoch": 0.4603387014904711, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.753385463914094e-06, + "logits/chosen": 1865517909.3333333, + "logits/rejected": 2402443776.0, + "logps/chosen": -281.33013916015625, + "logps/rejected": -470.059814453125, + "loss": 0.2223, + "rewards/chosen": 1.663313388824463, + "rewards/margins": 9.450936794281006, + "rewards/rejected": -7.787623405456543, + "step": 1247 + }, + { + "epoch": 0.46070785842831435, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5.7475667994901316e-06, + "logits/chosen": 1754968320.0, + "logits/rejected": 1428333312.0, + "logps/chosen": -214.22906494140625, + "logps/rejected": -447.44293212890625, + "loss": 0.1045, + "rewards/chosen": 2.797363519668579, + "rewards/margins": 9.79190468788147, + "rewards/rejected": -6.994541168212891, + "step": 1248 + }, + { + "epoch": 0.4610770153661575, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5.741747099321866e-06, + "logits/chosen": 2795557683.2, + "logits/rejected": 1755072512.0, + "logps/chosen": -347.5180419921875, + "logps/rejected": -503.2632242838542, + "loss": 0.1784, + "rewards/chosen": 1.4398449897766112, + "rewards/margins": 7.826426410675049, + "rewards/rejected": -6.3865814208984375, + "step": 1249 + }, + { + "epoch": 0.46144617230400076, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.735926371472418e-06, + "logits/chosen": 1943041792.0, + "logits/rejected": 1800609664.0, + "logps/chosen": -308.3397521972656, + "logps/rejected": -465.0818786621094, + "loss": 0.1018, + "rewards/chosen": 2.063084125518799, + "rewards/margins": 9.08430004119873, + "rewards/rejected": -7.021215915679932, + "step": 1250 + }, + { + "epoch": 0.46181532924184393, + "grad_norm": 12.75, + "kl": 0.058286190032958984, + "learning_rate": 5.730104624006333e-06, + "logits/chosen": 1651552870.4, + "logits/rejected": 1679456597.3333333, + "logps/chosen": -311.655517578125, + "logps/rejected": -654.9146728515625, + "loss": 0.1777, + "rewards/chosen": 1.484128761291504, + "rewards/margins": 12.667819150288901, + "rewards/rejected": -11.183690388997396, + "step": 1251 + }, + { + "epoch": 0.46218448617968716, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.724281864989567e-06, + "logits/chosen": 979742720.0, + "logits/rejected": 1431876065.8823528, + "logps/chosen": -234.73251953125, + "logps/rejected": -518.0363625919117, + "loss": 0.1292, + "rewards/chosen": 1.6141777038574219, + "rewards/margins": 8.364383697509766, + "rewards/rejected": -6.750205993652344, + "step": 1252 + }, + { + "epoch": 0.46255364311753033, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5.718458102489479e-06, + "logits/chosen": 1938567649.8823528, + "logits/rejected": 2226235801.6, + "logps/chosen": -279.6455078125, + "logps/rejected": -441.65758463541664, + "loss": 0.17, + "rewards/chosen": 1.2799157535328585, + "rewards/margins": 8.008850995232077, + "rewards/rejected": -6.728935241699219, + "step": 1253 + }, + { + "epoch": 0.46292280005537356, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5.712633344574816e-06, + "logits/chosen": 1788218761.8461537, + "logits/rejected": 1813510251.7894738, + "logps/chosen": -309.4136305588942, + "logps/rejected": -405.37181332236844, + "loss": 0.1603, + "rewards/chosen": 0.821734721844013, + "rewards/margins": 7.634694091704211, + "rewards/rejected": -6.812959369860198, + "step": 1254 + }, + { + "epoch": 0.46329195699321674, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5.70680759931571e-06, + "logits/chosen": 1841186048.0, + "logits/rejected": 1521164160.0, + "logps/chosen": -328.46795654296875, + "logps/rejected": -596.1806030273438, + "loss": 0.1781, + "rewards/chosen": 0.8562191128730774, + "rewards/margins": 9.154949963092804, + "rewards/rejected": -8.298730850219727, + "step": 1255 + }, + { + "epoch": 0.46366111393105996, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.7009808747836546e-06, + "logits/chosen": 1675446954.6666667, + "logits/rejected": 1797146503.5294118, + "logps/chosen": -275.03134765625, + "logps/rejected": -457.6252872242647, + "loss": 0.1342, + "rewards/chosen": 1.3551958719889323, + "rewards/margins": 8.358873240152995, + "rewards/rejected": -7.0036773681640625, + "step": 1256 + }, + { + "epoch": 0.46403027086890314, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.6951531790515045e-06, + "logits/chosen": 2454933504.0, + "logits/rejected": 1391053312.0, + "logps/chosen": -283.8570556640625, + "logps/rejected": -606.6972045898438, + "loss": 0.1267, + "rewards/chosen": 1.6672395467758179, + "rewards/margins": 10.889033675193787, + "rewards/rejected": -9.221794128417969, + "step": 1257 + }, + { + "epoch": 0.46439942780674637, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5.689324520193455e-06, + "logits/chosen": 1202447360.0, + "logits/rejected": 1310785536.0, + "logps/chosen": -173.88551330566406, + "logps/rejected": -469.436279296875, + "loss": 0.1231, + "rewards/chosen": 1.9846092462539673, + "rewards/margins": 8.776886582374573, + "rewards/rejected": -6.7922773361206055, + "step": 1258 + }, + { + "epoch": 0.46476858474458954, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.68349490628504e-06, + "logits/chosen": 3062858956.8, + "logits/rejected": 1835020660.3636363, + "logps/chosen": -273.6923583984375, + "logps/rejected": -526.5743075284091, + "loss": 0.1315, + "rewards/chosen": 0.42178821563720703, + "rewards/margins": 8.822230165654963, + "rewards/rejected": -8.400441950017756, + "step": 1259 + }, + { + "epoch": 0.46513774168243277, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5.677664345403118e-06, + "logits/chosen": 1590345932.8, + "logits/rejected": 2145810090.6666667, + "logps/chosen": -304.1717041015625, + "logps/rejected": -397.7779947916667, + "loss": 0.2242, + "rewards/chosen": 1.4158255577087402, + "rewards/margins": 7.893369706471761, + "rewards/rejected": -6.4775441487630205, + "step": 1260 + }, + { + "epoch": 0.46550689862027594, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.671832845625853e-06, + "logits/chosen": 1742845006.7692308, + "logits/rejected": 1210948338.5263157, + "logps/chosen": -241.88185471754807, + "logps/rejected": -407.84385279605266, + "loss": 0.1442, + "rewards/chosen": 1.04372684772198, + "rewards/margins": 8.31325729462782, + "rewards/rejected": -7.269530446905839, + "step": 1261 + }, + { + "epoch": 0.4658760555581192, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5.6660004150327175e-06, + "logits/chosen": 1553163520.0, + "logits/rejected": 1601211776.0, + "logps/chosen": -268.818115234375, + "logps/rejected": -409.6429748535156, + "loss": 0.157, + "rewards/chosen": 1.1948740482330322, + "rewards/margins": 8.582369089126587, + "rewards/rejected": -7.387495040893555, + "step": 1262 + }, + { + "epoch": 0.46624521249596235, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.660167061704467e-06, + "logits/chosen": 1322750267.0769231, + "logits/rejected": 2040750080.0, + "logps/chosen": -293.79306265024036, + "logps/rejected": -525.7649568256579, + "loss": 0.1466, + "rewards/chosen": 0.8343367209801307, + "rewards/margins": 9.179957930375691, + "rewards/rejected": -8.34562120939556, + "step": 1263 + }, + { + "epoch": 0.4666143694338055, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5.654332793723141e-06, + "logits/chosen": 1423350169.6, + "logits/rejected": 1356341930.6666667, + "logps/chosen": -275.570361328125, + "logps/rejected": -490.2438151041667, + "loss": 0.1845, + "rewards/chosen": 1.3467785835266113, + "rewards/margins": 8.474871031443278, + "rewards/rejected": -7.128092447916667, + "step": 1264 + }, + { + "epoch": 0.46698352637164875, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5.648497619172042e-06, + "logits/chosen": 2446392064.0, + "logits/rejected": 3096707072.0, + "logps/chosen": -338.63818359375, + "logps/rejected": -539.504150390625, + "loss": 0.1357, + "rewards/chosen": 1.4695488214492798, + "rewards/margins": 9.217888474464417, + "rewards/rejected": -7.748339653015137, + "step": 1265 + }, + { + "epoch": 0.4673526833094919, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5.6426615461357305e-06, + "logits/chosen": 2088707072.0, + "logits/rejected": 2162203648.0, + "logps/chosen": -287.1134033203125, + "logps/rejected": -495.737548828125, + "loss": 0.221, + "rewards/chosen": 0.8323314666748047, + "rewards/margins": 10.71656239827474, + "rewards/rejected": -9.884230931599935, + "step": 1266 + }, + { + "epoch": 0.46772184024733515, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5.636824582700012e-06, + "logits/chosen": 1491187126.857143, + "logits/rejected": 1557911324.4444444, + "logps/chosen": -249.90961565290178, + "logps/rejected": -452.10956488715277, + "loss": 0.1323, + "rewards/chosen": 1.403160231454032, + "rewards/margins": 9.604744941469223, + "rewards/rejected": -8.20158471001519, + "step": 1267 + }, + { + "epoch": 0.4680909971851783, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 5.630986736951925e-06, + "logits/chosen": 1372629811.2, + "logits/rejected": 1716016640.0, + "logps/chosen": -351.32822265625, + "logps/rejected": -441.597412109375, + "loss": 0.2024, + "rewards/chosen": 1.0433027267456054, + "rewards/margins": 9.956344159444173, + "rewards/rejected": -8.913041432698568, + "step": 1268 + }, + { + "epoch": 0.46846015412302155, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.625148016979731e-06, + "logits/chosen": 1915670714.1818182, + "logits/rejected": 1738449091.047619, + "logps/chosen": -230.6058016690341, + "logps/rejected": -579.5094401041666, + "loss": 0.1025, + "rewards/chosen": 2.1428780989213423, + "rewards/margins": 10.755178426767324, + "rewards/rejected": -8.612300327845983, + "step": 1269 + }, + { + "epoch": 0.46882931106086473, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.619308430872902e-06, + "logits/chosen": 1799578443.2941177, + "logits/rejected": 1814912614.4, + "logps/chosen": -277.30224609375, + "logps/rejected": -407.60078125, + "loss": 0.1613, + "rewards/chosen": 1.3436546325683594, + "rewards/margins": 8.783315785725911, + "rewards/rejected": -7.4396611531575525, + "step": 1270 + }, + { + "epoch": 0.46919846799870796, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5.613467986722109e-06, + "logits/chosen": 2037561929.142857, + "logits/rejected": 2009167872.0, + "logps/chosen": -258.8886195591518, + "logps/rejected": -476.23285590277777, + "loss": 0.1343, + "rewards/chosen": 1.25894410269601, + "rewards/margins": 8.651989316183423, + "rewards/rejected": -7.393045213487413, + "step": 1271 + }, + { + "epoch": 0.46956762493655113, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.607626692619216e-06, + "logits/chosen": 2358964224.0, + "logits/rejected": 1602792594.2857144, + "logps/chosen": -276.76267311789775, + "logps/rejected": -418.0455729166667, + "loss": 0.1301, + "rewards/chosen": 1.0925772406838157, + "rewards/margins": 8.456655580760081, + "rewards/rejected": -7.364078340076265, + "step": 1272 + }, + { + "epoch": 0.46993678187439436, + "grad_norm": 13.625, + "kl": 0.2142777442932129, + "learning_rate": 5.601784556657259e-06, + "logits/chosen": 2541459348.2105265, + "logits/rejected": 2678162668.3076925, + "logps/chosen": -265.8229337993421, + "logps/rejected": -533.1594050480769, + "loss": 0.2165, + "rewards/chosen": 0.7064057902285927, + "rewards/margins": 9.181478797665491, + "rewards/rejected": -8.4750730074369, + "step": 1273 + }, + { + "epoch": 0.47030593881223753, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5.5959415869304445e-06, + "logits/chosen": 1596286020.2666667, + "logits/rejected": 1490923399.5294118, + "logps/chosen": -297.73675130208335, + "logps/rejected": -429.6953699448529, + "loss": 0.1372, + "rewards/chosen": 1.4699593861897786, + "rewards/margins": 8.12140511157466, + "rewards/rejected": -6.651445725384881, + "step": 1274 + }, + { + "epoch": 0.47067509575008076, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5.590097791534132e-06, + "logits/chosen": 1849802069.3333333, + "logits/rejected": 1509565440.0, + "logps/chosen": -302.30287000868054, + "logps/rejected": -445.26834542410717, + "loss": 0.1945, + "rewards/chosen": 1.0815956327650282, + "rewards/margins": 8.595228270878868, + "rewards/rejected": -7.513632638113839, + "step": 1275 + }, + { + "epoch": 0.47104425268792394, + "grad_norm": 15.9375, + "kl": 0.12620162963867188, + "learning_rate": 5.584253178564829e-06, + "logits/chosen": 1605073578.6666667, + "logits/rejected": 2175478603.2941175, + "logps/chosen": -286.10393880208335, + "logps/rejected": -437.28972311580884, + "loss": 0.2047, + "rewards/chosen": 0.8126660664876302, + "rewards/margins": 7.0792249941358385, + "rewards/rejected": -6.266558927648208, + "step": 1276 + }, + { + "epoch": 0.47141340962576717, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5.578407756120167e-06, + "logits/chosen": 1571761883.4285715, + "logits/rejected": 2657991111.111111, + "logps/chosen": -224.30203683035714, + "logps/rejected": -430.33241102430554, + "loss": 0.114, + "rewards/chosen": 1.7509636197771346, + "rewards/margins": 7.661468278794061, + "rewards/rejected": -5.910504659016927, + "step": 1277 + }, + { + "epoch": 0.47178256656361034, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5.57256153229891e-06, + "logits/chosen": 1833593124.5714285, + "logits/rejected": 1975439173.8181818, + "logps/chosen": -383.9208054315476, + "logps/rejected": -541.19091796875, + "loss": 0.2369, + "rewards/chosen": 1.0536855061848958, + "rewards/margins": 8.431841994776871, + "rewards/rejected": -7.378156488591975, + "step": 1278 + }, + { + "epoch": 0.47215172350145357, + "grad_norm": 13.625, + "kl": 0.18763256072998047, + "learning_rate": 5.566714515200924e-06, + "logits/chosen": 1614522210.4615386, + "logits/rejected": 1795443442.5263157, + "logps/chosen": -394.5290339543269, + "logps/rejected": -424.8228824013158, + "loss": 0.1401, + "rewards/chosen": 0.8593849035409781, + "rewards/margins": 8.170134961363757, + "rewards/rejected": -7.31075005782278, + "step": 1279 + }, + { + "epoch": 0.47252088043929674, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.560866712927176e-06, + "logits/chosen": 2259810196.2105265, + "logits/rejected": 2241767896.6153846, + "logps/chosen": -394.91719777960526, + "logps/rejected": -518.0201322115385, + "loss": 0.1656, + "rewards/chosen": 1.5932235717773438, + "rewards/margins": 9.567390441894531, + "rewards/rejected": -7.9741668701171875, + "step": 1280 + }, + { + "epoch": 0.47289003737713997, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.555018133579723e-06, + "logits/chosen": 1708401956.5714285, + "logits/rejected": 1525711416.8888888, + "logps/chosen": -248.92014857700892, + "logps/rejected": -515.7660047743055, + "loss": 0.1157, + "rewards/chosen": 1.7379063197544642, + "rewards/margins": 11.426573617117745, + "rewards/rejected": -9.688667297363281, + "step": 1281 + }, + { + "epoch": 0.47325919431498314, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.549168785261698e-06, + "logits/chosen": 1891424392.5333333, + "logits/rejected": 1580815781.6470587, + "logps/chosen": -317.3435546875, + "logps/rejected": -582.0832950367648, + "loss": 0.1386, + "rewards/chosen": 1.375164794921875, + "rewards/margins": 9.339936738855698, + "rewards/rejected": -7.964771943933823, + "step": 1282 + }, + { + "epoch": 0.4736283512528264, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5.543318676077297e-06, + "logits/chosen": 1642259200.0, + "logits/rejected": 1401176576.0, + "logps/chosen": -239.35415649414062, + "logps/rejected": -343.9851989746094, + "loss": 0.1466, + "rewards/chosen": 1.2199809551239014, + "rewards/margins": 8.215405225753784, + "rewards/rejected": -6.995424270629883, + "step": 1283 + }, + { + "epoch": 0.47399750819066955, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.537467814131774e-06, + "logits/chosen": 1456396416.0, + "logits/rejected": 1336012032.0, + "logps/chosen": -319.8012390136719, + "logps/rejected": -421.53668212890625, + "loss": 0.1376, + "rewards/chosen": 1.9294440746307373, + "rewards/margins": 10.163841009140015, + "rewards/rejected": -8.234396934509277, + "step": 1284 + }, + { + "epoch": 0.4743666651285128, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.531616207531423e-06, + "logits/chosen": 1745274265.6, + "logits/rejected": 1146377642.6666667, + "logps/chosen": -177.51326904296874, + "logps/rejected": -386.1200358072917, + "loss": 0.1912, + "rewards/chosen": 1.6444686889648437, + "rewards/margins": 9.141820017496745, + "rewards/rejected": -7.497351328531901, + "step": 1285 + }, + { + "epoch": 0.47473582206635595, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5.525763864383571e-06, + "logits/chosen": 2345455143.3846154, + "logits/rejected": 1758702969.2631578, + "logps/chosen": -331.0238506610577, + "logps/rejected": -540.4753289473684, + "loss": 0.1455, + "rewards/chosen": 1.3506861466627855, + "rewards/margins": 7.973228276982481, + "rewards/rejected": -6.622542130319696, + "step": 1286 + }, + { + "epoch": 0.4751049790041992, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5.519910792796565e-06, + "logits/chosen": 1299959567.0588236, + "logits/rejected": 1713693491.2, + "logps/chosen": -207.71896541819854, + "logps/rejected": -577.3069010416667, + "loss": 0.1454, + "rewards/chosen": 1.6057474472943474, + "rewards/margins": 9.96106310077742, + "rewards/rejected": -8.355315653483073, + "step": 1287 + }, + { + "epoch": 0.47547413594204235, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5.514057000879759e-06, + "logits/chosen": 1685478130.5263157, + "logits/rejected": 2120633895.3846154, + "logps/chosen": -324.87232730263156, + "logps/rejected": -483.63773287259613, + "loss": 0.1718, + "rewards/chosen": 1.189930564478824, + "rewards/margins": 7.751122416754966, + "rewards/rejected": -6.561191852276142, + "step": 1288 + }, + { + "epoch": 0.4758432928798856, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 5.508202496743511e-06, + "logits/chosen": 2014744576.0, + "logits/rejected": 2203625676.8, + "logps/chosen": -240.30138050426137, + "logps/rejected": -398.94638671875, + "loss": 0.236, + "rewards/chosen": 0.8383747447620739, + "rewards/margins": 8.29025386463512, + "rewards/rejected": -7.4518791198730465, + "step": 1289 + }, + { + "epoch": 0.47621244981772876, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5.50234728849916e-06, + "logits/chosen": 1520803840.0, + "logits/rejected": 1430438249.4117646, + "logps/chosen": -198.38883463541666, + "logps/rejected": -488.4974149816176, + "loss": 0.1659, + "rewards/chosen": 1.2136229197184245, + "rewards/margins": 8.781379123762543, + "rewards/rejected": -7.567756204044118, + "step": 1290 + }, + { + "epoch": 0.476581606755572, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5.496491384259022e-06, + "logits/chosen": 1379639003.4285715, + "logits/rejected": 1674884747.6363637, + "logps/chosen": -294.07652064732144, + "logps/rejected": -446.74027876420456, + "loss": 0.1901, + "rewards/chosen": 1.2130085173107328, + "rewards/margins": 8.974377702324936, + "rewards/rejected": -7.761369185014204, + "step": 1291 + }, + { + "epoch": 0.47695076369341516, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5.49063479213638e-06, + "logits/chosen": 1584521697.8823528, + "logits/rejected": 2190001220.266667, + "logps/chosen": -265.82536764705884, + "logps/rejected": -509.8167317708333, + "loss": 0.1717, + "rewards/chosen": 1.22653725567986, + "rewards/margins": 8.364065738752776, + "rewards/rejected": -7.137528483072916, + "step": 1292 + }, + { + "epoch": 0.4773199206312584, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.484777520245467e-06, + "logits/chosen": 1614214212.2666667, + "logits/rejected": 1890231838.1176472, + "logps/chosen": -249.16663411458333, + "logps/rejected": -458.64332490808823, + "loss": 0.1394, + "rewards/chosen": 1.3955904642740886, + "rewards/margins": 8.646675184661266, + "rewards/rejected": -7.251084720387178, + "step": 1293 + }, + { + "epoch": 0.47768907756910156, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5.478919576701459e-06, + "logits/chosen": 1857916131.5555556, + "logits/rejected": 1529139053.7142856, + "logps/chosen": -395.26117621527777, + "logps/rejected": -455.662353515625, + "loss": 0.1677, + "rewards/chosen": 1.2377773920694988, + "rewards/margins": 9.08638840629941, + "rewards/rejected": -7.848611014229911, + "step": 1294 + }, + { + "epoch": 0.4780582345069448, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5.473060969620462e-06, + "logits/chosen": 1986763224.6153846, + "logits/rejected": 1775314620.631579, + "logps/chosen": -254.94807316706732, + "logps/rejected": -524.9236225328947, + "loss": 0.1526, + "rewards/chosen": 0.8864560494056115, + "rewards/margins": 8.332916974056104, + "rewards/rejected": -7.446460924650493, + "step": 1295 + }, + { + "epoch": 0.47842739144478796, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5.467201707119501e-06, + "logits/chosen": 1848791255.5789473, + "logits/rejected": 1642036617.8461537, + "logps/chosen": -291.47499486019734, + "logps/rejected": -449.3918644831731, + "loss": 0.1626, + "rewards/chosen": 1.54037134270919, + "rewards/margins": 8.333451599244647, + "rewards/rejected": -6.793080256535457, + "step": 1296 + }, + { + "epoch": 0.4787965483826312, + "grad_norm": 13.3125, + "kl": 3.539304733276367, + "learning_rate": 5.46134179731651e-06, + "logits/chosen": 1823378162.5263157, + "logits/rejected": 2818908790.1538463, + "logps/chosen": -294.7251747532895, + "logps/rejected": -612.2462439903846, + "loss": 0.169, + "rewards/chosen": 1.6242385663484271, + "rewards/margins": 12.370448517895904, + "rewards/rejected": -10.746209951547476, + "step": 1297 + }, + { + "epoch": 0.47916570532047437, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5.455481248330322e-06, + "logits/chosen": 1261615616.0, + "logits/rejected": 1412735744.0, + "logps/chosen": -264.7585144042969, + "logps/rejected": -418.4679260253906, + "loss": 0.1403, + "rewards/chosen": 1.4340113401412964, + "rewards/margins": 9.135201334953308, + "rewards/rejected": -7.701189994812012, + "step": 1298 + }, + { + "epoch": 0.47953486225831754, + "grad_norm": 13.6875, + "kl": 1.9575705528259277, + "learning_rate": 5.44962006828065e-06, + "logits/chosen": 1970439054.2222223, + "logits/rejected": 1931296036.5714285, + "logps/chosen": -267.701416015625, + "logps/rejected": -353.58754185267856, + "loss": 0.1906, + "rewards/chosen": 1.2216451432969835, + "rewards/margins": 6.979108326018803, + "rewards/rejected": -5.7574631827218195, + "step": 1299 + }, + { + "epoch": 0.47990401919616077, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5.443758265288086e-06, + "logits/chosen": 2252687633.0666666, + "logits/rejected": 2124215476.7058823, + "logps/chosen": -314.2978190104167, + "logps/rejected": -446.45407284007354, + "loss": 0.1466, + "rewards/chosen": 1.422787602742513, + "rewards/margins": 8.494847682878083, + "rewards/rejected": -7.07206008013557, + "step": 1300 + }, + { + "epoch": 0.48027317613400394, + "grad_norm": 11.25, + "kl": 0.14725112915039062, + "learning_rate": 5.4378958474740826e-06, + "logits/chosen": 1790022314.6666667, + "logits/rejected": 1539240417.8823528, + "logps/chosen": -227.241259765625, + "logps/rejected": -506.6637752757353, + "loss": 0.1558, + "rewards/chosen": 1.1646222432454427, + "rewards/margins": 8.25747376984241, + "rewards/rejected": -7.0928515265969665, + "step": 1301 + }, + { + "epoch": 0.48064233307184717, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5.4320328229609475e-06, + "logits/chosen": 2051667429.0526316, + "logits/rejected": 2811045730.4615383, + "logps/chosen": -335.99794407894734, + "logps/rejected": -583.3707932692307, + "loss": 0.2024, + "rewards/chosen": 1.106044468126799, + "rewards/margins": 11.199666529049274, + "rewards/rejected": -10.093622060922476, + "step": 1302 + }, + { + "epoch": 0.48101149000969035, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.426169199871824e-06, + "logits/chosen": 1664647987.2, + "logits/rejected": 1353760286.1176472, + "logps/chosen": -268.83429361979165, + "logps/rejected": -493.99718520220586, + "loss": 0.156, + "rewards/chosen": 1.0748116811116537, + "rewards/margins": 8.194945024976544, + "rewards/rejected": -7.1201333438648895, + "step": 1303 + }, + { + "epoch": 0.4813806469475336, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5.42030498633069e-06, + "logits/chosen": 1515598279.1111112, + "logits/rejected": 1268363410.2857144, + "logps/chosen": -314.759033203125, + "logps/rejected": -418.04921177455356, + "loss": 0.1764, + "rewards/chosen": 1.0811715655856662, + "rewards/margins": 9.45944893549359, + "rewards/rejected": -8.378277369907924, + "step": 1304 + }, + { + "epoch": 0.48174980388537675, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.414440190462336e-06, + "logits/chosen": 2176743833.6, + "logits/rejected": 2097408843.2941177, + "logps/chosen": -308.03782552083334, + "logps/rejected": -498.0602022058824, + "loss": 0.1053, + "rewards/chosen": 1.671562703450521, + "rewards/margins": 9.62218885234758, + "rewards/rejected": -7.950626148897059, + "step": 1305 + }, + { + "epoch": 0.48211896082322, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.408574820392364e-06, + "logits/chosen": 1815151104.0, + "logits/rejected": 2396517376.0, + "logps/chosen": -259.60369873046875, + "logps/rejected": -427.76171875, + "loss": 0.1642, + "rewards/chosen": 1.3711602687835693, + "rewards/margins": 6.6565539836883545, + "rewards/rejected": -5.285393714904785, + "step": 1306 + }, + { + "epoch": 0.48248811776106315, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.402708884247169e-06, + "logits/chosen": 1595960805.0526316, + "logits/rejected": 2386163239.3846154, + "logps/chosen": -224.62705592105263, + "logps/rejected": -451.2565730168269, + "loss": 0.1514, + "rewards/chosen": 1.7443267420718545, + "rewards/margins": 8.520897575718188, + "rewards/rejected": -6.776570833646334, + "step": 1307 + }, + { + "epoch": 0.4828572746989064, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5.39684239015393e-06, + "logits/chosen": 1928967964.4444444, + "logits/rejected": 1452820187.4285715, + "logps/chosen": -209.58490668402777, + "logps/rejected": -451.6669921875, + "loss": 0.1656, + "rewards/chosen": 1.1935692893134222, + "rewards/margins": 8.97541220225985, + "rewards/rejected": -7.781842912946429, + "step": 1308 + }, + { + "epoch": 0.48322643163674955, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5.390975346240602e-06, + "logits/chosen": 2107858156.3076923, + "logits/rejected": 1809082906.9473684, + "logps/chosen": -253.60445462740384, + "logps/rejected": -498.1011513157895, + "loss": 0.1147, + "rewards/chosen": 1.3990399287297175, + "rewards/margins": 9.54690241331031, + "rewards/rejected": -8.147862484580592, + "step": 1309 + }, + { + "epoch": 0.4835955885745928, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5.3851077606359e-06, + "logits/chosen": 1897576106.6666667, + "logits/rejected": 1946648576.0, + "logps/chosen": -258.45717366536456, + "logps/rejected": -470.247314453125, + "loss": 0.0966, + "rewards/chosen": 1.734079360961914, + "rewards/margins": 9.707441329956055, + "rewards/rejected": -7.973361968994141, + "step": 1310 + }, + { + "epoch": 0.48396474551243596, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.3792396414692895e-06, + "logits/chosen": 1775919585.8823528, + "logits/rejected": 2503379899.733333, + "logps/chosen": -246.5421932444853, + "logps/rejected": -598.6277994791667, + "loss": 0.1629, + "rewards/chosen": 1.3088937647202437, + "rewards/margins": 9.692184882070505, + "rewards/rejected": -8.383291117350261, + "step": 1311 + }, + { + "epoch": 0.4843339024502792, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5.373370996870972e-06, + "logits/chosen": 1622649059.5555556, + "logits/rejected": 2106296466.2857144, + "logps/chosen": -262.3650716145833, + "logps/rejected": -441.1934291294643, + "loss": 0.1849, + "rewards/chosen": 1.3190646701388888, + "rewards/margins": 8.220709240625775, + "rewards/rejected": -6.901644570486886, + "step": 1312 + }, + { + "epoch": 0.48470305938812236, + "grad_norm": 12.5, + "kl": 0.2811737060546875, + "learning_rate": 5.367501834971882e-06, + "logits/chosen": 1692248907.2941177, + "logits/rejected": 1264151210.6666667, + "logps/chosen": -261.3942440257353, + "logps/rejected": -392.6326171875, + "loss": 0.1545, + "rewards/chosen": 1.4590644836425781, + "rewards/margins": 9.312414805094402, + "rewards/rejected": -7.853350321451823, + "step": 1313 + }, + { + "epoch": 0.4850722163259656, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.3616321639036685e-06, + "logits/chosen": 2033010414.9333334, + "logits/rejected": 2227374200.470588, + "logps/chosen": -239.54557291666666, + "logps/rejected": -505.8941865808824, + "loss": 0.1624, + "rewards/chosen": 0.873802121480306, + "rewards/margins": 7.979135337530398, + "rewards/rejected": -7.1053332160500915, + "step": 1314 + }, + { + "epoch": 0.48544137326380876, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.355761991798688e-06, + "logits/chosen": 1659141120.0, + "logits/rejected": 1465181952.0, + "logps/chosen": -228.775390625, + "logps/rejected": -525.4037475585938, + "loss": 0.1364, + "rewards/chosen": 1.44304358959198, + "rewards/margins": 8.97947871685028, + "rewards/rejected": -7.536435127258301, + "step": 1315 + }, + { + "epoch": 0.485810530201652, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.3498913267899864e-06, + "logits/chosen": 1621960476.4444444, + "logits/rejected": 1757743981.7142856, + "logps/chosen": -343.6247829861111, + "logps/rejected": -511.4305943080357, + "loss": 0.1218, + "rewards/chosen": 1.9471244812011719, + "rewards/margins": 9.948751722063337, + "rewards/rejected": -8.001627240862165, + "step": 1316 + }, + { + "epoch": 0.48617968713949516, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5.344020177011297e-06, + "logits/chosen": 2415370671.1578946, + "logits/rejected": 1842721713.2307692, + "logps/chosen": -259.13987972861844, + "logps/rejected": -688.4924128605769, + "loss": 0.184, + "rewards/chosen": 1.2633466218647205, + "rewards/margins": 8.736241143724696, + "rewards/rejected": -7.472894521859976, + "step": 1317 + }, + { + "epoch": 0.4865488440773384, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5.3381485505970235e-06, + "logits/chosen": 1950052547.047619, + "logits/rejected": 1725277835.6363637, + "logps/chosen": -236.72470238095238, + "logps/rejected": -436.8209339488636, + "loss": 0.1326, + "rewards/chosen": 2.096772693452381, + "rewards/margins": 9.318574451264881, + "rewards/rejected": -7.2218017578125, + "step": 1318 + }, + { + "epoch": 0.48691800101518157, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5.3322764556822296e-06, + "logits/chosen": 1750813696.0, + "logits/rejected": 1646877568.0, + "logps/chosen": -296.6430969238281, + "logps/rejected": -605.7150268554688, + "loss": 0.1054, + "rewards/chosen": 1.9307348728179932, + "rewards/margins": 10.04166054725647, + "rewards/rejected": -8.110925674438477, + "step": 1319 + }, + { + "epoch": 0.4872871579530248, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5.326403900402627e-06, + "logits/chosen": 2010110537.142857, + "logits/rejected": 1866685440.0, + "logps/chosen": -327.74204799107144, + "logps/rejected": -416.50157335069446, + "loss": 0.1185, + "rewards/chosen": 1.6507712772914342, + "rewards/margins": 8.491193589710054, + "rewards/rejected": -6.84042231241862, + "step": 1320 + }, + { + "epoch": 0.48765631489086797, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.3205308928945676e-06, + "logits/chosen": 1597392896.0, + "logits/rejected": 1690978490.1818182, + "logps/chosen": -269.39183407738096, + "logps/rejected": -560.1590465198864, + "loss": 0.1775, + "rewards/chosen": 1.6533410208565849, + "rewards/margins": 9.039959895146357, + "rewards/rejected": -7.3866188742897725, + "step": 1321 + }, + { + "epoch": 0.4880254718287112, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5.314657441295028e-06, + "logits/chosen": 2454639023.1578946, + "logits/rejected": 2170288443.076923, + "logps/chosen": -312.21548622532896, + "logps/rejected": -450.6037785456731, + "loss": 0.1787, + "rewards/chosen": 1.2701740264892578, + "rewards/margins": 7.924861761239859, + "rewards/rejected": -6.654687734750601, + "step": 1322 + }, + { + "epoch": 0.4883946287665544, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5.308783553741602e-06, + "logits/chosen": 2362375168.0, + "logits/rejected": 2158749696.0, + "logps/chosen": -309.65386962890625, + "logps/rejected": -571.8545532226562, + "loss": 0.133, + "rewards/chosen": 1.2727434635162354, + "rewards/margins": 10.859785318374634, + "rewards/rejected": -9.587041854858398, + "step": 1323 + }, + { + "epoch": 0.4887637857043976, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5.302909238372485e-06, + "logits/chosen": 1591755044.5714285, + "logits/rejected": 2099575694.2222223, + "logps/chosen": -263.10934012276783, + "logps/rejected": -456.1965060763889, + "loss": 0.1151, + "rewards/chosen": 1.5688112803867884, + "rewards/margins": 8.858124702695816, + "rewards/rejected": -7.289313422309028, + "step": 1324 + }, + { + "epoch": 0.4891329426422408, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.297034503326466e-06, + "logits/chosen": 2792816128.0, + "logits/rejected": 2558228224.0, + "logps/chosen": -331.14752197265625, + "logps/rejected": -460.96673583984375, + "loss": 0.111, + "rewards/chosen": 2.3612864017486572, + "rewards/margins": 9.325124502182007, + "rewards/rejected": -6.96383810043335, + "step": 1325 + }, + { + "epoch": 0.489502099580084, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5.291159356742918e-06, + "logits/chosen": 1155911439.0588236, + "logits/rejected": 1325107609.6, + "logps/chosen": -281.9095818014706, + "logps/rejected": -457.8368815104167, + "loss": 0.1638, + "rewards/chosen": 1.524407330681296, + "rewards/margins": 8.071737551221661, + "rewards/rejected": -6.547330220540364, + "step": 1326 + }, + { + "epoch": 0.4898712565179272, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5.285283806761778e-06, + "logits/chosen": 1676260124.4444444, + "logits/rejected": 1350570569.142857, + "logps/chosen": -307.65087890625, + "logps/rejected": -648.8137555803571, + "loss": 0.208, + "rewards/chosen": 0.8008493847317166, + "rewards/margins": 10.45610429370214, + "rewards/rejected": -9.655254908970424, + "step": 1327 + }, + { + "epoch": 0.4902404134557704, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5.27940786152355e-06, + "logits/chosen": 2108298854.4, + "logits/rejected": 2100041848.4705882, + "logps/chosen": -242.38282877604166, + "logps/rejected": -528.9017693014706, + "loss": 0.1083, + "rewards/chosen": 2.044091796875, + "rewards/margins": 10.404807775160846, + "rewards/rejected": -8.360715978285846, + "step": 1328 + }, + { + "epoch": 0.4906095703936136, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5.27353152916928e-06, + "logits/chosen": 2099923382.857143, + "logits/rejected": 2052806428.4444444, + "logps/chosen": -201.33091517857142, + "logps/rejected": -517.92578125, + "loss": 0.1399, + "rewards/chosen": 2.0609978267124722, + "rewards/margins": 10.126928511120024, + "rewards/rejected": -8.065930684407553, + "step": 1329 + }, + { + "epoch": 0.4909787273314568, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5.267654817840552e-06, + "logits/chosen": 1372736307.2, + "logits/rejected": 1720474453.3333333, + "logps/chosen": -216.452001953125, + "logps/rejected": -536.033203125, + "loss": 0.1723, + "rewards/chosen": 1.4906314849853515, + "rewards/margins": 8.892738087972004, + "rewards/rejected": -7.402106602986653, + "step": 1330 + }, + { + "epoch": 0.4913478842693, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.261777735679472e-06, + "logits/chosen": 1934565504.0, + "logits/rejected": 1465056768.0, + "logps/chosen": -330.0137023925781, + "logps/rejected": -426.4817810058594, + "loss": 0.1562, + "rewards/chosen": 1.5575343370437622, + "rewards/margins": 9.330515027046204, + "rewards/rejected": -7.772980690002441, + "step": 1331 + }, + { + "epoch": 0.4917170412071432, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5.255900290828666e-06, + "logits/chosen": 1822771386.1818182, + "logits/rejected": 2649400466.285714, + "logps/chosen": -279.80051491477275, + "logps/rejected": -627.8981584821429, + "loss": 0.1246, + "rewards/chosen": 0.8953801935369318, + "rewards/margins": 10.463937932794744, + "rewards/rejected": -9.568557739257812, + "step": 1332 + }, + { + "epoch": 0.4920861981449864, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5.250022491431259e-06, + "logits/chosen": 2551811614.117647, + "logits/rejected": 1603687219.2, + "logps/chosen": -254.9582950367647, + "logps/rejected": -582.7176432291667, + "loss": 0.1716, + "rewards/chosen": 0.9330510532154757, + "rewards/margins": 10.603259793449851, + "rewards/rejected": -9.670208740234376, + "step": 1333 + }, + { + "epoch": 0.4924553550828296, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5.2441443456308665e-06, + "logits/chosen": 2742068480.0, + "logits/rejected": 2128558336.0, + "logps/chosen": -348.3977355957031, + "logps/rejected": -369.847900390625, + "loss": 0.1403, + "rewards/chosen": 1.4834749698638916, + "rewards/margins": 7.628100633621216, + "rewards/rejected": -6.144625663757324, + "step": 1334 + }, + { + "epoch": 0.4928245120206728, + "grad_norm": 11.625, + "kl": 0.07767105102539062, + "learning_rate": 5.238265861571585e-06, + "logits/chosen": 1710151680.0, + "logits/rejected": 2514321408.0, + "logps/chosen": -290.6258544921875, + "logps/rejected": -567.0692274305555, + "loss": 0.1354, + "rewards/chosen": 1.4652857099260603, + "rewards/margins": 10.232826535663907, + "rewards/rejected": -8.767540825737846, + "step": 1335 + }, + { + "epoch": 0.49319366895851596, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5.232387047397979e-06, + "logits/chosen": 1729505664.0, + "logits/rejected": 1448064384.0, + "logps/chosen": -295.0443115234375, + "logps/rejected": -538.0873413085938, + "loss": 0.0943, + "rewards/chosen": 2.2136785984039307, + "rewards/margins": 11.146042108535767, + "rewards/rejected": -8.932363510131836, + "step": 1336 + }, + { + "epoch": 0.4935628258963592, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5.226507911255071e-06, + "logits/chosen": 2066705314.909091, + "logits/rejected": 1949240368.7619047, + "logps/chosen": -277.5948597301136, + "logps/rejected": -467.56854538690476, + "loss": 0.0581, + "rewards/chosen": 2.2151366147128018, + "rewards/margins": 9.279992281100451, + "rewards/rejected": -7.064855666387649, + "step": 1337 + }, + { + "epoch": 0.49393198283420237, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5.22062846128833e-06, + "logits/chosen": 2080055705.6, + "logits/rejected": 1303627655.5294118, + "logps/chosen": -286.93860677083336, + "logps/rejected": -413.72351792279414, + "loss": 0.1567, + "rewards/chosen": 1.5726119995117187, + "rewards/margins": 8.59171869614545, + "rewards/rejected": -7.019106696633732, + "step": 1338 + }, + { + "epoch": 0.4943011397720456, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 5.214748705643659e-06, + "logits/chosen": 2291966674.8235292, + "logits/rejected": 2666770158.9333334, + "logps/chosen": -286.4226505055147, + "logps/rejected": -495.48606770833334, + "loss": 0.1613, + "rewards/chosen": 1.468365052167107, + "rewards/margins": 7.12423714282466, + "rewards/rejected": -5.655872090657552, + "step": 1339 + }, + { + "epoch": 0.49467029670988877, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5.208868652467385e-06, + "logits/chosen": 2536419913.142857, + "logits/rejected": 1855157703.1111112, + "logps/chosen": -287.57125418526783, + "logps/rejected": -547.8064778645834, + "loss": 0.1029, + "rewards/chosen": 1.8387325831821986, + "rewards/margins": 10.348413437131851, + "rewards/rejected": -8.509680853949654, + "step": 1340 + }, + { + "epoch": 0.495039453647732, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5.202988309906246e-06, + "logits/chosen": 2133601757.8666666, + "logits/rejected": 2206402439.529412, + "logps/chosen": -291.4091471354167, + "logps/rejected": -535.5203929227941, + "loss": 0.1385, + "rewards/chosen": 1.5718788146972655, + "rewards/margins": 8.556912635354434, + "rewards/rejected": -6.985033820657169, + "step": 1341 + }, + { + "epoch": 0.49540861058557517, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5.1971076861073825e-06, + "logits/chosen": 2214473142.857143, + "logits/rejected": 1831108380.4444444, + "logps/chosen": -376.65464564732144, + "logps/rejected": -551.3812934027778, + "loss": 0.1348, + "rewards/chosen": 1.209993634905134, + "rewards/margins": 9.563559274824838, + "rewards/rejected": -8.353565639919704, + "step": 1342 + }, + { + "epoch": 0.4957777675234184, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.1912267892183245e-06, + "logits/chosen": 1305046528.0, + "logits/rejected": 1471241984.0, + "logps/chosen": -227.49436950683594, + "logps/rejected": -444.3511657714844, + "loss": 0.1576, + "rewards/chosen": 1.2611172199249268, + "rewards/margins": 8.324685335159302, + "rewards/rejected": -7.063568115234375, + "step": 1343 + }, + { + "epoch": 0.4961469244612616, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5.1853456273869794e-06, + "logits/chosen": 1759601868.8, + "logits/rejected": 2462648199.529412, + "logps/chosen": -254.67054036458333, + "logps/rejected": -407.61345358455884, + "loss": 0.0916, + "rewards/chosen": 2.173112233479818, + "rewards/margins": 8.280657525156059, + "rewards/rejected": -6.1075452916762405, + "step": 1344 + }, + { + "epoch": 0.4965160813991048, + "grad_norm": 13.0625, + "kl": 0.4599893093109131, + "learning_rate": 5.179464208761622e-06, + "logits/chosen": 2120657498.3529413, + "logits/rejected": 1712831146.6666667, + "logps/chosen": -273.67503446691177, + "logps/rejected": -375.01930338541666, + "loss": 0.2057, + "rewards/chosen": 1.2168242510627298, + "rewards/margins": 8.43655793433096, + "rewards/rejected": -7.219733683268229, + "step": 1345 + }, + { + "epoch": 0.496885238336948, + "grad_norm": 10.25, + "kl": 0.060272216796875, + "learning_rate": 5.173582541490886e-06, + "logits/chosen": 1636279296.0, + "logits/rejected": 1574219264.0, + "logps/chosen": -261.7189636230469, + "logps/rejected": -407.4404602050781, + "loss": 0.1238, + "rewards/chosen": 1.6984626054763794, + "rewards/margins": 8.87115204334259, + "rewards/rejected": -7.172689437866211, + "step": 1346 + }, + { + "epoch": 0.4972543952747912, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5.167700633723742e-06, + "logits/chosen": 1963524577.8823528, + "logits/rejected": 1844099754.6666667, + "logps/chosen": -344.1368049172794, + "logps/rejected": -596.7352213541667, + "loss": 0.1501, + "rewards/chosen": 1.524704989264993, + "rewards/margins": 10.832813509772805, + "rewards/rejected": -9.308108520507812, + "step": 1347 + }, + { + "epoch": 0.4976235522126344, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5.1618184936095e-06, + "logits/chosen": 1494571331.368421, + "logits/rejected": 2322222316.3076925, + "logps/chosen": -261.13998252467104, + "logps/rejected": -608.8333458533654, + "loss": 0.1837, + "rewards/chosen": 1.244883185938785, + "rewards/margins": 12.026020505650322, + "rewards/rejected": -10.781137319711538, + "step": 1348 + }, + { + "epoch": 0.4979927091504776, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5.1559361292977915e-06, + "logits/chosen": 1926492979.2, + "logits/rejected": 1670428792.4705882, + "logps/chosen": -249.45784505208334, + "logps/rejected": -443.8468232996324, + "loss": 0.1419, + "rewards/chosen": 1.5241971333821616, + "rewards/margins": 7.23117093852922, + "rewards/rejected": -5.706973805147059, + "step": 1349 + }, + { + "epoch": 0.4983618660883208, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5.150053548938557e-06, + "logits/chosen": 1448998326.857143, + "logits/rejected": 1743995562.6666667, + "logps/chosen": -291.0453404017857, + "logps/rejected": -548.1700303819445, + "loss": 0.098, + "rewards/chosen": 1.7653214590890067, + "rewards/margins": 10.375164395286923, + "rewards/rejected": -8.609842936197916, + "step": 1350 + }, + { + "epoch": 0.498731023026164, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5.1441707606820365e-06, + "logits/chosen": 1626287250.2857144, + "logits/rejected": 1895019613.090909, + "logps/chosen": -256.7495814732143, + "logps/rejected": -423.4954279119318, + "loss": 0.1837, + "rewards/chosen": 1.4540764944893974, + "rewards/margins": 8.532113409661628, + "rewards/rejected": -7.07803691517223, + "step": 1351 + }, + { + "epoch": 0.4991001799640072, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5.138287772678759e-06, + "logits/chosen": 1596020508.4444444, + "logits/rejected": 1569261568.0, + "logps/chosen": -310.40757921006946, + "logps/rejected": -401.31009347098217, + "loss": 0.1367, + "rewards/chosen": 1.4865263832939997, + "rewards/margins": 9.287470575362917, + "rewards/rejected": -7.8009441920689175, + "step": 1352 + }, + { + "epoch": 0.4994693369018504, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5.132404593079531e-06, + "logits/chosen": 1452183311.0588236, + "logits/rejected": 1540505053.8666666, + "logps/chosen": -243.78079044117646, + "logps/rejected": -455.63525390625, + "loss": 0.1294, + "rewards/chosen": 1.6914053524241728, + "rewards/margins": 9.131033953498392, + "rewards/rejected": -7.439628601074219, + "step": 1353 + }, + { + "epoch": 0.4998384938396936, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5.1265212300354205e-06, + "logits/chosen": 2254913536.0, + "logits/rejected": 1557009521.7777777, + "logps/chosen": -321.76224190848217, + "logps/rejected": -428.58930121527777, + "loss": 0.1698, + "rewards/chosen": 0.6745338439941406, + "rewards/margins": 8.375309838189018, + "rewards/rejected": -7.700775994194879, + "step": 1354 + }, + { + "epoch": 0.5002076507775368, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5.120637691697753e-06, + "logits/chosen": 1679704064.0, + "logits/rejected": 1817905639.6190476, + "logps/chosen": -338.9024547230114, + "logps/rejected": -450.47251674107144, + "loss": 0.1163, + "rewards/chosen": 1.8095938942649148, + "rewards/margins": 9.150026197557326, + "rewards/rejected": -7.340432303292411, + "step": 1355 + } + ], + "logging_steps": 1, + "max_steps": 2709, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1355, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}