{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.6, "eval_steps": 500, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 2.3092777729034424, "learning_rate": 3.6000000000000005e-08, "logits/chosen": 0.7764996886253357, "logits/rejected": 0.8174192309379578, "logps/chosen": -195.11270141601562, "logps/rejected": -207.083251953125, "loss": 0.6965, "rewards/accuracies": 0.30000001192092896, "rewards/chosen": -0.0012020017020404339, "rewards/margins": -0.006086898501962423, "rewards/rejected": 0.004884896334260702, "step": 10 }, { "epoch": 0.016, "grad_norm": 3.044440507888794, "learning_rate": 7.6e-08, "logits/chosen": 0.8046936988830566, "logits/rejected": 0.7519802451133728, "logps/chosen": -196.82322692871094, "logps/rejected": -199.39141845703125, "loss": 0.6877, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.0069522010162472725, "rewards/margins": 0.01239713840186596, "rewards/rejected": -0.005444936919957399, "step": 20 }, { "epoch": 0.024, "grad_norm": 2.9533703327178955, "learning_rate": 1.16e-07, "logits/chosen": 0.7309688925743103, "logits/rejected": 0.6841751933097839, "logps/chosen": -201.09934997558594, "logps/rejected": -231.3838653564453, "loss": 0.6962, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0034579948987811804, "rewards/margins": -0.00475282734259963, "rewards/rejected": 0.0012948326766490936, "step": 30 }, { "epoch": 0.032, "grad_norm": 3.043637990951538, "learning_rate": 1.56e-07, "logits/chosen": 0.707830548286438, "logits/rejected": 0.6798078417778015, "logps/chosen": -217.506103515625, "logps/rejected": -214.7266845703125, "loss": 0.695, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.006588793825358152, "rewards/margins": -0.0025365306064486504, "rewards/rejected": -0.004052260424941778, "step": 40 }, { "epoch": 0.04, "grad_norm": 2.9874885082244873, "learning_rate": 1.96e-07, "logits/chosen": 0.6864386796951294, "logits/rejected": 0.6313192248344421, "logps/chosen": -215.640869140625, "logps/rejected": -208.9573211669922, "loss": 0.688, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": -0.0006361968698911369, "rewards/margins": 0.011526194401085377, "rewards/rejected": -0.012162390165030956, "step": 50 }, { "epoch": 0.048, "grad_norm": 2.3293204307556152, "learning_rate": 2.3600000000000002e-07, "logits/chosen": 0.7226251363754272, "logits/rejected": 0.826255738735199, "logps/chosen": -202.46560668945312, "logps/rejected": -201.2319793701172, "loss": 0.6887, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": 0.007742859423160553, "rewards/margins": 0.010210123844444752, "rewards/rejected": -0.0024672651197761297, "step": 60 }, { "epoch": 0.056, "grad_norm": 2.408390522003174, "learning_rate": 2.7600000000000004e-07, "logits/chosen": 0.6049055457115173, "logits/rejected": 0.6685499548912048, "logps/chosen": -193.7244873046875, "logps/rejected": -202.5239715576172, "loss": 0.6949, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.004926090594381094, "rewards/margins": -0.0017412189627066255, "rewards/rejected": -0.0031848729122430086, "step": 70 }, { "epoch": 0.064, "grad_norm": 3.6175403594970703, "learning_rate": 3.160000000000001e-07, "logits/chosen": 0.5864802598953247, "logits/rejected": 0.6271843314170837, "logps/chosen": -202.54612731933594, "logps/rejected": -233.43150329589844, "loss": 0.6904, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.00966743566095829, "rewards/margins": 0.006970310118049383, "rewards/rejected": 0.002697124844416976, "step": 80 }, { "epoch": 0.072, "grad_norm": 2.3893001079559326, "learning_rate": 3.56e-07, "logits/chosen": 0.730164647102356, "logits/rejected": 0.713482677936554, "logps/chosen": -195.13555908203125, "logps/rejected": -219.647216796875, "loss": 0.6963, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": 0.011823897249996662, "rewards/margins": -0.005143971648067236, "rewards/rejected": 0.01696786843240261, "step": 90 }, { "epoch": 0.08, "grad_norm": 2.7904205322265625, "learning_rate": 3.9600000000000005e-07, "logits/chosen": 0.5790210962295532, "logits/rejected": 0.6835765838623047, "logps/chosen": -202.36647033691406, "logps/rejected": -213.2338104248047, "loss": 0.696, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.004648447968065739, "rewards/margins": -0.004634796176105738, "rewards/rejected": -1.3651699191541411e-05, "step": 100 }, { "epoch": 0.088, "grad_norm": 2.350283145904541, "learning_rate": 4.3600000000000004e-07, "logits/chosen": 0.8016487956047058, "logits/rejected": 0.8312139511108398, "logps/chosen": -202.82815551757812, "logps/rejected": -218.19920349121094, "loss": 0.6913, "rewards/accuracies": 0.4375, "rewards/chosen": 0.01035931147634983, "rewards/margins": 0.005032673478126526, "rewards/rejected": 0.005326639395207167, "step": 110 }, { "epoch": 0.096, "grad_norm": 2.2766993045806885, "learning_rate": 4.760000000000001e-07, "logits/chosen": 0.8250360488891602, "logits/rejected": 0.835590660572052, "logps/chosen": -196.98216247558594, "logps/rejected": -210.0322723388672, "loss": 0.6919, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00589400390163064, "rewards/margins": 0.003922100644558668, "rewards/rejected": 0.0019719023257493973, "step": 120 }, { "epoch": 0.104, "grad_norm": 2.0690665245056152, "learning_rate": 5.16e-07, "logits/chosen": 0.7067065238952637, "logits/rejected": 0.7281522154808044, "logps/chosen": -195.405029296875, "logps/rejected": -206.66079711914062, "loss": 0.6954, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.006292525213211775, "rewards/margins": -0.0033251051791012287, "rewards/rejected": 0.009617629460990429, "step": 130 }, { "epoch": 0.112, "grad_norm": 2.388901948928833, "learning_rate": 5.560000000000001e-07, "logits/chosen": 0.6768548488616943, "logits/rejected": 0.6542965173721313, "logps/chosen": -214.23342895507812, "logps/rejected": -216.52366638183594, "loss": 0.6939, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.012295923195779324, "rewards/margins": 8.453801456198562e-06, "rewards/rejected": -0.012304377742111683, "step": 140 }, { "epoch": 0.12, "grad_norm": 2.4146411418914795, "learning_rate": 5.960000000000001e-07, "logits/chosen": 0.8362228274345398, "logits/rejected": 0.780723512172699, "logps/chosen": -192.99110412597656, "logps/rejected": -182.7938232421875, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": 0.008289004676043987, "rewards/margins": -0.005621676333248615, "rewards/rejected": 0.013910681009292603, "step": 150 }, { "epoch": 0.128, "grad_norm": 2.408268690109253, "learning_rate": 6.360000000000001e-07, "logits/chosen": 0.6485953330993652, "logits/rejected": 0.7263766527175903, "logps/chosen": -191.4886016845703, "logps/rejected": -218.2458953857422, "loss": 0.7014, "rewards/accuracies": 0.4375, "rewards/chosen": 0.007325764745473862, "rewards/margins": -0.014910398982465267, "rewards/rejected": 0.022236162796616554, "step": 160 }, { "epoch": 0.136, "grad_norm": 2.3368568420410156, "learning_rate": 6.76e-07, "logits/chosen": 0.6823139190673828, "logits/rejected": 0.7047578692436218, "logps/chosen": -188.77145385742188, "logps/rejected": -230.02903747558594, "loss": 0.6928, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.009600992314517498, "rewards/margins": 0.0018820532131940126, "rewards/rejected": 0.007718939334154129, "step": 170 }, { "epoch": 0.144, "grad_norm": 2.562350273132324, "learning_rate": 7.16e-07, "logits/chosen": 0.7193002700805664, "logits/rejected": 0.6238381266593933, "logps/chosen": -199.7821807861328, "logps/rejected": -189.610595703125, "loss": 0.6955, "rewards/accuracies": 0.5625, "rewards/chosen": -0.00945024099200964, "rewards/margins": -0.003362303366884589, "rewards/rejected": -0.00608793692663312, "step": 180 }, { "epoch": 0.152, "grad_norm": 2.5308778285980225, "learning_rate": 7.56e-07, "logits/chosen": 0.7583810091018677, "logits/rejected": 0.7683423161506653, "logps/chosen": -179.4371795654297, "logps/rejected": -193.5501708984375, "loss": 0.6947, "rewards/accuracies": 0.45000001788139343, "rewards/chosen": 0.024644900113344193, "rewards/margins": -0.0013052498688921332, "rewards/rejected": 0.025950148701667786, "step": 190 }, { "epoch": 0.16, "grad_norm": 2.271794080734253, "learning_rate": 7.960000000000001e-07, "logits/chosen": 0.8137730956077576, "logits/rejected": 0.6786491274833679, "logps/chosen": -202.08787536621094, "logps/rejected": -198.05364990234375, "loss": 0.6975, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.01316139753907919, "rewards/margins": -0.007475724909454584, "rewards/rejected": -0.005685672629624605, "step": 200 }, { "epoch": 0.168, "grad_norm": 2.6240570545196533, "learning_rate": 8.36e-07, "logits/chosen": 0.6418821215629578, "logits/rejected": 0.7391870617866516, "logps/chosen": -202.360107421875, "logps/rejected": -229.84375, "loss": 0.6896, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.006038342602550983, "rewards/margins": 0.00873213168233633, "rewards/rejected": -0.002693791640922427, "step": 210 }, { "epoch": 0.176, "grad_norm": 2.454711675643921, "learning_rate": 8.760000000000001e-07, "logits/chosen": 0.6627649664878845, "logits/rejected": 0.5812577605247498, "logps/chosen": -231.45840454101562, "logps/rejected": -224.667724609375, "loss": 0.6921, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": -0.004710569512099028, "rewards/margins": 0.0032497793436050415, "rewards/rejected": -0.007960348390042782, "step": 220 }, { "epoch": 0.184, "grad_norm": 2.9100327491760254, "learning_rate": 9.160000000000001e-07, "logits/chosen": 0.7058368921279907, "logits/rejected": 0.6532104015350342, "logps/chosen": -209.2056884765625, "logps/rejected": -206.2039031982422, "loss": 0.6916, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.00045182276517152786, "rewards/margins": 0.004544996656477451, "rewards/rejected": -0.004093174822628498, "step": 230 }, { "epoch": 0.192, "grad_norm": 2.845057487487793, "learning_rate": 9.56e-07, "logits/chosen": 0.5265730023384094, "logits/rejected": 0.6017157435417175, "logps/chosen": -202.66600036621094, "logps/rejected": -229.9880828857422, "loss": 0.697, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.0027109149377793074, "rewards/margins": -0.00603170320391655, "rewards/rejected": 0.008742619305849075, "step": 240 }, { "epoch": 0.2, "grad_norm": 2.409635543823242, "learning_rate": 9.96e-07, "logits/chosen": 0.7155398726463318, "logits/rejected": 0.7485236525535583, "logps/chosen": -208.7382354736328, "logps/rejected": -224.94078063964844, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.021835366263985634, "rewards/margins": 0.009591616690158844, "rewards/rejected": 0.01224374771118164, "step": 250 }, { "epoch": 0.208, "grad_norm": 2.2228920459747314, "learning_rate": 1.0360000000000001e-06, "logits/chosen": 0.7711376547813416, "logits/rejected": 0.6718112230300903, "logps/chosen": -204.5934600830078, "logps/rejected": -199.9540252685547, "loss": 0.6978, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.006919704377651215, "rewards/margins": -0.008324065245687962, "rewards/rejected": 0.015243768692016602, "step": 260 }, { "epoch": 0.216, "grad_norm": 2.7563939094543457, "learning_rate": 1.0760000000000002e-06, "logits/chosen": 0.7490439414978027, "logits/rejected": 0.8420026898384094, "logps/chosen": -194.5001678466797, "logps/rejected": -194.9221954345703, "loss": 0.7006, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": 0.0004544306721072644, "rewards/margins": -0.01385478675365448, "rewards/rejected": 0.014309215359389782, "step": 270 }, { "epoch": 0.224, "grad_norm": 2.703477621078491, "learning_rate": 1.1160000000000002e-06, "logits/chosen": 0.654878556728363, "logits/rejected": 0.6851706504821777, "logps/chosen": -191.12637329101562, "logps/rejected": -179.5616455078125, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.002727155340835452, "rewards/margins": -0.005241455975919962, "rewards/rejected": 0.002514300402253866, "step": 280 }, { "epoch": 0.232, "grad_norm": 2.8370096683502197, "learning_rate": 1.156e-06, "logits/chosen": 0.7035982012748718, "logits/rejected": 0.6883268356323242, "logps/chosen": -198.1017608642578, "logps/rejected": -205.367431640625, "loss": 0.6936, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0034764811862260103, "rewards/margins": 0.0005613662651740015, "rewards/rejected": 0.0029151157941669226, "step": 290 }, { "epoch": 0.24, "grad_norm": 2.892529249191284, "learning_rate": 1.196e-06, "logits/chosen": 0.661008358001709, "logits/rejected": 0.6034583449363708, "logps/chosen": -197.7530975341797, "logps/rejected": -212.4963836669922, "loss": 0.6944, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.005977009888738394, "rewards/margins": -0.0009391207131557167, "rewards/rejected": 0.006916132755577564, "step": 300 }, { "epoch": 0.248, "grad_norm": 2.4562253952026367, "learning_rate": 1.2360000000000001e-06, "logits/chosen": 0.7135326266288757, "logits/rejected": 0.7250908017158508, "logps/chosen": -203.17738342285156, "logps/rejected": -208.8437042236328, "loss": 0.6925, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.0018625364173203707, "rewards/margins": 0.002679466502740979, "rewards/rejected": -0.0008169323555193841, "step": 310 }, { "epoch": 0.256, "grad_norm": 2.6124107837677, "learning_rate": 1.276e-06, "logits/chosen": 0.8584432601928711, "logits/rejected": 0.8116118311882019, "logps/chosen": -195.6661376953125, "logps/rejected": -196.38145446777344, "loss": 0.6932, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.005818118806928396, "rewards/margins": 0.0013451100094243884, "rewards/rejected": 0.004473009612411261, "step": 320 }, { "epoch": 0.264, "grad_norm": 2.860166311264038, "learning_rate": 1.316e-06, "logits/chosen": 0.6980428099632263, "logits/rejected": 0.8046857118606567, "logps/chosen": -189.9589080810547, "logps/rejected": -203.92202758789062, "loss": 0.694, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.018240585923194885, "rewards/margins": -0.0006985944928601384, "rewards/rejected": 0.018939180299639702, "step": 330 }, { "epoch": 0.272, "grad_norm": 2.8200535774230957, "learning_rate": 1.356e-06, "logits/chosen": 0.8003985285758972, "logits/rejected": 0.77861088514328, "logps/chosen": -201.60928344726562, "logps/rejected": -203.56936645507812, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.002375365002080798, "rewards/margins": 0.001602139906026423, "rewards/rejected": 0.0007732249796390533, "step": 340 }, { "epoch": 0.28, "grad_norm": 3.1359260082244873, "learning_rate": 1.396e-06, "logits/chosen": 0.7161771655082703, "logits/rejected": 0.6535243988037109, "logps/chosen": -196.94798278808594, "logps/rejected": -207.852783203125, "loss": 0.6921, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0128513528034091, "rewards/margins": 0.003678938141092658, "rewards/rejected": 0.009172416292130947, "step": 350 }, { "epoch": 0.288, "grad_norm": 3.2514476776123047, "learning_rate": 1.436e-06, "logits/chosen": 0.6619038581848145, "logits/rejected": 0.656987726688385, "logps/chosen": -200.4764862060547, "logps/rejected": -231.81895446777344, "loss": 0.6878, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.022911271080374718, "rewards/margins": 0.012182674370706081, "rewards/rejected": 0.010728596709668636, "step": 360 }, { "epoch": 0.296, "grad_norm": 2.784947156906128, "learning_rate": 1.4760000000000001e-06, "logits/chosen": 0.7618936896324158, "logits/rejected": 0.7150487899780273, "logps/chosen": -196.36228942871094, "logps/rejected": -211.08935546875, "loss": 0.6905, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.017282454296946526, "rewards/margins": 0.006800856441259384, "rewards/rejected": 0.010481595061719418, "step": 370 }, { "epoch": 0.304, "grad_norm": 2.753596067428589, "learning_rate": 1.5160000000000002e-06, "logits/chosen": 0.8928642272949219, "logits/rejected": 0.8450748324394226, "logps/chosen": -204.9441680908203, "logps/rejected": -198.3724822998047, "loss": 0.6932, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.029540909454226494, "rewards/margins": 0.0015256500337272882, "rewards/rejected": 0.028015261515975, "step": 380 }, { "epoch": 0.312, "grad_norm": 3.1084704399108887, "learning_rate": 1.556e-06, "logits/chosen": 0.8117288947105408, "logits/rejected": 0.766746461391449, "logps/chosen": -195.1800079345703, "logps/rejected": -199.2845001220703, "loss": 0.6961, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.01765977405011654, "rewards/margins": -0.004569740500301123, "rewards/rejected": 0.0222295131534338, "step": 390 }, { "epoch": 0.32, "grad_norm": 3.7862966060638428, "learning_rate": 1.596e-06, "logits/chosen": 0.8835588693618774, "logits/rejected": 0.8266305327415466, "logps/chosen": -208.58726501464844, "logps/rejected": -217.5359649658203, "loss": 0.6976, "rewards/accuracies": 0.38750001788139343, "rewards/chosen": 0.020054074004292488, "rewards/margins": -0.0072321416810154915, "rewards/rejected": 0.027286216616630554, "step": 400 }, { "epoch": 0.328, "grad_norm": 2.675278663635254, "learning_rate": 1.636e-06, "logits/chosen": 0.7499464750289917, "logits/rejected": 0.7868902087211609, "logps/chosen": -176.598876953125, "logps/rejected": -185.85235595703125, "loss": 0.697, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.015415973030030727, "rewards/margins": -0.006510419305413961, "rewards/rejected": 0.02192639373242855, "step": 410 }, { "epoch": 0.336, "grad_norm": 2.8932230472564697, "learning_rate": 1.6760000000000001e-06, "logits/chosen": 0.8424084782600403, "logits/rejected": 0.8695581555366516, "logps/chosen": -196.9973602294922, "logps/rejected": -203.99620056152344, "loss": 0.6901, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.015818921849131584, "rewards/margins": 0.007251453585922718, "rewards/rejected": 0.008567466400563717, "step": 420 }, { "epoch": 0.344, "grad_norm": 2.983330488204956, "learning_rate": 1.7160000000000002e-06, "logits/chosen": 0.5948252081871033, "logits/rejected": 0.6821457743644714, "logps/chosen": -192.26760864257812, "logps/rejected": -200.97569274902344, "loss": 0.6913, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.010373975150287151, "rewards/margins": 0.004705019760876894, "rewards/rejected": 0.005668954458087683, "step": 430 }, { "epoch": 0.352, "grad_norm": 3.401388168334961, "learning_rate": 1.7560000000000002e-06, "logits/chosen": 0.7682614922523499, "logits/rejected": 0.8079833984375, "logps/chosen": -201.40171813964844, "logps/rejected": -215.61692810058594, "loss": 0.6936, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.018279600888490677, "rewards/margins": 0.0005021900869905949, "rewards/rejected": 0.01777741126716137, "step": 440 }, { "epoch": 0.36, "grad_norm": 3.9692232608795166, "learning_rate": 1.7960000000000003e-06, "logits/chosen": 0.6712386012077332, "logits/rejected": 0.7559861540794373, "logps/chosen": -198.07620239257812, "logps/rejected": -226.9971466064453, "loss": 0.6865, "rewards/accuracies": 0.625, "rewards/chosen": 0.02878693677484989, "rewards/margins": 0.014408250339329243, "rewards/rejected": 0.014378686435520649, "step": 450 }, { "epoch": 0.368, "grad_norm": 2.5181920528411865, "learning_rate": 1.8360000000000003e-06, "logits/chosen": 0.6611719131469727, "logits/rejected": 0.7472667098045349, "logps/chosen": -186.1030731201172, "logps/rejected": -198.84902954101562, "loss": 0.6893, "rewards/accuracies": 0.5625, "rewards/chosen": 0.022867394611239433, "rewards/margins": 0.009127254597842693, "rewards/rejected": 0.01374014001339674, "step": 460 }, { "epoch": 0.376, "grad_norm": 3.491785764694214, "learning_rate": 1.8760000000000001e-06, "logits/chosen": 0.6375329494476318, "logits/rejected": 0.7117382287979126, "logps/chosen": -191.62840270996094, "logps/rejected": -192.08445739746094, "loss": 0.6936, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.013536167331039906, "rewards/margins": 0.0005845252308063209, "rewards/rejected": 0.012951642274856567, "step": 470 }, { "epoch": 0.384, "grad_norm": 3.381208896636963, "learning_rate": 1.916e-06, "logits/chosen": 0.5306805968284607, "logits/rejected": 0.6659603714942932, "logps/chosen": -200.27557373046875, "logps/rejected": -223.94517517089844, "loss": 0.6912, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.020672379061579704, "rewards/margins": 0.0055915359407663345, "rewards/rejected": 0.015080844052135944, "step": 480 }, { "epoch": 0.392, "grad_norm": 2.9275639057159424, "learning_rate": 1.956e-06, "logits/chosen": 0.7187590599060059, "logits/rejected": 0.7936305403709412, "logps/chosen": -198.9750213623047, "logps/rejected": -198.08738708496094, "loss": 0.6898, "rewards/accuracies": 0.5625, "rewards/chosen": 0.023504601791501045, "rewards/margins": 0.007863587699830532, "rewards/rejected": 0.01564101316034794, "step": 490 }, { "epoch": 0.4, "grad_norm": 2.6241567134857178, "learning_rate": 1.996e-06, "logits/chosen": 0.7280531525611877, "logits/rejected": 0.8506690859794617, "logps/chosen": -188.4355010986328, "logps/rejected": -195.9119110107422, "loss": 0.69, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03479978069663048, "rewards/margins": 0.007221006788313389, "rewards/rejected": 0.027578774839639664, "step": 500 }, { "epoch": 0.408, "grad_norm": 2.9502575397491455, "learning_rate": 2.036e-06, "logits/chosen": 0.7209169268608093, "logits/rejected": 0.6922793984413147, "logps/chosen": -197.63975524902344, "logps/rejected": -215.0830535888672, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.02393524721264839, "rewards/margins": 0.009743581525981426, "rewards/rejected": 0.01419166661798954, "step": 510 }, { "epoch": 0.416, "grad_norm": 2.9870102405548096, "learning_rate": 2.076e-06, "logits/chosen": 0.8047041296958923, "logits/rejected": 0.764294445514679, "logps/chosen": -202.94752502441406, "logps/rejected": -195.111572265625, "loss": 0.7018, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.029683226719498634, "rewards/margins": -0.015081966295838356, "rewards/rejected": 0.04476520046591759, "step": 520 }, { "epoch": 0.424, "grad_norm": 2.907334804534912, "learning_rate": 2.116e-06, "logits/chosen": 0.7759226560592651, "logits/rejected": 0.7076265215873718, "logps/chosen": -193.26431274414062, "logps/rejected": -193.8086700439453, "loss": 0.6929, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04167484864592552, "rewards/margins": 0.0013663482386618853, "rewards/rejected": 0.040308497846126556, "step": 530 }, { "epoch": 0.432, "grad_norm": 2.9025113582611084, "learning_rate": 2.156e-06, "logits/chosen": 0.674287736415863, "logits/rejected": 0.6436707377433777, "logps/chosen": -195.1566619873047, "logps/rejected": -226.0045166015625, "loss": 0.6892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.03686271235346794, "rewards/margins": 0.009263001382350922, "rewards/rejected": 0.02759971097111702, "step": 540 }, { "epoch": 0.44, "grad_norm": 3.0069284439086914, "learning_rate": 2.1960000000000002e-06, "logits/chosen": 0.5614029169082642, "logits/rejected": 0.618638277053833, "logps/chosen": -204.8896026611328, "logps/rejected": -218.7178497314453, "loss": 0.683, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.037945158779621124, "rewards/margins": 0.021600285544991493, "rewards/rejected": 0.01634487323462963, "step": 550 }, { "epoch": 0.448, "grad_norm": 3.801163673400879, "learning_rate": 2.2360000000000003e-06, "logits/chosen": 0.7861131429672241, "logits/rejected": 0.6601011753082275, "logps/chosen": -201.13636779785156, "logps/rejected": -198.4203643798828, "loss": 0.6883, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.056006718426942825, "rewards/margins": 0.011804056353867054, "rewards/rejected": 0.044202663004398346, "step": 560 }, { "epoch": 0.456, "grad_norm": 3.0970733165740967, "learning_rate": 2.2760000000000003e-06, "logits/chosen": 0.8429245352745056, "logits/rejected": 0.8626702427864075, "logps/chosen": -195.6461639404297, "logps/rejected": -192.6861572265625, "loss": 0.684, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.06717582792043686, "rewards/margins": 0.020122915506362915, "rewards/rejected": 0.047052908688783646, "step": 570 }, { "epoch": 0.464, "grad_norm": 3.105346918106079, "learning_rate": 2.3160000000000004e-06, "logits/chosen": 0.6571624875068665, "logits/rejected": 0.7619710564613342, "logps/chosen": -191.7879180908203, "logps/rejected": -222.39219665527344, "loss": 0.6883, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.056688953191041946, "rewards/margins": 0.010812098160386086, "rewards/rejected": 0.04587685689330101, "step": 580 }, { "epoch": 0.472, "grad_norm": 3.110258102416992, "learning_rate": 2.3560000000000004e-06, "logits/chosen": 0.6822569966316223, "logits/rejected": 0.6781023144721985, "logps/chosen": -215.3318328857422, "logps/rejected": -217.4102020263672, "loss": 0.689, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06999214738607407, "rewards/margins": 0.01019731443375349, "rewards/rejected": 0.0597948394715786, "step": 590 }, { "epoch": 0.48, "grad_norm": 3.3028745651245117, "learning_rate": 2.3960000000000004e-06, "logits/chosen": 0.6857186555862427, "logits/rejected": 0.8173799514770508, "logps/chosen": -199.44915771484375, "logps/rejected": -205.1499481201172, "loss": 0.6833, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07196470350027084, "rewards/margins": 0.021447917446494102, "rewards/rejected": 0.05051679164171219, "step": 600 }, { "epoch": 0.488, "grad_norm": 3.5192556381225586, "learning_rate": 2.4360000000000005e-06, "logits/chosen": 0.664252758026123, "logits/rejected": 0.7970002889633179, "logps/chosen": -183.03111267089844, "logps/rejected": -200.3971405029297, "loss": 0.6889, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09819995611906052, "rewards/margins": 0.01058993861079216, "rewards/rejected": 0.08761002123355865, "step": 610 }, { "epoch": 0.496, "grad_norm": 3.326347827911377, "learning_rate": 2.476e-06, "logits/chosen": 0.7640261054039001, "logits/rejected": 0.7449796795845032, "logps/chosen": -186.8088836669922, "logps/rejected": -198.9549560546875, "loss": 0.685, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.10616890341043472, "rewards/margins": 0.01862834393978119, "rewards/rejected": 0.08754055947065353, "step": 620 }, { "epoch": 0.504, "grad_norm": 3.3991470336914062, "learning_rate": 2.516e-06, "logits/chosen": 0.6187211275100708, "logits/rejected": 0.5761955380439758, "logps/chosen": -191.6369171142578, "logps/rejected": -208.64756774902344, "loss": 0.6814, "rewards/accuracies": 0.5625, "rewards/chosen": 0.10338740795850754, "rewards/margins": 0.028011484071612358, "rewards/rejected": 0.07537592202425003, "step": 630 }, { "epoch": 0.512, "grad_norm": 3.6961252689361572, "learning_rate": 2.556e-06, "logits/chosen": 0.6848690509796143, "logits/rejected": 0.679492175579071, "logps/chosen": -211.4331512451172, "logps/rejected": -206.33084106445312, "loss": 0.6836, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.09747160971164703, "rewards/margins": 0.0221050176769495, "rewards/rejected": 0.07536659389734268, "step": 640 }, { "epoch": 0.52, "grad_norm": 3.4172682762145996, "learning_rate": 2.5960000000000002e-06, "logits/chosen": 0.6366490721702576, "logits/rejected": 0.5817402005195618, "logps/chosen": -209.3129425048828, "logps/rejected": -210.4381866455078, "loss": 0.6758, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12278693169355392, "rewards/margins": 0.03931554779410362, "rewards/rejected": 0.08347138017416, "step": 650 }, { "epoch": 0.528, "grad_norm": 3.1855289936065674, "learning_rate": 2.6360000000000003e-06, "logits/chosen": 0.6927405595779419, "logits/rejected": 0.6666523218154907, "logps/chosen": -204.4365234375, "logps/rejected": -215.52566528320312, "loss": 0.6857, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11687055975198746, "rewards/margins": 0.01899743638932705, "rewards/rejected": 0.09787313640117645, "step": 660 }, { "epoch": 0.536, "grad_norm": 3.221082925796509, "learning_rate": 2.6760000000000003e-06, "logits/chosen": 0.675836980342865, "logits/rejected": 0.6653029322624207, "logps/chosen": -190.93809509277344, "logps/rejected": -192.1734161376953, "loss": 0.6701, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.16091938316822052, "rewards/margins": 0.05060316249728203, "rewards/rejected": 0.1103162169456482, "step": 670 }, { "epoch": 0.544, "grad_norm": 3.2758514881134033, "learning_rate": 2.7160000000000003e-06, "logits/chosen": 0.6717751622200012, "logits/rejected": 0.5811682343482971, "logps/chosen": -208.1886749267578, "logps/rejected": -225.54039001464844, "loss": 0.6732, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.17058324813842773, "rewards/margins": 0.046290840953588486, "rewards/rejected": 0.12429242581129074, "step": 680 }, { "epoch": 0.552, "grad_norm": 3.140164375305176, "learning_rate": 2.7560000000000004e-06, "logits/chosen": 0.7419080138206482, "logits/rejected": 0.6190251111984253, "logps/chosen": -193.7239227294922, "logps/rejected": -195.77371215820312, "loss": 0.6682, "rewards/accuracies": 0.625, "rewards/chosen": 0.2087523490190506, "rewards/margins": 0.05501072481274605, "rewards/rejected": 0.15374161303043365, "step": 690 }, { "epoch": 0.56, "grad_norm": 3.4456276893615723, "learning_rate": 2.7960000000000004e-06, "logits/chosen": 0.7370930910110474, "logits/rejected": 0.8321747779846191, "logps/chosen": -186.71826171875, "logps/rejected": -214.2365264892578, "loss": 0.6777, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.22456181049346924, "rewards/margins": 0.03610905632376671, "rewards/rejected": 0.18845276534557343, "step": 700 }, { "epoch": 0.568, "grad_norm": 3.7341623306274414, "learning_rate": 2.8360000000000005e-06, "logits/chosen": 0.7567041516304016, "logits/rejected": 0.7720674872398376, "logps/chosen": -190.39320373535156, "logps/rejected": -222.18826293945312, "loss": 0.6632, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.22317533195018768, "rewards/margins": 0.06986651569604874, "rewards/rejected": 0.15330885350704193, "step": 710 }, { "epoch": 0.576, "grad_norm": 2.9456615447998047, "learning_rate": 2.8760000000000005e-06, "logits/chosen": 0.6696327328681946, "logits/rejected": 0.697002649307251, "logps/chosen": -203.71766662597656, "logps/rejected": -222.6426239013672, "loss": 0.6658, "rewards/accuracies": 0.625, "rewards/chosen": 0.22747135162353516, "rewards/margins": 0.06573095172643661, "rewards/rejected": 0.16174040734767914, "step": 720 }, { "epoch": 0.584, "grad_norm": 3.985300302505493, "learning_rate": 2.9160000000000005e-06, "logits/chosen": 0.6535480618476868, "logits/rejected": 0.7352523803710938, "logps/chosen": -202.01422119140625, "logps/rejected": -223.960693359375, "loss": 0.6824, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.1612798571586609, "rewards/margins": 0.03076176717877388, "rewards/rejected": 0.1305180937051773, "step": 730 }, { "epoch": 0.592, "grad_norm": 3.8837013244628906, "learning_rate": 2.956e-06, "logits/chosen": 0.6671428084373474, "logits/rejected": 0.6649322509765625, "logps/chosen": -191.92437744140625, "logps/rejected": -226.7194366455078, "loss": 0.6725, "rewards/accuracies": 0.625, "rewards/chosen": 0.16540491580963135, "rewards/margins": 0.051003750413656235, "rewards/rejected": 0.11440115422010422, "step": 740 }, { "epoch": 0.6, "grad_norm": 3.762125015258789, "learning_rate": 2.996e-06, "logits/chosen": 0.7455517649650574, "logits/rejected": 0.6791686415672302, "logps/chosen": -201.34228515625, "logps/rejected": -205.9029998779297, "loss": 0.6731, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.15725918114185333, "rewards/margins": 0.05198857933282852, "rewards/rejected": 0.10527060180902481, "step": 750 }, { "epoch": 0.608, "grad_norm": 4.017007827758789, "learning_rate": 3.0360000000000002e-06, "logits/chosen": 0.5845433473587036, "logits/rejected": 0.6173285841941833, "logps/chosen": -211.0860595703125, "logps/rejected": -239.35105895996094, "loss": 0.6691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.1531037837266922, "rewards/margins": 0.06398037821054459, "rewards/rejected": 0.08912339061498642, "step": 760 }, { "epoch": 0.616, "grad_norm": 3.3737785816192627, "learning_rate": 3.0760000000000003e-06, "logits/chosen": 0.6799761056900024, "logits/rejected": 0.7338641285896301, "logps/chosen": -197.92454528808594, "logps/rejected": -196.76071166992188, "loss": 0.6861, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.14501558244228363, "rewards/margins": 0.024691270664334297, "rewards/rejected": 0.12032430619001389, "step": 770 }, { "epoch": 0.624, "grad_norm": 3.9343101978302, "learning_rate": 3.1160000000000003e-06, "logits/chosen": 0.6844798922538757, "logits/rejected": 0.6486098766326904, "logps/chosen": -209.14988708496094, "logps/rejected": -208.64990234375, "loss": 0.684, "rewards/accuracies": 0.5250000357627869, "rewards/chosen": 0.12247097492218018, "rewards/margins": 0.03311007842421532, "rewards/rejected": 0.08936089277267456, "step": 780 }, { "epoch": 0.632, "grad_norm": 3.3528807163238525, "learning_rate": 3.1560000000000004e-06, "logits/chosen": 0.5940313339233398, "logits/rejected": 0.5495096445083618, "logps/chosen": -200.02276611328125, "logps/rejected": -203.39085388183594, "loss": 0.6646, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": 0.21792371571063995, "rewards/margins": 0.07114797830581665, "rewards/rejected": 0.1467757374048233, "step": 790 }, { "epoch": 0.64, "grad_norm": 3.5861222743988037, "learning_rate": 3.1960000000000004e-06, "logits/chosen": 0.7208414077758789, "logits/rejected": 0.7849133610725403, "logps/chosen": -168.2899932861328, "logps/rejected": -183.3815155029297, "loss": 0.6719, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.22198240458965302, "rewards/margins": 0.059726305305957794, "rewards/rejected": 0.162256121635437, "step": 800 }, { "epoch": 0.648, "grad_norm": 3.695352077484131, "learning_rate": 3.2360000000000004e-06, "logits/chosen": 0.7138350009918213, "logits/rejected": 0.7761127352714539, "logps/chosen": -191.26907348632812, "logps/rejected": -199.15538024902344, "loss": 0.6862, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.17254018783569336, "rewards/margins": 0.027448756620287895, "rewards/rejected": 0.14509142935276031, "step": 810 }, { "epoch": 0.656, "grad_norm": 4.936346054077148, "learning_rate": 3.2760000000000005e-06, "logits/chosen": 0.651190459728241, "logits/rejected": 0.6843032240867615, "logps/chosen": -185.53834533691406, "logps/rejected": -210.05166625976562, "loss": 0.6538, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.17089328169822693, "rewards/margins": 0.09923329949378967, "rewards/rejected": 0.07165997475385666, "step": 820 }, { "epoch": 0.664, "grad_norm": 3.7326865196228027, "learning_rate": 3.3160000000000005e-06, "logits/chosen": 0.8238712549209595, "logits/rejected": 0.849129855632782, "logps/chosen": -199.64431762695312, "logps/rejected": -206.5725555419922, "loss": 0.6559, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.14575400948524475, "rewards/margins": 0.09553883224725723, "rewards/rejected": 0.05021516606211662, "step": 830 }, { "epoch": 0.672, "grad_norm": 3.8525242805480957, "learning_rate": 3.3560000000000006e-06, "logits/chosen": 0.68717360496521, "logits/rejected": 0.775698184967041, "logps/chosen": -198.4635009765625, "logps/rejected": -218.93565368652344, "loss": 0.6672, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": 0.03423814848065376, "rewards/margins": 0.07274122536182404, "rewards/rejected": -0.03850306198000908, "step": 840 }, { "epoch": 0.68, "grad_norm": 3.930669069290161, "learning_rate": 3.3960000000000006e-06, "logits/chosen": 0.7518499493598938, "logits/rejected": 0.8216703534126282, "logps/chosen": -205.54248046875, "logps/rejected": -207.52676391601562, "loss": 0.6571, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12743444740772247, "rewards/margins": 0.0990920141339302, "rewards/rejected": 0.028342435136437416, "step": 850 }, { "epoch": 0.688, "grad_norm": 3.8377163410186768, "learning_rate": 3.4360000000000006e-06, "logits/chosen": 0.7670461535453796, "logits/rejected": 0.7580811977386475, "logps/chosen": -182.31930541992188, "logps/rejected": -194.57054138183594, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": 0.18013069033622742, "rewards/margins": 0.07547135651111603, "rewards/rejected": 0.10465934127569199, "step": 860 }, { "epoch": 0.696, "grad_norm": 4.037013053894043, "learning_rate": 3.4760000000000007e-06, "logits/chosen": 0.6302778720855713, "logits/rejected": 0.5533519983291626, "logps/chosen": -201.7500457763672, "logps/rejected": -217.78018188476562, "loss": 0.6598, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": 0.1704377979040146, "rewards/margins": 0.10010436922311783, "rewards/rejected": 0.07033341377973557, "step": 870 }, { "epoch": 0.704, "grad_norm": 3.9927608966827393, "learning_rate": 3.5160000000000007e-06, "logits/chosen": 0.6597375273704529, "logits/rejected": 0.6159626245498657, "logps/chosen": -193.73165893554688, "logps/rejected": -198.38595581054688, "loss": 0.6935, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.19550852477550507, "rewards/margins": 0.04272466525435448, "rewards/rejected": 0.15278387069702148, "step": 880 }, { "epoch": 0.712, "grad_norm": 3.5315475463867188, "learning_rate": 3.5560000000000008e-06, "logits/chosen": 0.685095489025116, "logits/rejected": 0.6823846697807312, "logps/chosen": -201.48406982421875, "logps/rejected": -212.81094360351562, "loss": 0.6487, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20819902420043945, "rewards/margins": 0.10897611826658249, "rewards/rejected": 0.09922291338443756, "step": 890 }, { "epoch": 0.72, "grad_norm": 3.673504114151001, "learning_rate": 3.596e-06, "logits/chosen": 0.7472502589225769, "logits/rejected": 0.623907208442688, "logps/chosen": -183.38999938964844, "logps/rejected": -181.52911376953125, "loss": 0.6561, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.28343966603279114, "rewards/margins": 0.09876996278762817, "rewards/rejected": 0.18466970324516296, "step": 900 }, { "epoch": 0.728, "grad_norm": 4.183164596557617, "learning_rate": 3.636e-06, "logits/chosen": 0.6977134346961975, "logits/rejected": 0.6741080284118652, "logps/chosen": -192.44366455078125, "logps/rejected": -186.1464080810547, "loss": 0.6923, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.25711265206336975, "rewards/margins": 0.020020050927996635, "rewards/rejected": 0.23709259927272797, "step": 910 }, { "epoch": 0.736, "grad_norm": 3.74733304977417, "learning_rate": 3.676e-06, "logits/chosen": 0.7447720170021057, "logits/rejected": 0.6725283265113831, "logps/chosen": -186.6201934814453, "logps/rejected": -204.659423828125, "loss": 0.6199, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": 0.3455497920513153, "rewards/margins": 0.18741247057914734, "rewards/rejected": 0.15813732147216797, "step": 920 }, { "epoch": 0.744, "grad_norm": 4.2909255027771, "learning_rate": 3.716e-06, "logits/chosen": 0.656113862991333, "logits/rejected": 0.6516739130020142, "logps/chosen": -197.02088928222656, "logps/rejected": -213.64414978027344, "loss": 0.6621, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.20855574309825897, "rewards/margins": 0.08712134510278702, "rewards/rejected": 0.12143440544605255, "step": 930 }, { "epoch": 0.752, "grad_norm": 4.346622943878174, "learning_rate": 3.756e-06, "logits/chosen": 0.7425007224082947, "logits/rejected": 0.7672116160392761, "logps/chosen": -164.4781494140625, "logps/rejected": -189.1973876953125, "loss": 0.6322, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2733791768550873, "rewards/margins": 0.15534010529518127, "rewards/rejected": 0.1180390939116478, "step": 940 }, { "epoch": 0.76, "grad_norm": 3.8516108989715576, "learning_rate": 3.796e-06, "logits/chosen": 0.6322047710418701, "logits/rejected": 0.6729938387870789, "logps/chosen": -179.63258361816406, "logps/rejected": -213.58462524414062, "loss": 0.6348, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.10944360494613647, "rewards/margins": 0.161842480301857, "rewards/rejected": -0.05239887163043022, "step": 950 }, { "epoch": 0.768, "grad_norm": 3.8706507682800293, "learning_rate": 3.836e-06, "logits/chosen": 0.7383615374565125, "logits/rejected": 0.645232617855072, "logps/chosen": -209.4774932861328, "logps/rejected": -218.1513214111328, "loss": 0.6402, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.016749557107686996, "rewards/margins": 0.16539375483989716, "rewards/rejected": -0.14864420890808105, "step": 960 }, { "epoch": 0.776, "grad_norm": 3.860457420349121, "learning_rate": 3.876000000000001e-06, "logits/chosen": 0.7996999025344849, "logits/rejected": 0.7975447773933411, "logps/chosen": -197.4671630859375, "logps/rejected": -207.5522003173828, "loss": 0.6625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.1389116793870926, "rewards/margins": 0.09984000772237778, "rewards/rejected": 0.03907167166471481, "step": 970 }, { "epoch": 0.784, "grad_norm": 4.529206275939941, "learning_rate": 3.916e-06, "logits/chosen": 0.7569703459739685, "logits/rejected": 0.7382944226264954, "logps/chosen": -191.17686462402344, "logps/rejected": -207.4125518798828, "loss": 0.625, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0675772950053215, "rewards/margins": 0.2260824292898178, "rewards/rejected": -0.15850511193275452, "step": 980 }, { "epoch": 0.792, "grad_norm": 3.819338083267212, "learning_rate": 3.956000000000001e-06, "logits/chosen": 0.7830851674079895, "logits/rejected": 0.7947285771369934, "logps/chosen": -205.89517211914062, "logps/rejected": -224.50967407226562, "loss": 0.6524, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.003182247979566455, "rewards/margins": 0.1461525410413742, "rewards/rejected": -0.14933478832244873, "step": 990 }, { "epoch": 0.8, "grad_norm": 3.4207472801208496, "learning_rate": 3.996e-06, "logits/chosen": 0.6669479012489319, "logits/rejected": 0.7112082839012146, "logps/chosen": -205.70693969726562, "logps/rejected": -223.3166046142578, "loss": 0.6352, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08382979780435562, "rewards/margins": 0.19078943133354187, "rewards/rejected": -0.10695965588092804, "step": 1000 }, { "epoch": 0.808, "grad_norm": 4.2891526222229, "learning_rate": 4.036000000000001e-06, "logits/chosen": 0.6510803699493408, "logits/rejected": 0.6577145457267761, "logps/chosen": -191.43975830078125, "logps/rejected": -217.14639282226562, "loss": 0.6366, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.09330642223358154, "rewards/margins": 0.17022596299648285, "rewards/rejected": -0.07691951841115952, "step": 1010 }, { "epoch": 0.816, "grad_norm": 4.56123685836792, "learning_rate": 4.0760000000000004e-06, "logits/chosen": 0.6556877493858337, "logits/rejected": 0.6388117671012878, "logps/chosen": -197.0552520751953, "logps/rejected": -203.9344482421875, "loss": 0.6354, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.07625667005777359, "rewards/margins": 0.21561256051063538, "rewards/rejected": -0.13935589790344238, "step": 1020 }, { "epoch": 0.824, "grad_norm": 3.7361807823181152, "learning_rate": 4.116000000000001e-06, "logits/chosen": 0.7679457068443298, "logits/rejected": 0.7817143797874451, "logps/chosen": -205.3223419189453, "logps/rejected": -211.694580078125, "loss": 0.654, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.11138266324996948, "rewards/margins": 0.13815438747406006, "rewards/rejected": -0.026771722361445427, "step": 1030 }, { "epoch": 0.832, "grad_norm": 5.420875072479248, "learning_rate": 4.1560000000000005e-06, "logits/chosen": 0.6636489629745483, "logits/rejected": 0.692511260509491, "logps/chosen": -207.5504150390625, "logps/rejected": -225.18753051757812, "loss": 0.6344, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1605958491563797, "rewards/margins": 0.1674598604440689, "rewards/rejected": -0.00686401454731822, "step": 1040 }, { "epoch": 0.84, "grad_norm": 3.272894859313965, "learning_rate": 4.196e-06, "logits/chosen": 0.7486108541488647, "logits/rejected": 0.7712265253067017, "logps/chosen": -194.90943908691406, "logps/rejected": -205.329345703125, "loss": 0.6322, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22325454652309418, "rewards/margins": 0.22328029572963715, "rewards/rejected": -2.5737286705407314e-05, "step": 1050 }, { "epoch": 0.848, "grad_norm": 5.14847993850708, "learning_rate": 4.236e-06, "logits/chosen": 0.7043625116348267, "logits/rejected": 0.7297138571739197, "logps/chosen": -214.93301391601562, "logps/rejected": -221.634765625, "loss": 0.6797, "rewards/accuracies": 0.5625, "rewards/chosen": 0.19927440583705902, "rewards/margins": 0.10101515054702759, "rewards/rejected": 0.09825924783945084, "step": 1060 }, { "epoch": 0.856, "grad_norm": 4.329235076904297, "learning_rate": 4.276e-06, "logits/chosen": 0.7435252070426941, "logits/rejected": 0.7416861653327942, "logps/chosen": -206.2967987060547, "logps/rejected": -221.4429168701172, "loss": 0.6532, "rewards/accuracies": 0.625, "rewards/chosen": 0.21223540604114532, "rewards/margins": 0.14147967100143433, "rewards/rejected": 0.070755735039711, "step": 1070 }, { "epoch": 0.864, "grad_norm": 3.7461328506469727, "learning_rate": 4.316e-06, "logits/chosen": 0.8374633193016052, "logits/rejected": 0.8583774566650391, "logps/chosen": -203.61318969726562, "logps/rejected": -218.10350036621094, "loss": 0.6082, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.2129761427640915, "rewards/margins": 0.223725363612175, "rewards/rejected": -0.010749227367341518, "step": 1080 }, { "epoch": 0.872, "grad_norm": 3.664881467819214, "learning_rate": 4.356e-06, "logits/chosen": 0.7724778652191162, "logits/rejected": 0.7607793807983398, "logps/chosen": -194.4547576904297, "logps/rejected": -221.48941040039062, "loss": 0.6174, "rewards/accuracies": 0.625, "rewards/chosen": 0.14989307522773743, "rewards/margins": 0.24988269805908203, "rewards/rejected": -0.0999896302819252, "step": 1090 }, { "epoch": 0.88, "grad_norm": 3.962773323059082, "learning_rate": 4.396e-06, "logits/chosen": 0.8602146506309509, "logits/rejected": 0.7610459327697754, "logps/chosen": -186.56153869628906, "logps/rejected": -191.79127502441406, "loss": 0.609, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.13783621788024902, "rewards/margins": 0.24294762313365936, "rewards/rejected": -0.10511143505573273, "step": 1100 }, { "epoch": 0.888, "grad_norm": 3.6281254291534424, "learning_rate": 4.436e-06, "logits/chosen": 0.7969589233398438, "logits/rejected": 0.7449702620506287, "logps/chosen": -199.1891326904297, "logps/rejected": -212.9011688232422, "loss": 0.6054, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.007032311055809259, "rewards/margins": 0.28387296199798584, "rewards/rejected": -0.29090526700019836, "step": 1110 }, { "epoch": 0.896, "grad_norm": 4.473705768585205, "learning_rate": 4.476e-06, "logits/chosen": 0.8331708312034607, "logits/rejected": 0.8655223846435547, "logps/chosen": -190.3954620361328, "logps/rejected": -204.8839874267578, "loss": 0.6281, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1991288810968399, "rewards/margins": 0.20112191140651703, "rewards/rejected": -0.0019930123817175627, "step": 1120 }, { "epoch": 0.904, "grad_norm": 6.322517395019531, "learning_rate": 4.5160000000000005e-06, "logits/chosen": 0.8980779051780701, "logits/rejected": 0.8646361231803894, "logps/chosen": -207.82090759277344, "logps/rejected": -202.1289520263672, "loss": 0.6373, "rewards/accuracies": 0.625, "rewards/chosen": 0.1084756851196289, "rewards/margins": 0.20956145226955414, "rewards/rejected": -0.10108575969934464, "step": 1130 }, { "epoch": 0.912, "grad_norm": 6.67050838470459, "learning_rate": 4.556e-06, "logits/chosen": 0.6762229800224304, "logits/rejected": 0.843460738658905, "logps/chosen": -205.12686157226562, "logps/rejected": -238.93972778320312, "loss": 0.6326, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20181012153625488, "rewards/margins": 0.23140645027160645, "rewards/rejected": -0.43321657180786133, "step": 1140 }, { "epoch": 0.92, "grad_norm": 3.84826922416687, "learning_rate": 4.5960000000000006e-06, "logits/chosen": 0.8595611453056335, "logits/rejected": 0.7575063705444336, "logps/chosen": -191.56690979003906, "logps/rejected": -211.3734588623047, "loss": 0.7016, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.017901450395584106, "rewards/margins": 0.08475067466497421, "rewards/rejected": -0.10265214741230011, "step": 1150 }, { "epoch": 0.928, "grad_norm": 5.697420120239258, "learning_rate": 4.636e-06, "logits/chosen": 0.7715900540351868, "logits/rejected": 0.6638416051864624, "logps/chosen": -197.2618865966797, "logps/rejected": -178.20579528808594, "loss": 0.6362, "rewards/accuracies": 0.625, "rewards/chosen": 0.10496443510055542, "rewards/margins": 0.22472666203975677, "rewards/rejected": -0.11976220458745956, "step": 1160 }, { "epoch": 0.936, "grad_norm": 4.553537845611572, "learning_rate": 4.676000000000001e-06, "logits/chosen": 0.7215653657913208, "logits/rejected": 0.7404984831809998, "logps/chosen": -198.07424926757812, "logps/rejected": -231.4817657470703, "loss": 0.5759, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.14309090375900269, "rewards/margins": 0.4066457450389862, "rewards/rejected": -0.2635548412799835, "step": 1170 }, { "epoch": 0.944, "grad_norm": 6.022956848144531, "learning_rate": 4.716e-06, "logits/chosen": 0.8578891754150391, "logits/rejected": 0.8036954998970032, "logps/chosen": -202.0140380859375, "logps/rejected": -212.0448760986328, "loss": 0.668, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03294707089662552, "rewards/margins": 0.15671278536319733, "rewards/rejected": -0.18965983390808105, "step": 1180 }, { "epoch": 0.952, "grad_norm": 4.317718029022217, "learning_rate": 4.756000000000001e-06, "logits/chosen": 0.8726083636283875, "logits/rejected": 0.9713365435600281, "logps/chosen": -185.42198181152344, "logps/rejected": -206.3362274169922, "loss": 0.6041, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.05594133213162422, "rewards/margins": 0.28803297877311707, "rewards/rejected": -0.3439743220806122, "step": 1190 }, { "epoch": 0.96, "grad_norm": 4.923951625823975, "learning_rate": 4.796e-06, "logits/chosen": 0.8178095817565918, "logits/rejected": 0.8852421641349792, "logps/chosen": -177.03765869140625, "logps/rejected": -189.92286682128906, "loss": 0.6351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.016861964017152786, "rewards/margins": 0.20337799191474915, "rewards/rejected": -0.18651603162288666, "step": 1200 }, { "epoch": 0.968, "grad_norm": 4.542464256286621, "learning_rate": 4.836e-06, "logits/chosen": 0.8311338424682617, "logits/rejected": 0.7399374842643738, "logps/chosen": -202.60267639160156, "logps/rejected": -203.94236755371094, "loss": 0.6779, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.19154860079288483, "rewards/margins": 0.115830197930336, "rewards/rejected": -0.3073787987232208, "step": 1210 }, { "epoch": 0.976, "grad_norm": 6.436461925506592, "learning_rate": 4.876e-06, "logits/chosen": 0.794135570526123, "logits/rejected": 0.7919009327888489, "logps/chosen": -207.1306610107422, "logps/rejected": -205.37637329101562, "loss": 0.6314, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07707437127828598, "rewards/margins": 0.25921908020973206, "rewards/rejected": -0.33629345893859863, "step": 1220 }, { "epoch": 0.984, "grad_norm": 5.068552494049072, "learning_rate": 4.916e-06, "logits/chosen": 0.9020944833755493, "logits/rejected": 0.8565078973770142, "logps/chosen": -191.53636169433594, "logps/rejected": -192.14581298828125, "loss": 0.6288, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.21075907349586487, "rewards/margins": 0.26774677634239197, "rewards/rejected": -0.056987714022397995, "step": 1230 }, { "epoch": 0.992, "grad_norm": 3.66788649559021, "learning_rate": 4.9560000000000005e-06, "logits/chosen": 0.759829044342041, "logits/rejected": 0.7424188852310181, "logps/chosen": -196.9650115966797, "logps/rejected": -197.923583984375, "loss": 0.5986, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22332988679409027, "rewards/margins": 0.3243139684200287, "rewards/rejected": -0.1009841114282608, "step": 1240 }, { "epoch": 1.0, "grad_norm": 5.5345940589904785, "learning_rate": 4.996e-06, "logits/chosen": 0.8046256899833679, "logits/rejected": 0.7486367225646973, "logps/chosen": -196.2307586669922, "logps/rejected": -213.2323455810547, "loss": 0.6443, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.04985840991139412, "rewards/margins": 0.21199627220630646, "rewards/rejected": -0.2618546485900879, "step": 1250 }, { "epoch": 1.008, "grad_norm": 3.8329803943634033, "learning_rate": 4.9999921043206356e-06, "logits/chosen": 0.8027931451797485, "logits/rejected": 0.7976822853088379, "logps/chosen": -198.8918914794922, "logps/rejected": -188.731201171875, "loss": 0.5909, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": 0.2840830385684967, "rewards/margins": 0.3135008215904236, "rewards/rejected": -0.029417768120765686, "step": 1260 }, { "epoch": 1.016, "grad_norm": 5.155350685119629, "learning_rate": 4.99996481067822e-06, "logits/chosen": 0.8220119476318359, "logits/rejected": 0.8561745882034302, "logps/chosen": -206.8731231689453, "logps/rejected": -216.76406860351562, "loss": 0.5927, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.008052355609834194, "rewards/margins": 0.3377354145050049, "rewards/rejected": -0.3296830654144287, "step": 1270 }, { "epoch": 1.024, "grad_norm": 4.907769203186035, "learning_rate": 4.99991802180802e-06, "logits/chosen": 0.9498602151870728, "logits/rejected": 0.8992852568626404, "logps/chosen": -205.61570739746094, "logps/rejected": -227.8877716064453, "loss": 0.6093, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.015175617299973965, "rewards/margins": 0.2621200382709503, "rewards/rejected": -0.24694442749023438, "step": 1280 }, { "epoch": 1.032, "grad_norm": 5.946913242340088, "learning_rate": 4.999851738074904e-06, "logits/chosen": 0.9917473196983337, "logits/rejected": 0.9724864363670349, "logps/chosen": -179.44322204589844, "logps/rejected": -202.18516540527344, "loss": 0.6529, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.23127515614032745, "rewards/margins": 0.1929376870393753, "rewards/rejected": 0.038337476551532745, "step": 1290 }, { "epoch": 1.04, "grad_norm": 3.936194896697998, "learning_rate": 4.999765959995769e-06, "logits/chosen": 0.9820284247398376, "logits/rejected": 0.8659685254096985, "logps/chosen": -204.54019165039062, "logps/rejected": -193.43544006347656, "loss": 0.5873, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2293062061071396, "rewards/margins": 0.34357786178588867, "rewards/rejected": -0.11427167803049088, "step": 1300 }, { "epoch": 1.048, "grad_norm": 4.1008076667785645, "learning_rate": 4.999660688239527e-06, "logits/chosen": 0.835074245929718, "logits/rejected": 0.7787433862686157, "logps/chosen": -188.2495880126953, "logps/rejected": -197.9439697265625, "loss": 0.6111, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2561519742012024, "rewards/margins": 0.31733056902885437, "rewards/rejected": -0.06117859110236168, "step": 1310 }, { "epoch": 1.056, "grad_norm": 5.592878341674805, "learning_rate": 4.9995359236271094e-06, "logits/chosen": 0.883532702922821, "logits/rejected": 0.8534946441650391, "logps/chosen": -199.9098358154297, "logps/rejected": -197.54641723632812, "loss": 0.6772, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2197556495666504, "rewards/margins": 0.17633479833602905, "rewards/rejected": 0.04342082887887955, "step": 1320 }, { "epoch": 1.064, "grad_norm": 4.941606521606445, "learning_rate": 4.999391667131456e-06, "logits/chosen": 0.860753059387207, "logits/rejected": 0.8954046368598938, "logps/chosen": -178.4503936767578, "logps/rejected": -196.09449768066406, "loss": 0.6786, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.48869457840919495, "rewards/margins": 0.12707066535949707, "rewards/rejected": 0.36162394285202026, "step": 1330 }, { "epoch": 1.072, "grad_norm": 5.233992576599121, "learning_rate": 4.999227919877506e-06, "logits/chosen": 0.8799529075622559, "logits/rejected": 0.8783776164054871, "logps/chosen": -195.3287811279297, "logps/rejected": -195.21665954589844, "loss": 0.6231, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": 0.4896652400493622, "rewards/margins": 0.27517566084861755, "rewards/rejected": 0.2144896239042282, "step": 1340 }, { "epoch": 1.08, "grad_norm": 8.488966941833496, "learning_rate": 4.999044683142196e-06, "logits/chosen": 1.042872667312622, "logits/rejected": 0.8863340616226196, "logps/chosen": -197.02415466308594, "logps/rejected": -202.64767456054688, "loss": 0.6329, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.4562924802303314, "rewards/margins": 0.19963227212429047, "rewards/rejected": 0.25666019320487976, "step": 1350 }, { "epoch": 1.088, "grad_norm": 4.995717525482178, "learning_rate": 4.99884195835444e-06, "logits/chosen": 0.8799748420715332, "logits/rejected": 0.8477336764335632, "logps/chosen": -191.94471740722656, "logps/rejected": -206.7391815185547, "loss": 0.6352, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.18572449684143066, "rewards/margins": 0.2937488555908203, "rewards/rejected": -0.10802433639764786, "step": 1360 }, { "epoch": 1.096, "grad_norm": 4.870668888092041, "learning_rate": 4.998619747095129e-06, "logits/chosen": 0.7523741722106934, "logits/rejected": 0.7566614747047424, "logps/chosen": -200.09573364257812, "logps/rejected": -209.72647094726562, "loss": 0.6096, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.1902216672897339, "rewards/margins": 0.32739758491516113, "rewards/rejected": -0.5176193118095398, "step": 1370 }, { "epoch": 1.104, "grad_norm": 4.8878889083862305, "learning_rate": 4.998378051097111e-06, "logits/chosen": 0.6805158257484436, "logits/rejected": 0.6997960805892944, "logps/chosen": -211.3839874267578, "logps/rejected": -243.62759399414062, "loss": 0.6292, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.17687633633613586, "rewards/margins": 0.31230995059013367, "rewards/rejected": -0.48918625712394714, "step": 1380 }, { "epoch": 1.112, "grad_norm": 4.575040817260742, "learning_rate": 4.998116872245178e-06, "logits/chosen": 0.9538719058036804, "logits/rejected": 0.8567468523979187, "logps/chosen": -197.8600311279297, "logps/rejected": -194.09893798828125, "loss": 0.6508, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0491158552467823, "rewards/margins": 0.23099613189697266, "rewards/rejected": -0.28011199831962585, "step": 1390 }, { "epoch": 1.12, "grad_norm": 4.09750509262085, "learning_rate": 4.997836212576057e-06, "logits/chosen": 0.7558371424674988, "logits/rejected": 0.7620294690132141, "logps/chosen": -210.34483337402344, "logps/rejected": -220.9149627685547, "loss": 0.583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07163530588150024, "rewards/margins": 0.40291815996170044, "rewards/rejected": -0.4745534360408783, "step": 1400 }, { "epoch": 1.1280000000000001, "grad_norm": 4.7588887214660645, "learning_rate": 4.997536074278388e-06, "logits/chosen": 0.7905246019363403, "logits/rejected": 0.8206660151481628, "logps/chosen": -211.0833740234375, "logps/rejected": -218.993896484375, "loss": 0.6144, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.14193372428417206, "rewards/margins": 0.2602233588695526, "rewards/rejected": -0.4021570682525635, "step": 1410 }, { "epoch": 1.1360000000000001, "grad_norm": 6.200011730194092, "learning_rate": 4.9972164596927085e-06, "logits/chosen": 0.9080677032470703, "logits/rejected": 0.8735291361808777, "logps/chosen": -194.74559020996094, "logps/rejected": -200.72265625, "loss": 0.6563, "rewards/accuracies": 0.625, "rewards/chosen": -0.202849343419075, "rewards/margins": 0.19263452291488647, "rewards/rejected": -0.3954838514328003, "step": 1420 }, { "epoch": 1.144, "grad_norm": 7.389719486236572, "learning_rate": 4.996877371311439e-06, "logits/chosen": 0.7800935506820679, "logits/rejected": 0.8093517422676086, "logps/chosen": -211.73667907714844, "logps/rejected": -222.60000610351562, "loss": 0.5757, "rewards/accuracies": 0.75, "rewards/chosen": -0.2545677125453949, "rewards/margins": 0.49100762605667114, "rewards/rejected": -0.7455753087997437, "step": 1430 }, { "epoch": 1.152, "grad_norm": 4.560316562652588, "learning_rate": 4.996518811778858e-06, "logits/chosen": 0.9419007301330566, "logits/rejected": 0.8434064984321594, "logps/chosen": -193.9005889892578, "logps/rejected": -196.77064514160156, "loss": 0.5937, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10783553123474121, "rewards/margins": 0.35865020751953125, "rewards/rejected": -0.25081467628479004, "step": 1440 }, { "epoch": 1.16, "grad_norm": 4.446018218994141, "learning_rate": 4.996140783891085e-06, "logits/chosen": 0.8335108757019043, "logits/rejected": 0.7712218165397644, "logps/chosen": -200.7833251953125, "logps/rejected": -217.6037139892578, "loss": 0.5632, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.268947035074234, "rewards/margins": 0.4497573971748352, "rewards/rejected": -0.7187044024467468, "step": 1450 }, { "epoch": 1.168, "grad_norm": 4.578011989593506, "learning_rate": 4.9957432905960575e-06, "logits/chosen": 0.8321472406387329, "logits/rejected": 0.8203474283218384, "logps/chosen": -179.63121032714844, "logps/rejected": -218.509033203125, "loss": 0.5283, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.2659996449947357, "rewards/margins": 0.5776376724243164, "rewards/rejected": -0.8436372876167297, "step": 1460 }, { "epoch": 1.176, "grad_norm": 8.904568672180176, "learning_rate": 4.995326334993508e-06, "logits/chosen": 0.7332701086997986, "logits/rejected": 0.7367414832115173, "logps/chosen": -215.7530975341797, "logps/rejected": -235.0543670654297, "loss": 0.6247, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6151739954948425, "rewards/margins": 0.4263072907924652, "rewards/rejected": -1.041481375694275, "step": 1470 }, { "epoch": 1.184, "grad_norm": 4.632490158081055, "learning_rate": 4.994889920334939e-06, "logits/chosen": 0.8015623092651367, "logits/rejected": 0.6672269701957703, "logps/chosen": -206.82408142089844, "logps/rejected": -212.19155883789062, "loss": 0.5282, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.25984275341033936, "rewards/margins": 0.636847734451294, "rewards/rejected": -0.8966904878616333, "step": 1480 }, { "epoch": 1.192, "grad_norm": 5.93597412109375, "learning_rate": 4.994434050023601e-06, "logits/chosen": 0.8352071642875671, "logits/rejected": 0.796085000038147, "logps/chosen": -200.97604370117188, "logps/rejected": -198.58670043945312, "loss": 0.6777, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5017086267471313, "rewards/margins": 0.23309734463691711, "rewards/rejected": -0.7348060011863708, "step": 1490 }, { "epoch": 1.2, "grad_norm": 6.238118648529053, "learning_rate": 4.993958727614462e-06, "logits/chosen": 0.7685348391532898, "logits/rejected": 0.6558945775032043, "logps/chosen": -204.9226531982422, "logps/rejected": -204.36001586914062, "loss": 0.591, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.33830520510673523, "rewards/margins": 0.46260887384414673, "rewards/rejected": -0.8009141087532043, "step": 1500 }, { "epoch": 1.208, "grad_norm": 7.67111873626709, "learning_rate": 4.993463956814181e-06, "logits/chosen": 0.8750897645950317, "logits/rejected": 0.9166715741157532, "logps/chosen": -189.0762176513672, "logps/rejected": -201.63124084472656, "loss": 0.6032, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2800356447696686, "rewards/margins": 0.3452514708042145, "rewards/rejected": -0.6252870559692383, "step": 1510 }, { "epoch": 1.216, "grad_norm": 4.525489330291748, "learning_rate": 4.99294974148108e-06, "logits/chosen": 0.8436128497123718, "logits/rejected": 0.8338537216186523, "logps/chosen": -201.09213256835938, "logps/rejected": -220.0704803466797, "loss": 0.5747, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.21393656730651855, "rewards/margins": 0.4898119568824768, "rewards/rejected": -0.7037484645843506, "step": 1520 }, { "epoch": 1.224, "grad_norm": 7.136298656463623, "learning_rate": 4.992416085625115e-06, "logits/chosen": 0.9371658563613892, "logits/rejected": 0.9421829581260681, "logps/chosen": -187.67381286621094, "logps/rejected": -210.39036560058594, "loss": 0.6005, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.47332143783569336, "rewards/margins": 0.3808668553829193, "rewards/rejected": -0.8541883826255798, "step": 1530 }, { "epoch": 1.232, "grad_norm": 5.2598700523376465, "learning_rate": 4.991862993407841e-06, "logits/chosen": 0.9011389017105103, "logits/rejected": 0.8639345169067383, "logps/chosen": -204.3238525390625, "logps/rejected": -222.8467254638672, "loss": 0.546, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5745847821235657, "rewards/margins": 0.592379629611969, "rewards/rejected": -1.1669644117355347, "step": 1540 }, { "epoch": 1.24, "grad_norm": 6.515147686004639, "learning_rate": 4.99129046914238e-06, "logits/chosen": 0.855043888092041, "logits/rejected": 0.895084798336029, "logps/chosen": -178.9577178955078, "logps/rejected": -202.40538024902344, "loss": 0.525, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.3232458531856537, "rewards/margins": 0.7021111845970154, "rewards/rejected": -1.0253571271896362, "step": 1550 }, { "epoch": 1.248, "grad_norm": 4.3140177726745605, "learning_rate": 4.990698517293394e-06, "logits/chosen": 0.7897105813026428, "logits/rejected": 0.7928025126457214, "logps/chosen": -209.9716339111328, "logps/rejected": -214.2898712158203, "loss": 0.5895, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.16820655763149261, "rewards/margins": 0.539027988910675, "rewards/rejected": -0.7072345614433289, "step": 1560 }, { "epoch": 1.256, "grad_norm": 7.281121730804443, "learning_rate": 4.990087142477042e-06, "logits/chosen": 0.797493577003479, "logits/rejected": 0.841319739818573, "logps/chosen": -211.1555633544922, "logps/rejected": -206.3491668701172, "loss": 0.5963, "rewards/accuracies": 0.6875, "rewards/chosen": -0.420600026845932, "rewards/margins": 0.395569771528244, "rewards/rejected": -0.8161698579788208, "step": 1570 }, { "epoch": 1.264, "grad_norm": 6.5268683433532715, "learning_rate": 4.989456349460946e-06, "logits/chosen": 0.8825798034667969, "logits/rejected": 0.7889868021011353, "logps/chosen": -214.40603637695312, "logps/rejected": -223.7728729248047, "loss": 0.556, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4342477023601532, "rewards/margins": 0.5031339526176453, "rewards/rejected": -0.9373816847801208, "step": 1580 }, { "epoch": 1.272, "grad_norm": 5.648573398590088, "learning_rate": 4.988806143164159e-06, "logits/chosen": 0.8476539850234985, "logits/rejected": 0.7688016891479492, "logps/chosen": -213.4077911376953, "logps/rejected": -233.77456665039062, "loss": 0.5371, "rewards/accuracies": 0.75, "rewards/chosen": -0.7733410000801086, "rewards/margins": 0.6128751039505005, "rewards/rejected": -1.3862160444259644, "step": 1590 }, { "epoch": 1.28, "grad_norm": 5.341320037841797, "learning_rate": 4.988136528657118e-06, "logits/chosen": 0.8851507306098938, "logits/rejected": 0.9027878046035767, "logps/chosen": -213.1574249267578, "logps/rejected": -234.6796112060547, "loss": 0.5464, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7365319132804871, "rewards/margins": 0.615622878074646, "rewards/rejected": -1.3521548509597778, "step": 1600 }, { "epoch": 1.288, "grad_norm": 8.493136405944824, "learning_rate": 4.987447511161613e-06, "logits/chosen": 0.8196334838867188, "logits/rejected": 0.833379864692688, "logps/chosen": -220.86338806152344, "logps/rejected": -230.03433227539062, "loss": 0.656, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6297645568847656, "rewards/margins": 0.4346865713596344, "rewards/rejected": -1.0644512176513672, "step": 1610 }, { "epoch": 1.296, "grad_norm": 7.547203540802002, "learning_rate": 4.98673909605074e-06, "logits/chosen": 0.8613888025283813, "logits/rejected": 0.8237080574035645, "logps/chosen": -210.69900512695312, "logps/rejected": -225.7193145751953, "loss": 0.5646, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6549606323242188, "rewards/margins": 0.5880576968193054, "rewards/rejected": -1.243018388748169, "step": 1620 }, { "epoch": 1.304, "grad_norm": 4.983480453491211, "learning_rate": 4.986011288848863e-06, "logits/chosen": 0.9473403096199036, "logits/rejected": 0.8395648002624512, "logps/chosen": -205.3673858642578, "logps/rejected": -201.21458435058594, "loss": 0.5895, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.6703227758407593, "rewards/margins": 0.437595933675766, "rewards/rejected": -1.1079187393188477, "step": 1630 }, { "epoch": 1.312, "grad_norm": 4.282800674438477, "learning_rate": 4.985264095231568e-06, "logits/chosen": 1.042983889579773, "logits/rejected": 0.9440056085586548, "logps/chosen": -218.52037048339844, "logps/rejected": -235.33702087402344, "loss": 0.5297, "rewards/accuracies": 0.75, "rewards/chosen": -0.7287598848342896, "rewards/margins": 0.6097584366798401, "rewards/rejected": -1.3385183811187744, "step": 1640 }, { "epoch": 1.32, "grad_norm": 5.741344928741455, "learning_rate": 4.984497521025622e-06, "logits/chosen": 0.9067613482475281, "logits/rejected": 0.907869279384613, "logps/chosen": -214.6510772705078, "logps/rejected": -233.52578735351562, "loss": 0.6452, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5831167101860046, "rewards/margins": 0.4216102659702301, "rewards/rejected": -1.0047270059585571, "step": 1650 }, { "epoch": 1.328, "grad_norm": 5.609917640686035, "learning_rate": 4.9837115722089235e-06, "logits/chosen": 1.0167735815048218, "logits/rejected": 1.0690789222717285, "logps/chosen": -198.3705291748047, "logps/rejected": -214.2977752685547, "loss": 0.591, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6117779612541199, "rewards/margins": 0.41639161109924316, "rewards/rejected": -1.0281696319580078, "step": 1660 }, { "epoch": 1.336, "grad_norm": 5.828847408294678, "learning_rate": 4.982906254910459e-06, "logits/chosen": 0.9671093821525574, "logits/rejected": 0.934682309627533, "logps/chosen": -224.7041473388672, "logps/rejected": -225.40408325195312, "loss": 0.6122, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.4490407407283783, "rewards/margins": 0.4267473816871643, "rewards/rejected": -0.875788152217865, "step": 1670 }, { "epoch": 1.3439999999999999, "grad_norm": 4.787741184234619, "learning_rate": 4.982081575410256e-06, "logits/chosen": 0.9712103009223938, "logits/rejected": 1.061010718345642, "logps/chosen": -199.6064910888672, "logps/rejected": -225.61083984375, "loss": 0.5951, "rewards/accuracies": 0.625, "rewards/chosen": -0.35054826736450195, "rewards/margins": 0.4877324104309082, "rewards/rejected": -0.8382806777954102, "step": 1680 }, { "epoch": 1.3519999999999999, "grad_norm": 5.184296607971191, "learning_rate": 4.981237540139331e-06, "logits/chosen": 0.9483404159545898, "logits/rejected": 0.9563024640083313, "logps/chosen": -181.48716735839844, "logps/rejected": -208.47232055664062, "loss": 0.5151, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.23454923927783966, "rewards/margins": 0.6790667176246643, "rewards/rejected": -0.913615882396698, "step": 1690 }, { "epoch": 1.3599999999999999, "grad_norm": 5.253026485443115, "learning_rate": 4.980374155679639e-06, "logits/chosen": 1.0046643018722534, "logits/rejected": 0.9964167475700378, "logps/chosen": -196.05494689941406, "logps/rejected": -208.57164001464844, "loss": 0.5772, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5216605067253113, "rewards/margins": 0.5054008960723877, "rewards/rejected": -1.0270613431930542, "step": 1700 }, { "epoch": 1.3679999999999999, "grad_norm": 4.707123279571533, "learning_rate": 4.9794914287640264e-06, "logits/chosen": 0.9587984085083008, "logits/rejected": 0.9082703590393066, "logps/chosen": -191.87796020507812, "logps/rejected": -229.51358032226562, "loss": 0.5563, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.3796873688697815, "rewards/margins": 0.5814693570137024, "rewards/rejected": -0.9611567854881287, "step": 1710 }, { "epoch": 1.376, "grad_norm": 6.676301002502441, "learning_rate": 4.978589366276174e-06, "logits/chosen": 0.8662906885147095, "logits/rejected": 0.7710773348808289, "logps/chosen": -192.7714080810547, "logps/rejected": -228.8347625732422, "loss": 0.55, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7520737051963806, "rewards/margins": 0.6919212341308594, "rewards/rejected": -1.4439948797225952, "step": 1720 }, { "epoch": 1.384, "grad_norm": 6.470781326293945, "learning_rate": 4.977667975250548e-06, "logits/chosen": 0.8674607276916504, "logits/rejected": 0.8369401097297668, "logps/chosen": -204.9677276611328, "logps/rejected": -220.662353515625, "loss": 0.6208, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6717852354049683, "rewards/margins": 0.5695573687553406, "rewards/rejected": -1.241342544555664, "step": 1730 }, { "epoch": 1.392, "grad_norm": 7.195312023162842, "learning_rate": 4.97672726287234e-06, "logits/chosen": 0.9234275817871094, "logits/rejected": 0.9190540313720703, "logps/chosen": -210.9809112548828, "logps/rejected": -206.3329620361328, "loss": 0.6337, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.394771546125412, "rewards/margins": 0.37511372566223145, "rewards/rejected": -0.7698853015899658, "step": 1740 }, { "epoch": 1.4, "grad_norm": 6.862684726715088, "learning_rate": 4.975767236477413e-06, "logits/chosen": 1.0124229192733765, "logits/rejected": 1.0320953130722046, "logps/chosen": -182.6747589111328, "logps/rejected": -206.9286346435547, "loss": 0.5748, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14587950706481934, "rewards/margins": 0.5520176887512207, "rewards/rejected": -0.6978972554206848, "step": 1750 }, { "epoch": 1.408, "grad_norm": 5.418376922607422, "learning_rate": 4.974787903552247e-06, "logits/chosen": 0.8783740997314453, "logits/rejected": 0.7816500067710876, "logps/chosen": -213.95645141601562, "logps/rejected": -225.9214324951172, "loss": 0.6632, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.7024328708648682, "rewards/margins": 0.48432379961013794, "rewards/rejected": -1.1867567300796509, "step": 1760 }, { "epoch": 1.416, "grad_norm": 5.543606758117676, "learning_rate": 4.973789271733877e-06, "logits/chosen": 0.8639112710952759, "logits/rejected": 0.8347362875938416, "logps/chosen": -208.14321899414062, "logps/rejected": -232.296875, "loss": 0.5753, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4658367335796356, "rewards/margins": 0.6865634322166443, "rewards/rejected": -1.152400255203247, "step": 1770 }, { "epoch": 1.424, "grad_norm": 5.288059711456299, "learning_rate": 4.972771348809834e-06, "logits/chosen": 0.7869605422019958, "logits/rejected": 0.7716497778892517, "logps/chosen": -203.56529235839844, "logps/rejected": -216.66299438476562, "loss": 0.6529, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5823510885238647, "rewards/margins": 0.38376662135124207, "rewards/rejected": -0.9661176800727844, "step": 1780 }, { "epoch": 1.432, "grad_norm": 7.8693037033081055, "learning_rate": 4.9717341427180855e-06, "logits/chosen": 0.8742721676826477, "logits/rejected": 0.8222671747207642, "logps/chosen": -223.1770477294922, "logps/rejected": -230.52334594726562, "loss": 0.5733, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4773767590522766, "rewards/margins": 0.528388261795044, "rewards/rejected": -1.0057649612426758, "step": 1790 }, { "epoch": 1.44, "grad_norm": 5.248528480529785, "learning_rate": 4.970677661546972e-06, "logits/chosen": 0.7629317045211792, "logits/rejected": 0.8210358023643494, "logps/chosen": -188.845703125, "logps/rejected": -227.7703857421875, "loss": 0.5079, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.4410068690776825, "rewards/margins": 0.8114088177680969, "rewards/rejected": -1.252415657043457, "step": 1800 }, { "epoch": 1.448, "grad_norm": 6.2042365074157715, "learning_rate": 4.969601913535148e-06, "logits/chosen": 0.8677163124084473, "logits/rejected": 0.8122463226318359, "logps/chosen": -198.2725067138672, "logps/rejected": -191.28347778320312, "loss": 0.616, "rewards/accuracies": 0.625, "rewards/chosen": -0.5268740057945251, "rewards/margins": 0.3900458514690399, "rewards/rejected": -0.9169198274612427, "step": 1810 }, { "epoch": 1.456, "grad_norm": 3.95033597946167, "learning_rate": 4.9685069070715105e-06, "logits/chosen": 0.8852386474609375, "logits/rejected": 0.8015106320381165, "logps/chosen": -213.7109832763672, "logps/rejected": -232.56704711914062, "loss": 0.538, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.778232753276825, "rewards/margins": 0.8279777765274048, "rewards/rejected": -1.6062105894088745, "step": 1820 }, { "epoch": 1.464, "grad_norm": 7.496504783630371, "learning_rate": 4.967392650695141e-06, "logits/chosen": 0.8542248010635376, "logits/rejected": 0.8511247634887695, "logps/chosen": -208.1324005126953, "logps/rejected": -228.3732452392578, "loss": 0.6527, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2442353963851929, "rewards/margins": 0.4487527906894684, "rewards/rejected": -1.6929882764816284, "step": 1830 }, { "epoch": 1.472, "grad_norm": 11.455583572387695, "learning_rate": 4.966259153095235e-06, "logits/chosen": 0.9046268463134766, "logits/rejected": 0.8542624711990356, "logps/chosen": -218.9237823486328, "logps/rejected": -226.2521209716797, "loss": 0.5853, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -1.0356435775756836, "rewards/margins": 0.6369892954826355, "rewards/rejected": -1.6726330518722534, "step": 1840 }, { "epoch": 1.48, "grad_norm": 4.398818016052246, "learning_rate": 4.965106423111033e-06, "logits/chosen": 0.8832312822341919, "logits/rejected": 0.9116541743278503, "logps/chosen": -205.5150909423828, "logps/rejected": -219.7370147705078, "loss": 0.64, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2404916286468506, "rewards/margins": 0.47511014342308044, "rewards/rejected": -1.7156018018722534, "step": 1850 }, { "epoch": 1.488, "grad_norm": 7.927629470825195, "learning_rate": 4.963934469731756e-06, "logits/chosen": 0.7935855984687805, "logits/rejected": 0.8751907348632812, "logps/chosen": -198.9912567138672, "logps/rejected": -239.27891540527344, "loss": 0.5558, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2322111129760742, "rewards/margins": 0.5577873587608337, "rewards/rejected": -1.7899982929229736, "step": 1860 }, { "epoch": 1.496, "grad_norm": 8.568868637084961, "learning_rate": 4.962743302096532e-06, "logits/chosen": 0.8752376437187195, "logits/rejected": 0.9415783286094666, "logps/chosen": -202.5786895751953, "logps/rejected": -227.0806121826172, "loss": 0.6257, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2775661945343018, "rewards/margins": 0.5068507790565491, "rewards/rejected": -1.7844170331954956, "step": 1870 }, { "epoch": 1.504, "grad_norm": 7.778118133544922, "learning_rate": 4.961532929494325e-06, "logits/chosen": 0.8835949301719666, "logits/rejected": 0.9183811545372009, "logps/chosen": -220.37466430664062, "logps/rejected": -255.732421875, "loss": 0.5707, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3042207956314087, "rewards/margins": 0.6345095634460449, "rewards/rejected": -1.938730239868164, "step": 1880 }, { "epoch": 1.512, "grad_norm": 8.768423080444336, "learning_rate": 4.960303361363863e-06, "logits/chosen": 0.9741565585136414, "logits/rejected": 0.9229215979576111, "logps/chosen": -193.06396484375, "logps/rejected": -212.60093688964844, "loss": 0.5493, "rewards/accuracies": 0.6875, "rewards/chosen": -0.825711190700531, "rewards/margins": 0.6793455481529236, "rewards/rejected": -1.5050567388534546, "step": 1890 }, { "epoch": 1.52, "grad_norm": 8.654824256896973, "learning_rate": 4.959054607293567e-06, "logits/chosen": 0.8612321019172668, "logits/rejected": 0.8589774966239929, "logps/chosen": -211.7546844482422, "logps/rejected": -245.64761352539062, "loss": 0.5589, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3008898496627808, "rewards/margins": 0.6522501111030579, "rewards/rejected": -1.9531399011611938, "step": 1900 }, { "epoch": 1.528, "grad_norm": 8.161283493041992, "learning_rate": 4.9577866770214715e-06, "logits/chosen": 0.9014043807983398, "logits/rejected": 0.972660481929779, "logps/chosen": -225.20498657226562, "logps/rejected": -247.1440887451172, "loss": 0.5944, "rewards/accuracies": 0.6875, "rewards/chosen": -1.402032732963562, "rewards/margins": 0.5216811299324036, "rewards/rejected": -1.9237139225006104, "step": 1910 }, { "epoch": 1.536, "grad_norm": 5.097872257232666, "learning_rate": 4.95649958043515e-06, "logits/chosen": 1.0036065578460693, "logits/rejected": 0.9283719062805176, "logps/chosen": -189.35348510742188, "logps/rejected": -208.98095703125, "loss": 0.6005, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9046257138252258, "rewards/margins": 0.6082975268363953, "rewards/rejected": -1.5129234790802002, "step": 1920 }, { "epoch": 1.544, "grad_norm": 6.37450647354126, "learning_rate": 4.955193327571643e-06, "logits/chosen": 0.9055408835411072, "logits/rejected": 0.8177289366722107, "logps/chosen": -201.5961151123047, "logps/rejected": -227.8748016357422, "loss": 0.5858, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2569955587387085, "rewards/margins": 0.5454604029655457, "rewards/rejected": -1.8024559020996094, "step": 1930 }, { "epoch": 1.552, "grad_norm": 7.622455596923828, "learning_rate": 4.95386792861737e-06, "logits/chosen": 0.917728841304779, "logits/rejected": 0.9097478985786438, "logps/chosen": -226.21763610839844, "logps/rejected": -250.62930297851562, "loss": 0.6142, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4412829875946045, "rewards/margins": 0.5837629437446594, "rewards/rejected": -2.025045871734619, "step": 1940 }, { "epoch": 1.56, "grad_norm": 7.772380828857422, "learning_rate": 4.952523393908059e-06, "logits/chosen": 0.9898624420166016, "logits/rejected": 0.9073271155357361, "logps/chosen": -230.84971618652344, "logps/rejected": -236.9692840576172, "loss": 0.6086, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.1479434967041016, "rewards/margins": 0.5934609770774841, "rewards/rejected": -1.7414045333862305, "step": 1950 }, { "epoch": 1.568, "grad_norm": 4.825398921966553, "learning_rate": 4.951159733928663e-06, "logits/chosen": 0.9667159914970398, "logits/rejected": 0.9257230162620544, "logps/chosen": -201.62188720703125, "logps/rejected": -218.0967254638672, "loss": 0.565, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7356204986572266, "rewards/margins": 0.5399088263511658, "rewards/rejected": -1.2755292654037476, "step": 1960 }, { "epoch": 1.576, "grad_norm": 5.7298760414123535, "learning_rate": 4.949776959313275e-06, "logits/chosen": 1.0072981119155884, "logits/rejected": 1.023651123046875, "logps/chosen": -182.70799255371094, "logps/rejected": -193.568359375, "loss": 0.5054, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4652971923351288, "rewards/margins": 0.755893349647522, "rewards/rejected": -1.2211906909942627, "step": 1970 }, { "epoch": 1.584, "grad_norm": 5.249616622924805, "learning_rate": 4.94837508084505e-06, "logits/chosen": 1.0406349897384644, "logits/rejected": 1.0460046529769897, "logps/chosen": -207.79454040527344, "logps/rejected": -222.9483642578125, "loss": 0.5672, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7943063974380493, "rewards/margins": 0.5170424580574036, "rewards/rejected": -1.311348795890808, "step": 1980 }, { "epoch": 1.592, "grad_norm": 5.766063213348389, "learning_rate": 4.9469541094561185e-06, "logits/chosen": 1.1073545217514038, "logits/rejected": 1.0138510465621948, "logps/chosen": -207.35740661621094, "logps/rejected": -210.7662353515625, "loss": 0.5451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6396881937980652, "rewards/margins": 0.5360140204429626, "rewards/rejected": -1.1757020950317383, "step": 1990 }, { "epoch": 1.6, "grad_norm": 4.6534624099731445, "learning_rate": 4.945514056227499e-06, "logits/chosen": 1.0667814016342163, "logits/rejected": 1.0090528726577759, "logps/chosen": -190.9570770263672, "logps/rejected": -224.06448364257812, "loss": 0.5949, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.903643786907196, "rewards/margins": 0.4600391387939453, "rewards/rejected": -1.3636829853057861, "step": 2000 }, { "epoch": 1.608, "grad_norm": 6.801830291748047, "learning_rate": 4.944054932389018e-06, "logits/chosen": 0.9834865927696228, "logits/rejected": 0.9897794723510742, "logps/chosen": -197.6669158935547, "logps/rejected": -222.3367156982422, "loss": 0.7015, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5120326280593872, "rewards/margins": 0.37569552659988403, "rewards/rejected": -0.8877281546592712, "step": 2010 }, { "epoch": 1.616, "grad_norm": 4.7985663414001465, "learning_rate": 4.942576749319215e-06, "logits/chosen": 0.9592499136924744, "logits/rejected": 0.9662960171699524, "logps/chosen": -208.7536163330078, "logps/rejected": -221.6714324951172, "loss": 0.6111, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.6202236413955688, "rewards/margins": 0.34536442160606384, "rewards/rejected": -0.9655880331993103, "step": 2020 }, { "epoch": 1.624, "grad_norm": 6.010429382324219, "learning_rate": 4.9410795185452584e-06, "logits/chosen": 0.9691095352172852, "logits/rejected": 0.970175564289093, "logps/chosen": -202.74681091308594, "logps/rejected": -219.4949951171875, "loss": 0.6405, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6648755669593811, "rewards/margins": 0.40488871932029724, "rewards/rejected": -1.069764256477356, "step": 2030 }, { "epoch": 1.6320000000000001, "grad_norm": 3.876993179321289, "learning_rate": 4.9395632517428546e-06, "logits/chosen": 0.7599179148674011, "logits/rejected": 0.8189982771873474, "logps/chosen": -203.71568298339844, "logps/rejected": -221.14488220214844, "loss": 0.5479, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6682913303375244, "rewards/margins": 0.7366959452629089, "rewards/rejected": -1.4049873352050781, "step": 2040 }, { "epoch": 1.6400000000000001, "grad_norm": 4.428971767425537, "learning_rate": 4.938027960736158e-06, "logits/chosen": 0.8731341361999512, "logits/rejected": 0.9049884080886841, "logps/chosen": -195.8953399658203, "logps/rejected": -219.8108673095703, "loss": 0.572, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.802505612373352, "rewards/margins": 0.5797830820083618, "rewards/rejected": -1.3822888135910034, "step": 2050 }, { "epoch": 1.6480000000000001, "grad_norm": 4.308506011962891, "learning_rate": 4.936473657497674e-06, "logits/chosen": 0.8433103561401367, "logits/rejected": 0.8728559613227844, "logps/chosen": -189.4270477294922, "logps/rejected": -208.40074157714844, "loss": 0.5589, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5602220892906189, "rewards/margins": 0.6132001280784607, "rewards/rejected": -1.17342209815979, "step": 2060 }, { "epoch": 1.6560000000000001, "grad_norm": 5.429026126861572, "learning_rate": 4.934900354148173e-06, "logits/chosen": 0.8006834387779236, "logits/rejected": 0.8655485510826111, "logps/chosen": -196.53811645507812, "logps/rejected": -226.0628662109375, "loss": 0.5333, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5719932913780212, "rewards/margins": 0.6289103627204895, "rewards/rejected": -1.2009037733078003, "step": 2070 }, { "epoch": 1.6640000000000001, "grad_norm": 7.226114749908447, "learning_rate": 4.933308062956591e-06, "logits/chosen": 0.9665275812149048, "logits/rejected": 0.9210414886474609, "logps/chosen": -216.1051483154297, "logps/rejected": -238.590576171875, "loss": 0.6039, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -1.081291913986206, "rewards/margins": 0.5662848353385925, "rewards/rejected": -1.6475766897201538, "step": 2080 }, { "epoch": 1.6720000000000002, "grad_norm": 5.951044082641602, "learning_rate": 4.931696796339933e-06, "logits/chosen": 0.8365780711174011, "logits/rejected": 0.9172168970108032, "logps/chosen": -222.9845733642578, "logps/rejected": -241.9564666748047, "loss": 0.5248, "rewards/accuracies": 0.75, "rewards/chosen": -0.6349353790283203, "rewards/margins": 0.626502513885498, "rewards/rejected": -1.2614378929138184, "step": 2090 }, { "epoch": 1.6800000000000002, "grad_norm": 5.7516279220581055, "learning_rate": 4.930066566863182e-06, "logits/chosen": 0.8466065526008606, "logits/rejected": 0.8476532101631165, "logps/chosen": -221.70811462402344, "logps/rejected": -249.4915313720703, "loss": 0.5935, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7990935444831848, "rewards/margins": 0.5039268732070923, "rewards/rejected": -1.3030204772949219, "step": 2100 }, { "epoch": 1.688, "grad_norm": 4.736870288848877, "learning_rate": 4.9284173872391925e-06, "logits/chosen": 0.9319866299629211, "logits/rejected": 0.7861051559448242, "logps/chosen": -211.2753143310547, "logps/rejected": -213.59410095214844, "loss": 0.513, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5237230658531189, "rewards/margins": 0.7074744701385498, "rewards/rejected": -1.231197476387024, "step": 2110 }, { "epoch": 1.696, "grad_norm": 6.545098304748535, "learning_rate": 4.9267492703286005e-06, "logits/chosen": 0.8851673007011414, "logits/rejected": 0.9201962351799011, "logps/chosen": -205.5138702392578, "logps/rejected": -210.7584991455078, "loss": 0.6149, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6310408115386963, "rewards/margins": 0.48075610399246216, "rewards/rejected": -1.1117968559265137, "step": 2120 }, { "epoch": 1.704, "grad_norm": 5.556862831115723, "learning_rate": 4.9250622291397144e-06, "logits/chosen": 0.8860400319099426, "logits/rejected": 0.9521721005439758, "logps/chosen": -203.97494506835938, "logps/rejected": -223.2489776611328, "loss": 0.624, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8290230631828308, "rewards/margins": 0.4605173170566559, "rewards/rejected": -1.2895405292510986, "step": 2130 }, { "epoch": 1.712, "grad_norm": 4.479294776916504, "learning_rate": 4.923356276828422e-06, "logits/chosen": 0.9692792892456055, "logits/rejected": 0.9220939874649048, "logps/chosen": -206.8193817138672, "logps/rejected": -212.7848663330078, "loss": 0.5211, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5379689335823059, "rewards/margins": 0.6115099787712097, "rewards/rejected": -1.149478793144226, "step": 2140 }, { "epoch": 1.72, "grad_norm": 4.822434902191162, "learning_rate": 4.921631426698082e-06, "logits/chosen": 0.9812706112861633, "logits/rejected": 0.8581036925315857, "logps/chosen": -204.3318328857422, "logps/rejected": -208.5980224609375, "loss": 0.6165, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.41802969574928284, "rewards/margins": 0.469400554895401, "rewards/rejected": -0.8874303102493286, "step": 2150 }, { "epoch": 1.728, "grad_norm": 7.955755710601807, "learning_rate": 4.919887692199423e-06, "logits/chosen": 1.019281268119812, "logits/rejected": 0.985063374042511, "logps/chosen": -196.29539489746094, "logps/rejected": -185.47842407226562, "loss": 0.5773, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.2744506895542145, "rewards/margins": 0.5127472877502441, "rewards/rejected": -0.787198007106781, "step": 2160 }, { "epoch": 1.736, "grad_norm": 3.87949275970459, "learning_rate": 4.918125086930435e-06, "logits/chosen": 0.8921056985855103, "logits/rejected": 0.8641147613525391, "logps/chosen": -208.4770965576172, "logps/rejected": -232.86192321777344, "loss": 0.5582, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.398175448179245, "rewards/margins": 0.586793839931488, "rewards/rejected": -0.9849693179130554, "step": 2170 }, { "epoch": 1.744, "grad_norm": 5.009764671325684, "learning_rate": 4.91634362463627e-06, "logits/chosen": 0.8331074118614197, "logits/rejected": 0.8069505095481873, "logps/chosen": -194.30528259277344, "logps/rejected": -228.6878204345703, "loss": 0.5577, "rewards/accuracies": 0.75, "rewards/chosen": -0.46142521500587463, "rewards/margins": 0.6860167384147644, "rewards/rejected": -1.1474418640136719, "step": 2180 }, { "epoch": 1.752, "grad_norm": 6.22864294052124, "learning_rate": 4.914543319209126e-06, "logits/chosen": 0.9490287899971008, "logits/rejected": 0.7996692061424255, "logps/chosen": -213.89102172851562, "logps/rejected": -235.63926696777344, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": -0.6507563591003418, "rewards/margins": 0.5299718976020813, "rewards/rejected": -1.1807281970977783, "step": 2190 }, { "epoch": 1.76, "grad_norm": 6.673972129821777, "learning_rate": 4.912724184688149e-06, "logits/chosen": 0.7972329258918762, "logits/rejected": 0.7832920551300049, "logps/chosen": -216.7205810546875, "logps/rejected": -232.1858367919922, "loss": 0.6573, "rewards/accuracies": 0.625, "rewards/chosen": -0.7054974436759949, "rewards/margins": 0.4817284643650055, "rewards/rejected": -1.1872259378433228, "step": 2200 }, { "epoch": 1.768, "grad_norm": 4.410974979400635, "learning_rate": 4.910886235259315e-06, "logits/chosen": 0.8468648791313171, "logits/rejected": 0.8238309025764465, "logps/chosen": -206.13621520996094, "logps/rejected": -206.95458984375, "loss": 0.5607, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.4652062952518463, "rewards/margins": 0.5647412538528442, "rewards/rejected": -1.0299476385116577, "step": 2210 }, { "epoch": 1.776, "grad_norm": 5.151928424835205, "learning_rate": 4.909029485255321e-06, "logits/chosen": 0.883913516998291, "logits/rejected": 0.8439237475395203, "logps/chosen": -216.8835906982422, "logps/rejected": -225.7892303466797, "loss": 0.6172, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.736081063747406, "rewards/margins": 0.49011898040771484, "rewards/rejected": -1.2262001037597656, "step": 2220 }, { "epoch": 1.784, "grad_norm": 4.777335166931152, "learning_rate": 4.907153949155479e-06, "logits/chosen": 0.8213723301887512, "logits/rejected": 0.77827388048172, "logps/chosen": -216.30235290527344, "logps/rejected": -234.95713806152344, "loss": 0.5687, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8275079131126404, "rewards/margins": 0.5740241408348083, "rewards/rejected": -1.4015320539474487, "step": 2230 }, { "epoch": 1.792, "grad_norm": 5.356624126434326, "learning_rate": 4.905259641585594e-06, "logits/chosen": 1.0409396886825562, "logits/rejected": 0.9675580263137817, "logps/chosen": -189.8865966796875, "logps/rejected": -210.8790283203125, "loss": 0.5454, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5686191916465759, "rewards/margins": 0.596284806728363, "rewards/rejected": -1.164903998374939, "step": 2240 }, { "epoch": 1.8, "grad_norm": 7.358102798461914, "learning_rate": 4.903346577317859e-06, "logits/chosen": 0.9212884306907654, "logits/rejected": 0.9235758185386658, "logps/chosen": -205.32864379882812, "logps/rejected": -209.7302703857422, "loss": 0.6964, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7906255125999451, "rewards/margins": 0.18452060222625732, "rewards/rejected": -0.9751461148262024, "step": 2250 }, { "epoch": 1.808, "grad_norm": 6.358880043029785, "learning_rate": 4.901414771270732e-06, "logits/chosen": 1.0110472440719604, "logits/rejected": 0.9830945134162903, "logps/chosen": -196.3888397216797, "logps/rejected": -196.31834411621094, "loss": 0.5517, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.44832611083984375, "rewards/margins": 0.5208678841590881, "rewards/rejected": -0.9691939353942871, "step": 2260 }, { "epoch": 1.8159999999999998, "grad_norm": 4.602325439453125, "learning_rate": 4.899464238508826e-06, "logits/chosen": 0.9746546149253845, "logits/rejected": 0.9642091989517212, "logps/chosen": -196.0602569580078, "logps/rejected": -214.0269012451172, "loss": 0.5631, "rewards/accuracies": 0.75, "rewards/chosen": -0.30714574456214905, "rewards/margins": 0.5362817049026489, "rewards/rejected": -0.8434274792671204, "step": 2270 }, { "epoch": 1.8239999999999998, "grad_norm": 4.84414005279541, "learning_rate": 4.8974949942427854e-06, "logits/chosen": 0.9626126289367676, "logits/rejected": 0.9172550439834595, "logps/chosen": -193.07933044433594, "logps/rejected": -198.24754333496094, "loss": 0.625, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4726344645023346, "rewards/margins": 0.4540925920009613, "rewards/rejected": -0.9267271161079407, "step": 2280 }, { "epoch": 1.8319999999999999, "grad_norm": 8.682156562805176, "learning_rate": 4.895507053829174e-06, "logits/chosen": 0.8741292953491211, "logits/rejected": 0.9165781140327454, "logps/chosen": -206.250732421875, "logps/rejected": -249.11558532714844, "loss": 0.5665, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.6650144457817078, "rewards/margins": 0.7068294286727905, "rewards/rejected": -1.3718438148498535, "step": 2290 }, { "epoch": 1.8399999999999999, "grad_norm": 5.111639499664307, "learning_rate": 4.893500432770349e-06, "logits/chosen": 0.9301668405532837, "logits/rejected": 0.9247567057609558, "logps/chosen": -206.8374786376953, "logps/rejected": -210.31265258789062, "loss": 0.5961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7106897234916687, "rewards/margins": 0.40209847688674927, "rewards/rejected": -1.112788200378418, "step": 2300 }, { "epoch": 1.8479999999999999, "grad_norm": 5.505365371704102, "learning_rate": 4.891475146714348e-06, "logits/chosen": 0.9037653207778931, "logits/rejected": 0.8538966178894043, "logps/chosen": -199.22640991210938, "logps/rejected": -212.10537719726562, "loss": 0.5586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9016987085342407, "rewards/margins": 0.6126433610916138, "rewards/rejected": -1.514341950416565, "step": 2310 }, { "epoch": 1.8559999999999999, "grad_norm": 4.674638748168945, "learning_rate": 4.889431211454753e-06, "logits/chosen": 0.8724443316459656, "logits/rejected": 0.9939247965812683, "logps/chosen": -213.60525512695312, "logps/rejected": -227.02359008789062, "loss": 0.5838, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -1.1054712533950806, "rewards/margins": 0.44871193170547485, "rewards/rejected": -1.5541832447052002, "step": 2320 }, { "epoch": 1.8639999999999999, "grad_norm": 7.4808220863342285, "learning_rate": 4.887368642930588e-06, "logits/chosen": 0.9019695520401001, "logits/rejected": 0.8304522633552551, "logps/chosen": -211.55125427246094, "logps/rejected": -225.2754364013672, "loss": 0.6556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.171556830406189, "rewards/margins": 0.2245822697877884, "rewards/rejected": -1.396139144897461, "step": 2330 }, { "epoch": 1.8719999999999999, "grad_norm": 6.583737373352051, "learning_rate": 4.8852874572261715e-06, "logits/chosen": 0.9607311487197876, "logits/rejected": 0.8902657628059387, "logps/chosen": -194.19366455078125, "logps/rejected": -233.3009796142578, "loss": 0.5089, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.9743821024894714, "rewards/margins": 0.8457622528076172, "rewards/rejected": -1.8201442956924438, "step": 2340 }, { "epoch": 1.88, "grad_norm": 8.004322052001953, "learning_rate": 4.88318767057101e-06, "logits/chosen": 0.6777879595756531, "logits/rejected": 0.7692098021507263, "logps/chosen": -226.11428833007812, "logps/rejected": -252.94114685058594, "loss": 0.6428, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.4815189838409424, "rewards/margins": 0.4003751873970032, "rewards/rejected": -1.8818941116333008, "step": 2350 }, { "epoch": 1.888, "grad_norm": 5.834934234619141, "learning_rate": 4.881069299339662e-06, "logits/chosen": 0.9836352467536926, "logits/rejected": 0.925758957862854, "logps/chosen": -196.276611328125, "logps/rejected": -216.8465118408203, "loss": 0.5801, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.6051527261734009, "rewards/margins": 0.5349343419075012, "rewards/rejected": -1.1400870084762573, "step": 2360 }, { "epoch": 1.896, "grad_norm": 3.793811321258545, "learning_rate": 4.878932360051611e-06, "logits/chosen": 0.9329185485839844, "logits/rejected": 0.8791740536689758, "logps/chosen": -198.0451202392578, "logps/rejected": -204.83177185058594, "loss": 0.5089, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.7063581943511963, "rewards/margins": 0.798491895198822, "rewards/rejected": -1.5048500299453735, "step": 2370 }, { "epoch": 1.904, "grad_norm": 6.470005512237549, "learning_rate": 4.876776869371139e-06, "logits/chosen": 0.9098548889160156, "logits/rejected": 0.9413707852363586, "logps/chosen": -205.4551239013672, "logps/rejected": -206.733642578125, "loss": 0.543, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8138889670372009, "rewards/margins": 0.5941287279129028, "rewards/rejected": -1.4080175161361694, "step": 2380 }, { "epoch": 1.912, "grad_norm": 7.808496475219727, "learning_rate": 4.874602844107195e-06, "logits/chosen": 1.01150643825531, "logits/rejected": 0.873560905456543, "logps/chosen": -209.4512939453125, "logps/rejected": -219.6645965576172, "loss": 0.649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0077110528945923, "rewards/margins": 0.44567227363586426, "rewards/rejected": -1.453383445739746, "step": 2390 }, { "epoch": 1.92, "grad_norm": 10.799213409423828, "learning_rate": 4.872410301213265e-06, "logits/chosen": 0.9560382962226868, "logits/rejected": 0.9300372004508972, "logps/chosen": -196.2581329345703, "logps/rejected": -212.28335571289062, "loss": 0.582, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8118780255317688, "rewards/margins": 0.4887621998786926, "rewards/rejected": -1.3006402254104614, "step": 2400 }, { "epoch": 1.928, "grad_norm": 4.009180068969727, "learning_rate": 4.87019925778724e-06, "logits/chosen": 0.7739871144294739, "logits/rejected": 0.7527830004692078, "logps/chosen": -202.1708221435547, "logps/rejected": -212.5369415283203, "loss": 0.5072, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6656922101974487, "rewards/margins": 0.811048150062561, "rewards/rejected": -1.4767402410507202, "step": 2410 }, { "epoch": 1.936, "grad_norm": 4.5736799240112305, "learning_rate": 4.867969731071279e-06, "logits/chosen": 0.7770849466323853, "logits/rejected": 0.8255655169487, "logps/chosen": -192.43284606933594, "logps/rejected": -222.0802001953125, "loss": 0.5829, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6381308436393738, "rewards/margins": 0.6074991226196289, "rewards/rejected": -1.245630145072937, "step": 2420 }, { "epoch": 1.944, "grad_norm": 7.796564102172852, "learning_rate": 4.86572173845168e-06, "logits/chosen": 0.9963685870170593, "logits/rejected": 0.986523449420929, "logps/chosen": -213.4120635986328, "logps/rejected": -214.7560577392578, "loss": 0.664, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": -0.7294261455535889, "rewards/margins": 0.35328245162963867, "rewards/rejected": -1.0827085971832275, "step": 2430 }, { "epoch": 1.952, "grad_norm": 5.976226329803467, "learning_rate": 4.863455297458741e-06, "logits/chosen": 0.8093425631523132, "logits/rejected": 0.8130960464477539, "logps/chosen": -223.12828063964844, "logps/rejected": -246.886474609375, "loss": 0.6872, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.014013648033142, "rewards/margins": 0.4549499452114105, "rewards/rejected": -1.468963861465454, "step": 2440 }, { "epoch": 1.96, "grad_norm": 5.015947341918945, "learning_rate": 4.861170425766625e-06, "logits/chosen": 0.7283975481987, "logits/rejected": 0.7141422629356384, "logps/chosen": -223.5131378173828, "logps/rejected": -231.6356964111328, "loss": 0.5357, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8296670913696289, "rewards/margins": 0.6724082827568054, "rewards/rejected": -1.5020753145217896, "step": 2450 }, { "epoch": 1.968, "grad_norm": 5.242801666259766, "learning_rate": 4.8588671411932195e-06, "logits/chosen": 0.8053815960884094, "logits/rejected": 0.8061729669570923, "logps/chosen": -214.3092041015625, "logps/rejected": -251.5323486328125, "loss": 0.5139, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6689096689224243, "rewards/margins": 0.6673839688301086, "rewards/rejected": -1.3362935781478882, "step": 2460 }, { "epoch": 1.976, "grad_norm": 5.3611578941345215, "learning_rate": 4.8565454616999995e-06, "logits/chosen": 0.7702573537826538, "logits/rejected": 0.8161222338676453, "logps/chosen": -194.3104248046875, "logps/rejected": -229.4737091064453, "loss": 0.5365, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5571290850639343, "rewards/margins": 0.699532151222229, "rewards/rejected": -1.2566611766815186, "step": 2470 }, { "epoch": 1.984, "grad_norm": 6.883541107177734, "learning_rate": 4.85420540539189e-06, "logits/chosen": 0.8040187954902649, "logits/rejected": 0.8295624852180481, "logps/chosen": -201.51536560058594, "logps/rejected": -207.3661651611328, "loss": 0.7064, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.622946560382843, "rewards/margins": 0.24029460549354553, "rewards/rejected": -0.8632411956787109, "step": 2480 }, { "epoch": 1.992, "grad_norm": 6.731320381164551, "learning_rate": 4.851846990517118e-06, "logits/chosen": 0.8790884017944336, "logits/rejected": 0.8123799562454224, "logps/chosen": -206.55789184570312, "logps/rejected": -234.25, "loss": 0.5478, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8480615019798279, "rewards/margins": 0.6503373980522156, "rewards/rejected": -1.498399019241333, "step": 2490 }, { "epoch": 2.0, "grad_norm": 5.269995212554932, "learning_rate": 4.849470235467079e-06, "logits/chosen": 0.9131700396537781, "logits/rejected": 0.8961302638053894, "logps/chosen": -205.96377563476562, "logps/rejected": -208.3470458984375, "loss": 0.643, "rewards/accuracies": 0.5875000357627869, "rewards/chosen": -0.4859640598297119, "rewards/margins": 0.346769243478775, "rewards/rejected": -0.8327333331108093, "step": 2500 }, { "epoch": 2.008, "grad_norm": 4.879722595214844, "learning_rate": 4.847075158776183e-06, "logits/chosen": 0.8211914300918579, "logits/rejected": 0.8813266754150391, "logps/chosen": -202.9598846435547, "logps/rejected": -227.003173828125, "loss": 0.5717, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9907218813896179, "rewards/margins": 0.5864070057868958, "rewards/rejected": -1.5771290063858032, "step": 2510 }, { "epoch": 2.016, "grad_norm": 4.4429168701171875, "learning_rate": 4.844661779121723e-06, "logits/chosen": 0.8528118133544922, "logits/rejected": 0.8727533221244812, "logps/chosen": -206.079833984375, "logps/rejected": -226.8438262939453, "loss": 0.4943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6906086206436157, "rewards/margins": 0.6708667874336243, "rewards/rejected": -1.3614753484725952, "step": 2520 }, { "epoch": 2.024, "grad_norm": 5.297170162200928, "learning_rate": 4.842230115323715e-06, "logits/chosen": 0.8990702629089355, "logits/rejected": 0.8813673257827759, "logps/chosen": -210.3358917236328, "logps/rejected": -215.9623565673828, "loss": 0.4864, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.9391742944717407, "rewards/margins": 0.8787464499473572, "rewards/rejected": -1.8179206848144531, "step": 2530 }, { "epoch": 2.032, "grad_norm": 5.085737228393555, "learning_rate": 4.839780186344763e-06, "logits/chosen": 0.7243828177452087, "logits/rejected": 0.5710186958312988, "logps/chosen": -218.251708984375, "logps/rejected": -233.02012634277344, "loss": 0.5227, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8471910357475281, "rewards/margins": 0.7661673426628113, "rewards/rejected": -1.6133583784103394, "step": 2540 }, { "epoch": 2.04, "grad_norm": 6.776291847229004, "learning_rate": 4.837312011289907e-06, "logits/chosen": 0.8092204332351685, "logits/rejected": 0.7919169664382935, "logps/chosen": -233.19627380371094, "logps/rejected": -239.8424835205078, "loss": 0.5437, "rewards/accuracies": 0.75, "rewards/chosen": -1.3999868631362915, "rewards/margins": 0.5697715878486633, "rewards/rejected": -1.96975839138031, "step": 2550 }, { "epoch": 2.048, "grad_norm": 4.587131500244141, "learning_rate": 4.834825609406469e-06, "logits/chosen": 0.8447713851928711, "logits/rejected": 0.8319129347801208, "logps/chosen": -197.89254760742188, "logps/rejected": -224.28733825683594, "loss": 0.4837, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.7428638339042664, "rewards/margins": 0.8988615870475769, "rewards/rejected": -1.6417255401611328, "step": 2560 }, { "epoch": 2.056, "grad_norm": 5.485004425048828, "learning_rate": 4.832321000083912e-06, "logits/chosen": 0.6010708212852478, "logits/rejected": 0.6102247834205627, "logps/chosen": -233.9095001220703, "logps/rejected": -258.9781188964844, "loss": 0.5354, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8256379961967468, "rewards/margins": 0.7594703435897827, "rewards/rejected": -1.5851082801818848, "step": 2570 }, { "epoch": 2.064, "grad_norm": 4.678139686584473, "learning_rate": 4.829798202853683e-06, "logits/chosen": 0.7654294371604919, "logits/rejected": 0.827416718006134, "logps/chosen": -204.60549926757812, "logps/rejected": -226.6314239501953, "loss": 0.5546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.819423496723175, "rewards/margins": 0.5961082577705383, "rewards/rejected": -1.4155317544937134, "step": 2580 }, { "epoch": 2.072, "grad_norm": 7.408326625823975, "learning_rate": 4.82725723738906e-06, "logits/chosen": 0.9313848614692688, "logits/rejected": 0.9375463724136353, "logps/chosen": -222.64830017089844, "logps/rejected": -225.4774169921875, "loss": 0.6477, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.9466853141784668, "rewards/margins": 0.3815837800502777, "rewards/rejected": -1.328269124031067, "step": 2590 }, { "epoch": 2.08, "grad_norm": 7.954169273376465, "learning_rate": 4.824698123505004e-06, "logits/chosen": 0.8060113191604614, "logits/rejected": 0.7566053867340088, "logps/chosen": -192.39781188964844, "logps/rejected": -221.01512145996094, "loss": 0.5003, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4853571057319641, "rewards/margins": 0.7401863932609558, "rewards/rejected": -1.2255436182022095, "step": 2600 }, { "epoch": 2.088, "grad_norm": 6.623762607574463, "learning_rate": 4.822120881157998e-06, "logits/chosen": 0.8647942543029785, "logits/rejected": 0.8719661831855774, "logps/chosen": -221.8902587890625, "logps/rejected": -240.48338317871094, "loss": 0.5385, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.855373203754425, "rewards/margins": 0.6286361813545227, "rewards/rejected": -1.4840092658996582, "step": 2610 }, { "epoch": 2.096, "grad_norm": 5.830476760864258, "learning_rate": 4.8195255304458945e-06, "logits/chosen": 0.8587938547134399, "logits/rejected": 0.7880618572235107, "logps/chosen": -208.57481384277344, "logps/rejected": -225.7516632080078, "loss": 0.5388, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.8550947308540344, "rewards/margins": 0.61111980676651, "rewards/rejected": -1.4662145376205444, "step": 2620 }, { "epoch": 2.104, "grad_norm": 5.968774795532227, "learning_rate": 4.8169120916077626e-06, "logits/chosen": 0.8810015916824341, "logits/rejected": 0.8329893946647644, "logps/chosen": -206.7748260498047, "logps/rejected": -212.00332641601562, "loss": 0.4447, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9596864581108093, "rewards/margins": 0.9688228964805603, "rewards/rejected": -1.9285091161727905, "step": 2630 }, { "epoch": 2.112, "grad_norm": 4.1383161544799805, "learning_rate": 4.81428058502372e-06, "logits/chosen": 0.7850375175476074, "logits/rejected": 0.7830752730369568, "logps/chosen": -215.91201782226562, "logps/rejected": -243.4402313232422, "loss": 0.5962, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5484000444412231, "rewards/margins": 0.7390782237052917, "rewards/rejected": -2.28747820854187, "step": 2640 }, { "epoch": 2.12, "grad_norm": 5.95521879196167, "learning_rate": 4.811631031214787e-06, "logits/chosen": 0.7586268782615662, "logits/rejected": 0.7738537192344666, "logps/chosen": -226.69517517089844, "logps/rejected": -252.53208923339844, "loss": 0.4531, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.1299831867218018, "rewards/margins": 1.046120047569275, "rewards/rejected": -2.176103353500366, "step": 2650 }, { "epoch": 2.128, "grad_norm": 5.219895839691162, "learning_rate": 4.808963450842713e-06, "logits/chosen": 0.9560911059379578, "logits/rejected": 0.9116310477256775, "logps/chosen": -205.336181640625, "logps/rejected": -206.9613800048828, "loss": 0.5378, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8686642050743103, "rewards/margins": 0.6341574788093567, "rewards/rejected": -1.5028215646743774, "step": 2660 }, { "epoch": 2.136, "grad_norm": 4.962176322937012, "learning_rate": 4.806277864709828e-06, "logits/chosen": 0.9082285165786743, "logits/rejected": 0.8512415885925293, "logps/chosen": -217.08535766601562, "logps/rejected": -218.56155395507812, "loss": 0.4624, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5144981741905212, "rewards/margins": 0.8028311133384705, "rewards/rejected": -1.3173291683197021, "step": 2670 }, { "epoch": 2.144, "grad_norm": 6.617359161376953, "learning_rate": 4.803574293758873e-06, "logits/chosen": 0.8195849657058716, "logits/rejected": 0.8258503079414368, "logps/chosen": -227.69168090820312, "logps/rejected": -232.21450805664062, "loss": 0.5883, "rewards/accuracies": 0.75, "rewards/chosen": -1.4341284036636353, "rewards/margins": 0.5511090755462646, "rewards/rejected": -1.9852374792099, "step": 2680 }, { "epoch": 2.152, "grad_norm": 4.731860637664795, "learning_rate": 4.800852759072834e-06, "logits/chosen": 1.0406241416931152, "logits/rejected": 0.9129249453544617, "logps/chosen": -212.03970336914062, "logps/rejected": -217.3359832763672, "loss": 0.5685, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -1.189060926437378, "rewards/margins": 0.6306447386741638, "rewards/rejected": -1.819705605506897, "step": 2690 }, { "epoch": 2.16, "grad_norm": 6.71656608581543, "learning_rate": 4.798113281874788e-06, "logits/chosen": 0.8752555847167969, "logits/rejected": 0.8828164935112, "logps/chosen": -216.815673828125, "logps/rejected": -230.06640625, "loss": 0.5298, "rewards/accuracies": 0.75, "rewards/chosen": -0.9842235445976257, "rewards/margins": 0.7561108469963074, "rewards/rejected": -1.740334391593933, "step": 2700 }, { "epoch": 2.168, "grad_norm": 5.923402309417725, "learning_rate": 4.795355883527727e-06, "logits/chosen": 1.119627833366394, "logits/rejected": 1.0217770338058472, "logps/chosen": -214.1453094482422, "logps/rejected": -225.8303680419922, "loss": 0.4973, "rewards/accuracies": 0.75, "rewards/chosen": -1.4931985139846802, "rewards/margins": 0.8571271300315857, "rewards/rejected": -2.350325584411621, "step": 2710 }, { "epoch": 2.176, "grad_norm": 5.549647331237793, "learning_rate": 4.792580585534398e-06, "logits/chosen": 0.976405918598175, "logits/rejected": 1.0035079717636108, "logps/chosen": -200.0377960205078, "logps/rejected": -236.29800415039062, "loss": 0.5329, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0612291097640991, "rewards/margins": 0.7105744481086731, "rewards/rejected": -1.7718034982681274, "step": 2720 }, { "epoch": 2.184, "grad_norm": 8.303404808044434, "learning_rate": 4.789787409537131e-06, "logits/chosen": 0.9948248267173767, "logits/rejected": 0.940967857837677, "logps/chosen": -207.9303436279297, "logps/rejected": -244.0626220703125, "loss": 0.5104, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.080901026725769, "rewards/margins": 0.7775439620018005, "rewards/rejected": -1.8584450483322144, "step": 2730 }, { "epoch": 2.192, "grad_norm": 6.788060188293457, "learning_rate": 4.786976377317676e-06, "logits/chosen": 0.9980520606040955, "logits/rejected": 0.9337381720542908, "logps/chosen": -205.30735778808594, "logps/rejected": -212.91213989257812, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -1.287164568901062, "rewards/margins": 0.6435604691505432, "rewards/rejected": -1.93072509765625, "step": 2740 }, { "epoch": 2.2, "grad_norm": 5.9607133865356445, "learning_rate": 4.784147510797024e-06, "logits/chosen": 0.956340491771698, "logits/rejected": 0.8990178108215332, "logps/chosen": -207.7548828125, "logps/rejected": -227.23081970214844, "loss": 0.5219, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -1.1541296243667603, "rewards/margins": 0.8181205987930298, "rewards/rejected": -1.97225022315979, "step": 2750 }, { "epoch": 2.208, "grad_norm": 4.65596342086792, "learning_rate": 4.7813008320352475e-06, "logits/chosen": 0.886645495891571, "logits/rejected": 0.8505622148513794, "logps/chosen": -201.58055114746094, "logps/rejected": -216.75845336914062, "loss": 0.5183, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -0.5563634037971497, "rewards/margins": 0.788402259349823, "rewards/rejected": -1.344765543937683, "step": 2760 }, { "epoch": 2.216, "grad_norm": 3.351832628250122, "learning_rate": 4.778436363231317e-06, "logits/chosen": 0.9784888625144958, "logits/rejected": 0.9678120613098145, "logps/chosen": -209.824462890625, "logps/rejected": -236.84239196777344, "loss": 0.5421, "rewards/accuracies": 0.75, "rewards/chosen": -1.1734176874160767, "rewards/margins": 0.7453652024269104, "rewards/rejected": -1.9187828302383423, "step": 2770 }, { "epoch": 2.224, "grad_norm": 5.160139560699463, "learning_rate": 4.775554126722935e-06, "logits/chosen": 1.0381357669830322, "logits/rejected": 0.8997783064842224, "logps/chosen": -215.1231689453125, "logps/rejected": -221.33921813964844, "loss": 0.4999, "rewards/accuracies": 0.75, "rewards/chosen": -0.8012914657592773, "rewards/margins": 0.8245387077331543, "rewards/rejected": -1.625830054283142, "step": 2780 }, { "epoch": 2.232, "grad_norm": 6.553884983062744, "learning_rate": 4.772654144986364e-06, "logits/chosen": 1.0697473287582397, "logits/rejected": 0.9900814890861511, "logps/chosen": -190.27476501464844, "logps/rejected": -185.47320556640625, "loss": 0.5253, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6551761031150818, "rewards/margins": 0.7400442361831665, "rewards/rejected": -1.395220398902893, "step": 2790 }, { "epoch": 2.24, "grad_norm": 7.37640905380249, "learning_rate": 4.7697364406362415e-06, "logits/chosen": 0.9499724507331848, "logits/rejected": 0.9506826400756836, "logps/chosen": -211.59814453125, "logps/rejected": -214.0547637939453, "loss": 0.5319, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9677931070327759, "rewards/margins": 0.7696712613105774, "rewards/rejected": -1.7374645471572876, "step": 2800 }, { "epoch": 2.248, "grad_norm": 7.318824291229248, "learning_rate": 4.766801036425413e-06, "logits/chosen": 0.9329463243484497, "logits/rejected": 0.9652411341667175, "logps/chosen": -198.66159057617188, "logps/rejected": -218.34695434570312, "loss": 0.4791, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.636730432510376, "rewards/margins": 0.9497951865196228, "rewards/rejected": -1.5865256786346436, "step": 2810 }, { "epoch": 2.2560000000000002, "grad_norm": 8.074874877929688, "learning_rate": 4.763847955244749e-06, "logits/chosen": 0.9467241168022156, "logits/rejected": 0.9240646362304688, "logps/chosen": -195.85824584960938, "logps/rejected": -213.454345703125, "loss": 0.4331, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.5387411117553711, "rewards/margins": 0.931182861328125, "rewards/rejected": -1.4699242115020752, "step": 2820 }, { "epoch": 2.2640000000000002, "grad_norm": 5.222074508666992, "learning_rate": 4.760877220122972e-06, "logits/chosen": 1.0361279249191284, "logits/rejected": 0.9784205555915833, "logps/chosen": -217.472412109375, "logps/rejected": -220.15403747558594, "loss": 0.4908, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6365905404090881, "rewards/margins": 0.8387727737426758, "rewards/rejected": -1.4753633737564087, "step": 2830 }, { "epoch": 2.2720000000000002, "grad_norm": 12.525833129882812, "learning_rate": 4.757888854226469e-06, "logits/chosen": 0.980974018573761, "logits/rejected": 0.8848344683647156, "logps/chosen": -217.03807067871094, "logps/rejected": -220.2837371826172, "loss": 0.6158, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.099002480506897, "rewards/margins": 0.5908426642417908, "rewards/rejected": -1.689845085144043, "step": 2840 }, { "epoch": 2.2800000000000002, "grad_norm": 7.606281280517578, "learning_rate": 4.75488288085912e-06, "logits/chosen": 0.886518120765686, "logits/rejected": 0.8294790387153625, "logps/chosen": -223.3667449951172, "logps/rejected": -241.37620544433594, "loss": 0.5392, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5162465572357178, "rewards/margins": 0.6705383658409119, "rewards/rejected": -2.1867847442626953, "step": 2850 }, { "epoch": 2.288, "grad_norm": 6.877048015594482, "learning_rate": 4.751859323462106e-06, "logits/chosen": 0.8719585537910461, "logits/rejected": 0.8641806840896606, "logps/chosen": -196.090576171875, "logps/rejected": -222.262451171875, "loss": 0.4964, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1291723251342773, "rewards/margins": 0.9274942278862, "rewards/rejected": -2.056666612625122, "step": 2860 }, { "epoch": 2.296, "grad_norm": 8.89814281463623, "learning_rate": 4.748818205613738e-06, "logits/chosen": 0.9758926630020142, "logits/rejected": 0.8365556597709656, "logps/chosen": -208.0929412841797, "logps/rejected": -219.41197204589844, "loss": 0.5806, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.8481144905090332, "rewards/margins": 0.6624639630317688, "rewards/rejected": -1.5105783939361572, "step": 2870 }, { "epoch": 2.304, "grad_norm": 10.317713737487793, "learning_rate": 4.7457595510292615e-06, "logits/chosen": 0.9229365587234497, "logits/rejected": 0.7857539057731628, "logps/chosen": -219.9251251220703, "logps/rejected": -253.33775329589844, "loss": 0.6269, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2590759992599487, "rewards/margins": 0.696280300617218, "rewards/rejected": -1.9553560018539429, "step": 2880 }, { "epoch": 2.312, "grad_norm": 4.831161022186279, "learning_rate": 4.7426833835606815e-06, "logits/chosen": 0.977482259273529, "logits/rejected": 0.9541126489639282, "logps/chosen": -201.36666870117188, "logps/rejected": -229.928466796875, "loss": 0.5535, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8802324533462524, "rewards/margins": 0.7807623744010925, "rewards/rejected": -1.6609947681427002, "step": 2890 }, { "epoch": 2.32, "grad_norm": 10.042828559875488, "learning_rate": 4.7395897271965676e-06, "logits/chosen": 0.9359084963798523, "logits/rejected": 0.8896707892417908, "logps/chosen": -226.84707641601562, "logps/rejected": -231.924072265625, "loss": 0.5353, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.173598051071167, "rewards/margins": 0.6731023192405701, "rewards/rejected": -1.8467003107070923, "step": 2900 }, { "epoch": 2.328, "grad_norm": 7.612265110015869, "learning_rate": 4.736478606061876e-06, "logits/chosen": 0.9218583106994629, "logits/rejected": 0.8763092160224915, "logps/chosen": -189.9997100830078, "logps/rejected": -222.7259521484375, "loss": 0.4865, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.624975323677063, "rewards/margins": 0.9700756072998047, "rewards/rejected": -1.5950509309768677, "step": 2910 }, { "epoch": 2.336, "grad_norm": 7.228297710418701, "learning_rate": 4.733350044417752e-06, "logits/chosen": 0.8519703149795532, "logits/rejected": 0.9009062051773071, "logps/chosen": -211.00601196289062, "logps/rejected": -227.3970184326172, "loss": 0.5175, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1423438787460327, "rewards/margins": 0.9418218731880188, "rewards/rejected": -2.084165573120117, "step": 2920 }, { "epoch": 2.344, "grad_norm": 7.3916192054748535, "learning_rate": 4.730204066661349e-06, "logits/chosen": 0.8180250525474548, "logits/rejected": 0.8404077887535095, "logps/chosen": -219.1990509033203, "logps/rejected": -247.86865234375, "loss": 0.4079, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.1498545408248901, "rewards/margins": 1.2002915143966675, "rewards/rejected": -2.3501460552215576, "step": 2930 }, { "epoch": 2.352, "grad_norm": 7.41680383682251, "learning_rate": 4.727040697325634e-06, "logits/chosen": 0.8754854202270508, "logits/rejected": 0.8143682479858398, "logps/chosen": -216.9535675048828, "logps/rejected": -224.46226501464844, "loss": 0.4467, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9588058590888977, "rewards/margins": 0.9411051869392395, "rewards/rejected": -1.8999111652374268, "step": 2940 }, { "epoch": 2.36, "grad_norm": 9.07734203338623, "learning_rate": 4.723859961079196e-06, "logits/chosen": 0.9282005429267883, "logits/rejected": 1.0409669876098633, "logps/chosen": -200.03123474121094, "logps/rejected": -205.33560180664062, "loss": 0.5902, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9528915286064148, "rewards/margins": 0.5995592474937439, "rewards/rejected": -1.5524507761001587, "step": 2950 }, { "epoch": 2.368, "grad_norm": 7.95607328414917, "learning_rate": 4.720661882726054e-06, "logits/chosen": 0.8046501278877258, "logits/rejected": 0.8820120096206665, "logps/chosen": -198.43272399902344, "logps/rejected": -228.85073852539062, "loss": 0.5915, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2360581159591675, "rewards/margins": 0.611167848110199, "rewards/rejected": -1.8472260236740112, "step": 2960 }, { "epoch": 2.376, "grad_norm": 9.31917667388916, "learning_rate": 4.717446487205466e-06, "logits/chosen": 1.0128583908081055, "logits/rejected": 0.8825269937515259, "logps/chosen": -221.35952758789062, "logps/rejected": -233.3060760498047, "loss": 0.5001, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7609128952026367, "rewards/margins": 0.8846578598022461, "rewards/rejected": -1.6455707550048828, "step": 2970 }, { "epoch": 2.384, "grad_norm": 6.160464763641357, "learning_rate": 4.714213799591733e-06, "logits/chosen": 0.9421942830085754, "logits/rejected": 0.9018292427062988, "logps/chosen": -205.28750610351562, "logps/rejected": -214.02294921875, "loss": 0.5016, "rewards/accuracies": 0.75, "rewards/chosen": -0.7612765431404114, "rewards/margins": 0.7555828094482422, "rewards/rejected": -1.5168594121932983, "step": 2980 }, { "epoch": 2.392, "grad_norm": 5.200317859649658, "learning_rate": 4.710963845094003e-06, "logits/chosen": 0.8887328505516052, "logits/rejected": 0.8375994563102722, "logps/chosen": -196.9802703857422, "logps/rejected": -206.02444458007812, "loss": 0.5104, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.6040834784507751, "rewards/margins": 0.7478589415550232, "rewards/rejected": -1.3519423007965088, "step": 2990 }, { "epoch": 2.4, "grad_norm": 5.419322490692139, "learning_rate": 4.707696649056073e-06, "logits/chosen": 0.8664781451225281, "logits/rejected": 0.8220338821411133, "logps/chosen": -191.3097686767578, "logps/rejected": -210.8022918701172, "loss": 0.4085, "rewards/accuracies": 0.875, "rewards/chosen": -0.4698951244354248, "rewards/margins": 1.0510791540145874, "rewards/rejected": -1.5209741592407227, "step": 3000 }, { "epoch": 2.408, "grad_norm": 7.53326416015625, "learning_rate": 4.704412236956194e-06, "logits/chosen": 0.8981558680534363, "logits/rejected": 0.9666721224784851, "logps/chosen": -210.7949981689453, "logps/rejected": -252.54966735839844, "loss": 0.4686, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.2617671489715576, "rewards/margins": 1.07135808467865, "rewards/rejected": -2.333125352859497, "step": 3010 }, { "epoch": 2.416, "grad_norm": 4.284425258636475, "learning_rate": 4.701110634406871e-06, "logits/chosen": 0.7835731506347656, "logits/rejected": 0.7178624272346497, "logps/chosen": -197.4336395263672, "logps/rejected": -236.43325805664062, "loss": 0.456, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1816260814666748, "rewards/margins": 1.00633704662323, "rewards/rejected": -2.1879632472991943, "step": 3020 }, { "epoch": 2.424, "grad_norm": 8.3016357421875, "learning_rate": 4.6977918671546635e-06, "logits/chosen": 0.9753482937812805, "logits/rejected": 0.8922098278999329, "logps/chosen": -211.79405212402344, "logps/rejected": -218.29739379882812, "loss": 0.5311, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0505523681640625, "rewards/margins": 0.8706418871879578, "rewards/rejected": -1.9211944341659546, "step": 3030 }, { "epoch": 2.432, "grad_norm": 9.126633644104004, "learning_rate": 4.6944559610799865e-06, "logits/chosen": 1.0277721881866455, "logits/rejected": 1.0039381980895996, "logps/chosen": -183.63748168945312, "logps/rejected": -211.2511749267578, "loss": 0.5654, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6382626891136169, "rewards/margins": 0.8599988222122192, "rewards/rejected": -1.4982614517211914, "step": 3040 }, { "epoch": 2.44, "grad_norm": 7.879682540893555, "learning_rate": 4.691102942196905e-06, "logits/chosen": 0.9624242186546326, "logits/rejected": 1.0566548109054565, "logps/chosen": -197.49008178710938, "logps/rejected": -237.42349243164062, "loss": 0.5254, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.352699875831604, "rewards/margins": 0.8264394998550415, "rewards/rejected": -2.1791393756866455, "step": 3050 }, { "epoch": 2.448, "grad_norm": 8.244893074035645, "learning_rate": 4.687732836652935e-06, "logits/chosen": 1.102777361869812, "logits/rejected": 0.9323251843452454, "logps/chosen": -211.6886749267578, "logps/rejected": -223.81761169433594, "loss": 0.5689, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -1.4250367879867554, "rewards/margins": 0.8165454864501953, "rewards/rejected": -2.2415823936462402, "step": 3060 }, { "epoch": 2.456, "grad_norm": 10.064545631408691, "learning_rate": 4.684345670728835e-06, "logits/chosen": 0.9085485339164734, "logits/rejected": 0.9594566226005554, "logps/chosen": -216.97305297851562, "logps/rejected": -245.9871368408203, "loss": 0.5879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5272258520126343, "rewards/margins": 0.6840270161628723, "rewards/rejected": -2.2112529277801514, "step": 3070 }, { "epoch": 2.464, "grad_norm": 5.9822773933410645, "learning_rate": 4.680941470838405e-06, "logits/chosen": 0.9992641806602478, "logits/rejected": 0.9386343955993652, "logps/chosen": -211.3744659423828, "logps/rejected": -215.39097595214844, "loss": 0.5522, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8485382199287415, "rewards/margins": 0.7580586075782776, "rewards/rejected": -1.606596827507019, "step": 3080 }, { "epoch": 2.472, "grad_norm": 9.379515647888184, "learning_rate": 4.67752026352828e-06, "logits/chosen": 0.9449491500854492, "logits/rejected": 0.8973454833030701, "logps/chosen": -229.10166931152344, "logps/rejected": -235.14395141601562, "loss": 0.5425, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0061733722686768, "rewards/margins": 0.7214813828468323, "rewards/rejected": -1.7276546955108643, "step": 3090 }, { "epoch": 2.48, "grad_norm": 7.775414943695068, "learning_rate": 4.674082075477724e-06, "logits/chosen": 0.9458308219909668, "logits/rejected": 0.9328064322471619, "logps/chosen": -194.5631561279297, "logps/rejected": -212.3477325439453, "loss": 0.5018, "rewards/accuracies": 0.75, "rewards/chosen": -0.9718823432922363, "rewards/margins": 0.8911674618721008, "rewards/rejected": -1.8630497455596924, "step": 3100 }, { "epoch": 2.488, "grad_norm": 8.869393348693848, "learning_rate": 4.670626933498415e-06, "logits/chosen": 1.0055527687072754, "logits/rejected": 0.8751175999641418, "logps/chosen": -205.0562744140625, "logps/rejected": -214.41934204101562, "loss": 0.5698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.790256679058075, "rewards/margins": 0.7495672106742859, "rewards/rejected": -1.5398238897323608, "step": 3110 }, { "epoch": 2.496, "grad_norm": 7.518993854522705, "learning_rate": 4.667154864534245e-06, "logits/chosen": 0.9388038516044617, "logits/rejected": 0.9028251767158508, "logps/chosen": -189.31138610839844, "logps/rejected": -246.3251495361328, "loss": 0.4711, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7512499690055847, "rewards/margins": 0.9976223111152649, "rewards/rejected": -1.7488723993301392, "step": 3120 }, { "epoch": 2.504, "grad_norm": 5.921288967132568, "learning_rate": 4.663665895661107e-06, "logits/chosen": 0.9502741694450378, "logits/rejected": 0.9423877596855164, "logps/chosen": -201.31776428222656, "logps/rejected": -228.7478790283203, "loss": 0.6146, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0320862531661987, "rewards/margins": 0.730940580368042, "rewards/rejected": -1.7630270719528198, "step": 3130 }, { "epoch": 2.512, "grad_norm": 7.64976692199707, "learning_rate": 4.6601600540866794e-06, "logits/chosen": 0.9315633773803711, "logits/rejected": 0.9705595374107361, "logps/chosen": -207.4699249267578, "logps/rejected": -223.7519989013672, "loss": 0.5229, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7369431257247925, "rewards/margins": 0.7962962985038757, "rewards/rejected": -1.5332393646240234, "step": 3140 }, { "epoch": 2.52, "grad_norm": 10.3628511428833, "learning_rate": 4.65663736715022e-06, "logits/chosen": 1.0306199789047241, "logits/rejected": 0.9668065309524536, "logps/chosen": -216.719970703125, "logps/rejected": -218.1455535888672, "loss": 0.5989, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6501615047454834, "rewards/margins": 0.6095995306968689, "rewards/rejected": -1.259761095046997, "step": 3150 }, { "epoch": 2.528, "grad_norm": 8.640954971313477, "learning_rate": 4.653097862322347e-06, "logits/chosen": 0.9273595809936523, "logits/rejected": 0.9403362274169922, "logps/chosen": -213.16714477539062, "logps/rejected": -236.9595489501953, "loss": 0.5293, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7404825687408447, "rewards/margins": 0.8145327568054199, "rewards/rejected": -1.5550154447555542, "step": 3160 }, { "epoch": 2.536, "grad_norm": 7.425879001617432, "learning_rate": 4.6495415672048336e-06, "logits/chosen": 0.8542930483818054, "logits/rejected": 0.880113422870636, "logps/chosen": -213.308837890625, "logps/rejected": -242.27099609375, "loss": 0.4891, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.4495139718055725, "rewards/margins": 0.8645074963569641, "rewards/rejected": -1.3140214681625366, "step": 3170 }, { "epoch": 2.544, "grad_norm": 6.936334609985352, "learning_rate": 4.645968509530381e-06, "logits/chosen": 0.985787034034729, "logits/rejected": 0.970470130443573, "logps/chosen": -205.81802368164062, "logps/rejected": -199.0405731201172, "loss": 0.5015, "rewards/accuracies": 0.75, "rewards/chosen": -0.6486034393310547, "rewards/margins": 0.8106253743171692, "rewards/rejected": -1.4592288732528687, "step": 3180 }, { "epoch": 2.552, "grad_norm": 9.339189529418945, "learning_rate": 4.642378717162411e-06, "logits/chosen": 0.9108757972717285, "logits/rejected": 0.8992031216621399, "logps/chosen": -212.0107879638672, "logps/rejected": -223.214599609375, "loss": 0.5662, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.43249455094337463, "rewards/margins": 0.7586955428123474, "rewards/rejected": -1.1911901235580444, "step": 3190 }, { "epoch": 2.56, "grad_norm": 10.31650161743164, "learning_rate": 4.638772218094847e-06, "logits/chosen": 0.8744648098945618, "logits/rejected": 0.9359402060508728, "logps/chosen": -211.1424560546875, "logps/rejected": -240.16343688964844, "loss": 0.5185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9992405772209167, "rewards/margins": 0.8215592503547668, "rewards/rejected": -1.820799708366394, "step": 3200 }, { "epoch": 2.568, "grad_norm": 9.806978225708008, "learning_rate": 4.635149040451891e-06, "logits/chosen": 1.0247632265090942, "logits/rejected": 0.971980094909668, "logps/chosen": -204.77330017089844, "logps/rejected": -208.4816131591797, "loss": 0.5257, "rewards/accuracies": 0.75, "rewards/chosen": -0.39921775460243225, "rewards/margins": 0.7709429860115051, "rewards/rejected": -1.1701607704162598, "step": 3210 }, { "epoch": 2.576, "grad_norm": 7.172482490539551, "learning_rate": 4.631509212487812e-06, "logits/chosen": 0.9954729080200195, "logits/rejected": 0.9880996942520142, "logps/chosen": -210.8682098388672, "logps/rejected": -213.47573852539062, "loss": 0.5339, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.9985690116882324, "rewards/margins": 0.739185094833374, "rewards/rejected": -1.7377541065216064, "step": 3220 }, { "epoch": 2.584, "grad_norm": 11.274581909179688, "learning_rate": 4.627852762586718e-06, "logits/chosen": 0.9896817207336426, "logits/rejected": 1.021188735961914, "logps/chosen": -189.18344116210938, "logps/rejected": -215.19497680664062, "loss": 0.5474, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4847024381160736, "rewards/margins": 0.7209543585777283, "rewards/rejected": -1.205656886100769, "step": 3230 }, { "epoch": 2.592, "grad_norm": 5.447055816650391, "learning_rate": 4.624179719262342e-06, "logits/chosen": 1.0467568635940552, "logits/rejected": 1.0570539236068726, "logps/chosen": -201.77415466308594, "logps/rejected": -206.077392578125, "loss": 0.4613, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5657867789268494, "rewards/margins": 0.8899927139282227, "rewards/rejected": -1.4557795524597168, "step": 3240 }, { "epoch": 2.6, "grad_norm": 7.587062358856201, "learning_rate": 4.62049011115781e-06, "logits/chosen": 0.9699912071228027, "logits/rejected": 1.017883539199829, "logps/chosen": -205.2608642578125, "logps/rejected": -220.08792114257812, "loss": 0.4848, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.7234296798706055, "rewards/margins": 0.8834452033042908, "rewards/rejected": -1.6068748235702515, "step": 3250 }, { "epoch": 2.608, "grad_norm": 6.428370475769043, "learning_rate": 4.616783967045432e-06, "logits/chosen": 1.0170371532440186, "logits/rejected": 0.9278216361999512, "logps/chosen": -205.9370574951172, "logps/rejected": -226.574462890625, "loss": 0.443, "rewards/accuracies": 0.8125, "rewards/chosen": -0.777590274810791, "rewards/margins": 0.9760915637016296, "rewards/rejected": -1.753682017326355, "step": 3260 }, { "epoch": 2.616, "grad_norm": 9.805668830871582, "learning_rate": 4.6130613158264605e-06, "logits/chosen": 0.8976919054985046, "logits/rejected": 0.9280143976211548, "logps/chosen": -206.73876953125, "logps/rejected": -206.44371032714844, "loss": 0.5154, "rewards/accuracies": 0.75, "rewards/chosen": -0.8372632265090942, "rewards/margins": 0.7562478184700012, "rewards/rejected": -1.5935109853744507, "step": 3270 }, { "epoch": 2.624, "grad_norm": 8.653728485107422, "learning_rate": 4.6093221865308795e-06, "logits/chosen": 0.9815517663955688, "logits/rejected": 0.926008403301239, "logps/chosen": -196.36875915527344, "logps/rejected": -215.28271484375, "loss": 0.5294, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.9871358871459961, "rewards/margins": 0.8871564865112305, "rewards/rejected": -1.874292254447937, "step": 3280 }, { "epoch": 2.632, "grad_norm": 4.758111953735352, "learning_rate": 4.605566608317169e-06, "logits/chosen": 0.8787722587585449, "logits/rejected": 0.8375174403190613, "logps/chosen": -190.63404846191406, "logps/rejected": -210.22802734375, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": -0.5020423531532288, "rewards/margins": 0.8334178328514099, "rewards/rejected": -1.3354600667953491, "step": 3290 }, { "epoch": 2.64, "grad_norm": 7.596653938293457, "learning_rate": 4.601794610472083e-06, "logits/chosen": 0.9046363830566406, "logits/rejected": 0.8183378577232361, "logps/chosen": -215.7941436767578, "logps/rejected": -241.56838989257812, "loss": 0.4736, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9933833479881287, "rewards/margins": 0.9869853854179382, "rewards/rejected": -1.9803688526153564, "step": 3300 }, { "epoch": 2.648, "grad_norm": 12.559988975524902, "learning_rate": 4.598006222410419e-06, "logits/chosen": 0.9887520670890808, "logits/rejected": 1.0198968648910522, "logps/chosen": -188.9550018310547, "logps/rejected": -219.6782684326172, "loss": 0.4814, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.820119321346283, "rewards/margins": 0.9246525168418884, "rewards/rejected": -1.7447718381881714, "step": 3310 }, { "epoch": 2.656, "grad_norm": 5.864845275878906, "learning_rate": 4.594201473674788e-06, "logits/chosen": 0.9294392466545105, "logits/rejected": 0.796425461769104, "logps/chosen": -197.6046905517578, "logps/rejected": -234.70787048339844, "loss": 0.4557, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.0228973627090454, "rewards/margins": 0.9283467531204224, "rewards/rejected": -1.9512439966201782, "step": 3320 }, { "epoch": 2.664, "grad_norm": 8.346921920776367, "learning_rate": 4.590380393935383e-06, "logits/chosen": 0.8741597533226013, "logits/rejected": 0.836887001991272, "logps/chosen": -214.54185485839844, "logps/rejected": -223.0530242919922, "loss": 0.5895, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2219938039779663, "rewards/margins": 0.6594648361206055, "rewards/rejected": -1.8814586400985718, "step": 3330 }, { "epoch": 2.672, "grad_norm": 7.899364948272705, "learning_rate": 4.5865430129897536e-06, "logits/chosen": 0.857835590839386, "logits/rejected": 0.8607529997825623, "logps/chosen": -210.548095703125, "logps/rejected": -236.50767517089844, "loss": 0.5514, "rewards/accuracies": 0.75, "rewards/chosen": -1.091759443283081, "rewards/margins": 0.8923540115356445, "rewards/rejected": -1.984113335609436, "step": 3340 }, { "epoch": 2.68, "grad_norm": 6.950599193572998, "learning_rate": 4.5826893607625665e-06, "logits/chosen": 0.8608657717704773, "logits/rejected": 0.9021877646446228, "logps/chosen": -219.5095672607422, "logps/rejected": -248.8926239013672, "loss": 0.4884, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.17706298828125, "rewards/margins": 1.054115653038025, "rewards/rejected": -2.2311787605285645, "step": 3350 }, { "epoch": 2.6879999999999997, "grad_norm": 6.3925065994262695, "learning_rate": 4.578819467305375e-06, "logits/chosen": 0.8635089993476868, "logits/rejected": 0.9291839599609375, "logps/chosen": -208.9047393798828, "logps/rejected": -242.42764282226562, "loss": 0.5177, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8883923888206482, "rewards/margins": 0.8316335678100586, "rewards/rejected": -1.7200260162353516, "step": 3360 }, { "epoch": 2.6959999999999997, "grad_norm": 7.390092372894287, "learning_rate": 4.5749333627963886e-06, "logits/chosen": 1.025161623954773, "logits/rejected": 0.9390385746955872, "logps/chosen": -192.0789031982422, "logps/rejected": -208.17300415039062, "loss": 0.6558, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7664794325828552, "rewards/margins": 0.47816377878189087, "rewards/rejected": -1.244643211364746, "step": 3370 }, { "epoch": 2.7039999999999997, "grad_norm": 6.525977611541748, "learning_rate": 4.571031077540227e-06, "logits/chosen": 0.8534032702445984, "logits/rejected": 0.847669780254364, "logps/chosen": -206.7920684814453, "logps/rejected": -226.60240173339844, "loss": 0.5259, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8559421896934509, "rewards/margins": 0.7720093727111816, "rewards/rejected": -1.6279516220092773, "step": 3380 }, { "epoch": 2.7119999999999997, "grad_norm": 11.586974143981934, "learning_rate": 4.567112641967697e-06, "logits/chosen": 0.849672794342041, "logits/rejected": 0.7306337356567383, "logps/chosen": -230.2295379638672, "logps/rejected": -227.7930908203125, "loss": 0.5625, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9319648146629333, "rewards/margins": 0.6750860214233398, "rewards/rejected": -1.607050895690918, "step": 3390 }, { "epoch": 2.7199999999999998, "grad_norm": 13.69837474822998, "learning_rate": 4.563178086635546e-06, "logits/chosen": 0.9492778182029724, "logits/rejected": 0.9037938117980957, "logps/chosen": -210.29478454589844, "logps/rejected": -216.89903259277344, "loss": 0.5038, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.8451706171035767, "rewards/margins": 0.8125400543212891, "rewards/rejected": -1.6577106714248657, "step": 3400 }, { "epoch": 2.7279999999999998, "grad_norm": 4.132405757904053, "learning_rate": 4.559227442226226e-06, "logits/chosen": 0.9789943695068359, "logits/rejected": 0.9167888760566711, "logps/chosen": -191.22715759277344, "logps/rejected": -232.78164672851562, "loss": 0.4735, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.7555204629898071, "rewards/margins": 0.9841014742851257, "rewards/rejected": -1.7396221160888672, "step": 3410 }, { "epoch": 2.7359999999999998, "grad_norm": 6.608091831207275, "learning_rate": 4.555260739547657e-06, "logits/chosen": 1.0244665145874023, "logits/rejected": 0.8857207298278809, "logps/chosen": -203.6723175048828, "logps/rejected": -212.9291534423828, "loss": 0.5356, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6175267100334167, "rewards/margins": 0.7223233580589294, "rewards/rejected": -1.3398501873016357, "step": 3420 }, { "epoch": 2.7439999999999998, "grad_norm": 9.938222885131836, "learning_rate": 4.551278009532981e-06, "logits/chosen": 1.0124415159225464, "logits/rejected": 0.9431573152542114, "logps/chosen": -207.00718688964844, "logps/rejected": -225.10806274414062, "loss": 0.5292, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5922741889953613, "rewards/margins": 0.7229223251342773, "rewards/rejected": -1.3151965141296387, "step": 3430 }, { "epoch": 2.752, "grad_norm": 6.640532970428467, "learning_rate": 4.5472792832403295e-06, "logits/chosen": 0.9901018142700195, "logits/rejected": 0.8438584208488464, "logps/chosen": -192.44761657714844, "logps/rejected": -208.34683227539062, "loss": 0.4769, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6061093211174011, "rewards/margins": 0.9215426445007324, "rewards/rejected": -1.5276520252227783, "step": 3440 }, { "epoch": 2.76, "grad_norm": 7.896131992340088, "learning_rate": 4.543264591852572e-06, "logits/chosen": 0.9401613473892212, "logits/rejected": 1.0364055633544922, "logps/chosen": -212.4904022216797, "logps/rejected": -249.1276092529297, "loss": 0.6105, "rewards/accuracies": 0.6500000357627869, "rewards/chosen": -0.9996709823608398, "rewards/margins": 0.6035251021385193, "rewards/rejected": -1.6031960248947144, "step": 3450 }, { "epoch": 2.768, "grad_norm": 14.408479690551758, "learning_rate": 4.539233966677078e-06, "logits/chosen": 0.9975506067276001, "logits/rejected": 1.0087366104125977, "logps/chosen": -220.4411163330078, "logps/rejected": -230.56089782714844, "loss": 0.5005, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.6962399482727051, "rewards/margins": 0.7583475112915039, "rewards/rejected": -1.4545873403549194, "step": 3460 }, { "epoch": 2.776, "grad_norm": 9.091312408447266, "learning_rate": 4.535187439145473e-06, "logits/chosen": 1.004744529724121, "logits/rejected": 0.956881046295166, "logps/chosen": -208.3164520263672, "logps/rejected": -249.3081512451172, "loss": 0.4817, "rewards/accuracies": 0.75, "rewards/chosen": -0.8332870602607727, "rewards/margins": 0.9917869567871094, "rewards/rejected": -1.8250740766525269, "step": 3470 }, { "epoch": 2.784, "grad_norm": 9.282960891723633, "learning_rate": 4.531125040813392e-06, "logits/chosen": 0.9015275835990906, "logits/rejected": 0.8595914244651794, "logps/chosen": -206.90184020996094, "logps/rejected": -241.4942626953125, "loss": 0.4265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8733939528465271, "rewards/margins": 1.2572206258773804, "rewards/rejected": -2.1306145191192627, "step": 3480 }, { "epoch": 2.792, "grad_norm": 7.599782943725586, "learning_rate": 4.527046803360232e-06, "logits/chosen": 1.0097182989120483, "logits/rejected": 0.9868324398994446, "logps/chosen": -197.34820556640625, "logps/rejected": -220.2039337158203, "loss": 0.4665, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.28118035197257996, "rewards/margins": 0.9197006225585938, "rewards/rejected": -1.200881004333496, "step": 3490 }, { "epoch": 2.8, "grad_norm": 10.817337036132812, "learning_rate": 4.522952758588909e-06, "logits/chosen": 1.014154314994812, "logits/rejected": 1.0231674909591675, "logps/chosen": -199.9979705810547, "logps/rejected": -226.4349822998047, "loss": 0.4632, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3697163462638855, "rewards/margins": 0.8839020133018494, "rewards/rejected": -1.2536184787750244, "step": 3500 }, { "epoch": 2.808, "grad_norm": 10.987738609313965, "learning_rate": 4.518842938425606e-06, "logits/chosen": 1.0434938669204712, "logits/rejected": 0.9931101202964783, "logps/chosen": -193.0313262939453, "logps/rejected": -202.6310272216797, "loss": 0.493, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3265318274497986, "rewards/margins": 0.9020771980285645, "rewards/rejected": -1.2286089658737183, "step": 3510 }, { "epoch": 2.816, "grad_norm": 6.466224193572998, "learning_rate": 4.514717374919525e-06, "logits/chosen": 1.0711123943328857, "logits/rejected": 0.9438812136650085, "logps/chosen": -204.62571716308594, "logps/rejected": -227.7462921142578, "loss": 0.4658, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47518759965896606, "rewards/margins": 1.0582878589630127, "rewards/rejected": -1.5334755182266235, "step": 3520 }, { "epoch": 2.824, "grad_norm": 9.006815910339355, "learning_rate": 4.510576100242642e-06, "logits/chosen": 1.0240957736968994, "logits/rejected": 0.9844247698783875, "logps/chosen": -198.17991638183594, "logps/rejected": -212.7672576904297, "loss": 0.5084, "rewards/accuracies": 0.75, "rewards/chosen": -0.45504727959632874, "rewards/margins": 0.9179355502128601, "rewards/rejected": -1.3729829788208008, "step": 3530 }, { "epoch": 2.832, "grad_norm": 7.647313594818115, "learning_rate": 4.506419146689445e-06, "logits/chosen": 1.0858877897262573, "logits/rejected": 0.9952969551086426, "logps/chosen": -204.3494110107422, "logps/rejected": -224.9398651123047, "loss": 0.4831, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.022721266373991966, "rewards/margins": 1.0130819082260132, "rewards/rejected": -1.035803198814392, "step": 3540 }, { "epoch": 2.84, "grad_norm": 10.288926124572754, "learning_rate": 4.502246546676697e-06, "logits/chosen": 1.0694109201431274, "logits/rejected": 1.054747462272644, "logps/chosen": -194.4237518310547, "logps/rejected": -223.91714477539062, "loss": 0.5075, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7783587574958801, "rewards/margins": 0.9766228795051575, "rewards/rejected": -1.7549816370010376, "step": 3550 }, { "epoch": 2.848, "grad_norm": 7.374194145202637, "learning_rate": 4.498058332743168e-06, "logits/chosen": 0.7898425459861755, "logits/rejected": 0.7930797934532166, "logps/chosen": -203.2722625732422, "logps/rejected": -237.77330017089844, "loss": 0.5055, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.446158230304718, "rewards/margins": 0.8784043192863464, "rewards/rejected": -1.3245625495910645, "step": 3560 }, { "epoch": 2.856, "grad_norm": 9.789468765258789, "learning_rate": 4.493854537549393e-06, "logits/chosen": 1.0634657144546509, "logits/rejected": 1.1062793731689453, "logps/chosen": -202.4827117919922, "logps/rejected": -214.6761474609375, "loss": 0.49, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.2086171656847, "rewards/margins": 0.8339020609855652, "rewards/rejected": -1.042519211769104, "step": 3570 }, { "epoch": 2.864, "grad_norm": 6.90875244140625, "learning_rate": 4.48963519387741e-06, "logits/chosen": 1.0072237253189087, "logits/rejected": 0.9182701110839844, "logps/chosen": -202.2296905517578, "logps/rejected": -228.3877410888672, "loss": 0.5129, "rewards/accuracies": 0.75, "rewards/chosen": -0.5820229649543762, "rewards/margins": 0.7529212832450867, "rewards/rejected": -1.334944248199463, "step": 3580 }, { "epoch": 2.872, "grad_norm": 7.405145645141602, "learning_rate": 4.485400334630511e-06, "logits/chosen": 1.0315786600112915, "logits/rejected": 1.0412896871566772, "logps/chosen": -208.01670837402344, "logps/rejected": -233.53164672851562, "loss": 0.4206, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.788216233253479, "rewards/margins": 1.1600385904312134, "rewards/rejected": -1.9482548236846924, "step": 3590 }, { "epoch": 2.88, "grad_norm": 8.326019287109375, "learning_rate": 4.4811499928329775e-06, "logits/chosen": 0.8572309613227844, "logits/rejected": 0.8392788171768188, "logps/chosen": -199.2486572265625, "logps/rejected": -243.4100799560547, "loss": 0.5167, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2861894369125366, "rewards/margins": 0.9100348353385925, "rewards/rejected": -2.1962242126464844, "step": 3600 }, { "epoch": 2.888, "grad_norm": 5.889350414276123, "learning_rate": 4.4768842016298275e-06, "logits/chosen": 1.1370327472686768, "logits/rejected": 0.966783344745636, "logps/chosen": -203.1497039794922, "logps/rejected": -216.1260223388672, "loss": 0.5539, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.37005722522735596, "rewards/margins": 0.8371629118919373, "rewards/rejected": -1.2072200775146484, "step": 3610 }, { "epoch": 2.896, "grad_norm": 5.19056510925293, "learning_rate": 4.472602994286559e-06, "logits/chosen": 1.003334641456604, "logits/rejected": 0.9512959718704224, "logps/chosen": -205.01858520507812, "logps/rejected": -227.35317993164062, "loss": 0.629, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9021452069282532, "rewards/margins": 0.6625142097473145, "rewards/rejected": -1.5646594762802124, "step": 3620 }, { "epoch": 2.904, "grad_norm": 8.15114974975586, "learning_rate": 4.468306404188887e-06, "logits/chosen": 0.9551017880439758, "logits/rejected": 0.9154602289199829, "logps/chosen": -198.5496368408203, "logps/rejected": -231.066650390625, "loss": 0.5031, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8837252855300903, "rewards/margins": 0.8877946734428406, "rewards/rejected": -1.7715200185775757, "step": 3630 }, { "epoch": 2.912, "grad_norm": 6.599959373474121, "learning_rate": 4.463994464842485e-06, "logits/chosen": 1.1515620946884155, "logits/rejected": 1.0820724964141846, "logps/chosen": -196.33172607421875, "logps/rejected": -196.94976806640625, "loss": 0.5572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.42906999588012695, "rewards/margins": 0.6470549702644348, "rewards/rejected": -1.0761250257492065, "step": 3640 }, { "epoch": 2.92, "grad_norm": 11.319299697875977, "learning_rate": 4.45966720987272e-06, "logits/chosen": 0.982757031917572, "logits/rejected": 0.9923986792564392, "logps/chosen": -221.4713134765625, "logps/rejected": -228.6186065673828, "loss": 0.5141, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6713235974311829, "rewards/margins": 0.912000834941864, "rewards/rejected": -1.5833243131637573, "step": 3650 }, { "epoch": 2.928, "grad_norm": 6.565591335296631, "learning_rate": 4.455324673024396e-06, "logits/chosen": 1.0351612567901611, "logits/rejected": 0.9620069861412048, "logps/chosen": -205.81106567382812, "logps/rejected": -224.9451446533203, "loss": 0.466, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.5482096076011658, "rewards/margins": 0.8861121535301208, "rewards/rejected": -1.434321641921997, "step": 3660 }, { "epoch": 2.936, "grad_norm": 11.906221389770508, "learning_rate": 4.45096688816149e-06, "logits/chosen": 0.9011246562004089, "logits/rejected": 0.9497923254966736, "logps/chosen": -201.8857421875, "logps/rejected": -207.8782501220703, "loss": 0.6158, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.971345067024231, "rewards/margins": 0.47949621081352234, "rewards/rejected": -1.4508411884307861, "step": 3670 }, { "epoch": 2.944, "grad_norm": 4.588380336761475, "learning_rate": 4.4465938892668815e-06, "logits/chosen": 1.0076476335525513, "logits/rejected": 1.000885009765625, "logps/chosen": -192.5887451171875, "logps/rejected": -214.83604431152344, "loss": 0.4861, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.916580319404602, "rewards/margins": 0.8697047233581543, "rewards/rejected": -1.7862850427627563, "step": 3680 }, { "epoch": 2.952, "grad_norm": 7.142757415771484, "learning_rate": 4.442205710442095e-06, "logits/chosen": 1.0882282257080078, "logits/rejected": 1.0283339023590088, "logps/chosen": -202.76055908203125, "logps/rejected": -213.7969512939453, "loss": 0.5747, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -1.2140840291976929, "rewards/margins": 0.6454777121543884, "rewards/rejected": -1.8595619201660156, "step": 3690 }, { "epoch": 2.96, "grad_norm": 7.612401962280273, "learning_rate": 4.43780238590703e-06, "logits/chosen": 1.0234251022338867, "logits/rejected": 1.0618809461593628, "logps/chosen": -208.89512634277344, "logps/rejected": -240.69761657714844, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": -1.4141120910644531, "rewards/margins": 1.018404245376587, "rewards/rejected": -2.432516574859619, "step": 3700 }, { "epoch": 2.968, "grad_norm": 7.064319133758545, "learning_rate": 4.433383949999695e-06, "logits/chosen": 0.9859750866889954, "logits/rejected": 0.86090087890625, "logps/chosen": -195.9196319580078, "logps/rejected": -218.8886260986328, "loss": 0.4801, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.1885597705841064, "rewards/margins": 0.8579304814338684, "rewards/rejected": -2.04649019241333, "step": 3710 }, { "epoch": 2.976, "grad_norm": 6.925068378448486, "learning_rate": 4.428950437175944e-06, "logits/chosen": 0.9019951224327087, "logits/rejected": 0.8458378911018372, "logps/chosen": -232.447265625, "logps/rejected": -239.73648071289062, "loss": 0.5006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1259241104125977, "rewards/margins": 0.8236715197563171, "rewards/rejected": -1.9495956897735596, "step": 3720 }, { "epoch": 2.984, "grad_norm": 9.741296768188477, "learning_rate": 4.4245018820091975e-06, "logits/chosen": 0.8749257326126099, "logits/rejected": 0.8770486116409302, "logps/chosen": -232.95010375976562, "logps/rejected": -250.5192413330078, "loss": 0.5261, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2567474842071533, "rewards/margins": 1.0741055011749268, "rewards/rejected": -2.33085298538208, "step": 3730 }, { "epoch": 2.992, "grad_norm": 8.8729887008667, "learning_rate": 4.420038319190184e-06, "logits/chosen": 0.8761041760444641, "logits/rejected": 0.9457497000694275, "logps/chosen": -199.91307067871094, "logps/rejected": -205.45762634277344, "loss": 0.4907, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8159521222114563, "rewards/margins": 0.788399338722229, "rewards/rejected": -1.6043514013290405, "step": 3740 }, { "epoch": 3.0, "grad_norm": 6.5494585037231445, "learning_rate": 4.415559783526661e-06, "logits/chosen": 0.9953758120536804, "logits/rejected": 0.9608160257339478, "logps/chosen": -203.28005981445312, "logps/rejected": -234.4387664794922, "loss": 0.5245, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4376920461654663, "rewards/margins": 0.7289747595787048, "rewards/rejected": -2.1666667461395264, "step": 3750 }, { "epoch": 3.008, "grad_norm": 7.342889785766602, "learning_rate": 4.411066309943151e-06, "logits/chosen": 0.911847710609436, "logits/rejected": 0.8547344207763672, "logps/chosen": -229.86656188964844, "logps/rejected": -246.57359313964844, "loss": 0.39, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.783231258392334, "rewards/margins": 1.1452419757843018, "rewards/rejected": -1.9284733533859253, "step": 3760 }, { "epoch": 3.016, "grad_norm": 6.122635364532471, "learning_rate": 4.406557933480665e-06, "logits/chosen": 0.8730871081352234, "logits/rejected": 0.7891945242881775, "logps/chosen": -230.88916015625, "logps/rejected": -229.0500946044922, "loss": 0.4251, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.818146824836731, "rewards/margins": 1.2087650299072266, "rewards/rejected": -2.026911973953247, "step": 3770 }, { "epoch": 3.024, "grad_norm": 7.4118452072143555, "learning_rate": 4.402034689296425e-06, "logits/chosen": 0.9826356768608093, "logits/rejected": 0.8032142519950867, "logps/chosen": -197.12989807128906, "logps/rejected": -208.55838012695312, "loss": 0.3441, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -0.5323260426521301, "rewards/margins": 1.3038690090179443, "rewards/rejected": -1.8361949920654297, "step": 3780 }, { "epoch": 3.032, "grad_norm": 9.49242877960205, "learning_rate": 4.397496612663599e-06, "logits/chosen": 1.0212730169296265, "logits/rejected": 0.8553698658943176, "logps/chosen": -213.5210418701172, "logps/rejected": -232.46238708496094, "loss": 0.427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1634958982467651, "rewards/margins": 1.099713921546936, "rewards/rejected": -2.263209819793701, "step": 3790 }, { "epoch": 3.04, "grad_norm": 7.7141571044921875, "learning_rate": 4.392943738971021e-06, "logits/chosen": 0.9747546315193176, "logits/rejected": 0.974277675151825, "logps/chosen": -211.902099609375, "logps/rejected": -224.82958984375, "loss": 0.3788, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9616455435752869, "rewards/margins": 1.1425886154174805, "rewards/rejected": -2.104234457015991, "step": 3800 }, { "epoch": 3.048, "grad_norm": 6.494782447814941, "learning_rate": 4.388376103722914e-06, "logits/chosen": 0.9183564186096191, "logits/rejected": 0.8766688704490662, "logps/chosen": -192.46058654785156, "logps/rejected": -252.86399841308594, "loss": 0.4398, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0373767614364624, "rewards/margins": 1.24557363986969, "rewards/rejected": -2.2829504013061523, "step": 3810 }, { "epoch": 3.056, "grad_norm": 12.855714797973633, "learning_rate": 4.383793742538615e-06, "logits/chosen": 0.9224297404289246, "logits/rejected": 0.8424360156059265, "logps/chosen": -188.82408142089844, "logps/rejected": -220.83970642089844, "loss": 0.5444, "rewards/accuracies": 0.75, "rewards/chosen": -0.5318169593811035, "rewards/margins": 0.8576032519340515, "rewards/rejected": -1.3894203901290894, "step": 3820 }, { "epoch": 3.064, "grad_norm": 6.39134407043457, "learning_rate": 4.3791966911522985e-06, "logits/chosen": 0.9185449481010437, "logits/rejected": 0.9570236206054688, "logps/chosen": -198.7986602783203, "logps/rejected": -196.16566467285156, "loss": 0.5775, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8139546513557434, "rewards/margins": 0.7184473872184753, "rewards/rejected": -1.5324020385742188, "step": 3830 }, { "epoch": 3.072, "grad_norm": 7.5484185218811035, "learning_rate": 4.374584985412692e-06, "logits/chosen": 1.0025938749313354, "logits/rejected": 0.965260922908783, "logps/chosen": -184.16038513183594, "logps/rejected": -217.509765625, "loss": 0.4271, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.8147037625312805, "rewards/margins": 1.0747199058532715, "rewards/rejected": -1.8894237279891968, "step": 3840 }, { "epoch": 3.08, "grad_norm": 10.4950532913208, "learning_rate": 4.369958661282805e-06, "logits/chosen": 0.9348158240318298, "logits/rejected": 0.8544149398803711, "logps/chosen": -199.47950744628906, "logps/rejected": -233.7744598388672, "loss": 0.4727, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.8798511624336243, "rewards/margins": 1.0782015323638916, "rewards/rejected": -1.958052635192871, "step": 3850 }, { "epoch": 3.088, "grad_norm": 10.140469551086426, "learning_rate": 4.365317754839643e-06, "logits/chosen": 0.8552842140197754, "logits/rejected": 0.8293384909629822, "logps/chosen": -229.6714324951172, "logps/rejected": -237.9065399169922, "loss": 0.4637, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.158323884010315, "rewards/margins": 1.1132620573043823, "rewards/rejected": -2.2715859413146973, "step": 3860 }, { "epoch": 3.096, "grad_norm": 10.660223960876465, "learning_rate": 4.360662302273926e-06, "logits/chosen": 0.9855637550354004, "logits/rejected": 0.8762430548667908, "logps/chosen": -197.29660034179688, "logps/rejected": -210.25186157226562, "loss": 0.4684, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.9172877669334412, "rewards/margins": 0.9678840637207031, "rewards/rejected": -1.885171890258789, "step": 3870 }, { "epoch": 3.104, "grad_norm": 7.666329383850098, "learning_rate": 4.355992339889806e-06, "logits/chosen": 0.946201741695404, "logits/rejected": 0.8815839886665344, "logps/chosen": -197.9135284423828, "logps/rejected": -215.1569366455078, "loss": 0.4383, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3328570127487183, "rewards/margins": 1.0127038955688477, "rewards/rejected": -2.3455610275268555, "step": 3880 }, { "epoch": 3.112, "grad_norm": 14.594318389892578, "learning_rate": 4.3513079041045925e-06, "logits/chosen": 1.000898003578186, "logits/rejected": 1.0313762426376343, "logps/chosen": -207.04624938964844, "logps/rejected": -221.084228515625, "loss": 0.4328, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.088545560836792, "rewards/margins": 0.945988118648529, "rewards/rejected": -2.0345335006713867, "step": 3890 }, { "epoch": 3.12, "grad_norm": 7.375593185424805, "learning_rate": 4.3466090314484526e-06, "logits/chosen": 0.8702503442764282, "logits/rejected": 0.9601964950561523, "logps/chosen": -216.6647491455078, "logps/rejected": -237.7848663330078, "loss": 0.429, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.0842136144638062, "rewards/margins": 1.1683653593063354, "rewards/rejected": -2.2525792121887207, "step": 3900 }, { "epoch": 3.128, "grad_norm": 5.803628921508789, "learning_rate": 4.341895758564141e-06, "logits/chosen": 0.9758888483047485, "logits/rejected": 0.9623239636421204, "logps/chosen": -195.2889404296875, "logps/rejected": -224.08889770507812, "loss": 0.4119, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9340219497680664, "rewards/margins": 1.1046533584594727, "rewards/rejected": -2.038675308227539, "step": 3910 }, { "epoch": 3.136, "grad_norm": 13.526612281799316, "learning_rate": 4.3371681222067065e-06, "logits/chosen": 0.9235677719116211, "logits/rejected": 0.8304759860038757, "logps/chosen": -211.0590362548828, "logps/rejected": -219.9079132080078, "loss": 0.4242, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.7414934039115906, "rewards/margins": 1.2431910037994385, "rewards/rejected": -1.9846843481063843, "step": 3920 }, { "epoch": 3.144, "grad_norm": 7.47207498550415, "learning_rate": 4.332426159243206e-06, "logits/chosen": 0.9469828009605408, "logits/rejected": 0.8730935454368591, "logps/chosen": -202.57708740234375, "logps/rejected": -234.28994750976562, "loss": 0.3326, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -0.8033515214920044, "rewards/margins": 1.5122345685958862, "rewards/rejected": -2.3155860900878906, "step": 3930 }, { "epoch": 3.152, "grad_norm": 10.290471076965332, "learning_rate": 4.327669906652421e-06, "logits/chosen": 0.9347248077392578, "logits/rejected": 0.9168369174003601, "logps/chosen": -205.8367156982422, "logps/rejected": -241.17527770996094, "loss": 0.4048, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.078516960144043, "rewards/margins": 1.3800959587097168, "rewards/rejected": -2.4586129188537598, "step": 3940 }, { "epoch": 3.16, "grad_norm": 10.458824157714844, "learning_rate": 4.322899401524563e-06, "logits/chosen": 0.8786640167236328, "logits/rejected": 0.726817786693573, "logps/chosen": -206.63316345214844, "logps/rejected": -241.03616333007812, "loss": 0.417, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.183452844619751, "rewards/margins": 1.3412961959838867, "rewards/rejected": -2.5247490406036377, "step": 3950 }, { "epoch": 3.168, "grad_norm": 11.669900894165039, "learning_rate": 4.318114681060989e-06, "logits/chosen": 0.9407995343208313, "logits/rejected": 0.9570455551147461, "logps/chosen": -201.372802734375, "logps/rejected": -219.628662109375, "loss": 0.4584, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7632571458816528, "rewards/margins": 1.0025835037231445, "rewards/rejected": -1.765840768814087, "step": 3960 }, { "epoch": 3.176, "grad_norm": 12.478621482849121, "learning_rate": 4.313315782573914e-06, "logits/chosen": 0.8294739127159119, "logits/rejected": 0.8217317461967468, "logps/chosen": -222.06484985351562, "logps/rejected": -248.57861328125, "loss": 0.474, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5710614919662476, "rewards/margins": 1.08151113986969, "rewards/rejected": -2.6525726318359375, "step": 3970 }, { "epoch": 3.184, "grad_norm": 17.692319869995117, "learning_rate": 4.308502743486107e-06, "logits/chosen": 0.9584707617759705, "logits/rejected": 0.8594500422477722, "logps/chosen": -223.4303436279297, "logps/rejected": -239.1450653076172, "loss": 0.4356, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8885783553123474, "rewards/margins": 1.3923364877700806, "rewards/rejected": -2.280914783477783, "step": 3980 }, { "epoch": 3.192, "grad_norm": 7.768544673919678, "learning_rate": 4.303675601330618e-06, "logits/chosen": 0.8310710191726685, "logits/rejected": 0.8502975702285767, "logps/chosen": -201.81480407714844, "logps/rejected": -245.815673828125, "loss": 0.3742, "rewards/accuracies": 0.875, "rewards/chosen": -0.9760616421699524, "rewards/margins": 1.6183117628097534, "rewards/rejected": -2.5943734645843506, "step": 3990 }, { "epoch": 3.2, "grad_norm": 11.38988208770752, "learning_rate": 4.298834393750469e-06, "logits/chosen": 0.9716154336929321, "logits/rejected": 0.9836306571960449, "logps/chosen": -210.5645294189453, "logps/rejected": -230.19566345214844, "loss": 0.4073, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.0508493185043335, "rewards/margins": 1.2152557373046875, "rewards/rejected": -2.2661049365997314, "step": 4000 }, { "epoch": 3.208, "grad_norm": 8.679203987121582, "learning_rate": 4.2939791584983695e-06, "logits/chosen": 0.9018089175224304, "logits/rejected": 0.8700457811355591, "logps/chosen": -199.3931427001953, "logps/rejected": -236.6988983154297, "loss": 0.3966, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3499828577041626, "rewards/margins": 1.516105055809021, "rewards/rejected": -2.8660879135131836, "step": 4010 }, { "epoch": 3.216, "grad_norm": 6.8652777671813965, "learning_rate": 4.28910993343642e-06, "logits/chosen": 0.9662749171257019, "logits/rejected": 0.9868101477622986, "logps/chosen": -201.87808227539062, "logps/rejected": -243.437744140625, "loss": 0.3566, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.0853939056396484, "rewards/margins": 1.5348894596099854, "rewards/rejected": -2.620283365249634, "step": 4020 }, { "epoch": 3.224, "grad_norm": 10.237499237060547, "learning_rate": 4.284226756535814e-06, "logits/chosen": 0.899558961391449, "logits/rejected": 0.8322281241416931, "logps/chosen": -207.8240203857422, "logps/rejected": -255.903564453125, "loss": 0.42, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0343433618545532, "rewards/margins": 1.4542030096054077, "rewards/rejected": -2.488546371459961, "step": 4030 }, { "epoch": 3.232, "grad_norm": 8.353890419006348, "learning_rate": 4.279329665876548e-06, "logits/chosen": 0.8721134066581726, "logits/rejected": 0.8629803657531738, "logps/chosen": -199.42417907714844, "logps/rejected": -235.1962432861328, "loss": 0.404, "rewards/accuracies": 0.875, "rewards/chosen": -1.0176727771759033, "rewards/margins": 1.266229510307312, "rewards/rejected": -2.283902406692505, "step": 4040 }, { "epoch": 3.24, "grad_norm": 8.403268814086914, "learning_rate": 4.274418699647117e-06, "logits/chosen": 0.9170069694519043, "logits/rejected": 0.836732029914856, "logps/chosen": -224.29885864257812, "logps/rejected": -224.2794189453125, "loss": 0.475, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.4468005895614624, "rewards/margins": 1.152286410331726, "rewards/rejected": -2.5990869998931885, "step": 4050 }, { "epoch": 3.248, "grad_norm": 13.75932502746582, "learning_rate": 4.269493896144224e-06, "logits/chosen": 0.809971809387207, "logits/rejected": 0.8430444598197937, "logps/chosen": -188.1841278076172, "logps/rejected": -231.8311767578125, "loss": 0.4046, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8959644436836243, "rewards/margins": 1.3504829406738281, "rewards/rejected": -2.2464473247528076, "step": 4060 }, { "epoch": 3.2560000000000002, "grad_norm": 15.792646408081055, "learning_rate": 4.264555293772475e-06, "logits/chosen": 0.8807941675186157, "logits/rejected": 0.8839223980903625, "logps/chosen": -203.93402099609375, "logps/rejected": -227.60142517089844, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": -0.9412260055541992, "rewards/margins": 1.0077519416809082, "rewards/rejected": -1.9489777088165283, "step": 4070 }, { "epoch": 3.2640000000000002, "grad_norm": 13.880685806274414, "learning_rate": 4.2596029310440826e-06, "logits/chosen": 0.8665573000907898, "logits/rejected": 0.9192334413528442, "logps/chosen": -225.70999145507812, "logps/rejected": -229.2024383544922, "loss": 0.4918, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1684324741363525, "rewards/margins": 1.0852488279342651, "rewards/rejected": -2.253680944442749, "step": 4080 }, { "epoch": 3.2720000000000002, "grad_norm": 12.61258602142334, "learning_rate": 4.254636846578567e-06, "logits/chosen": 0.8059272766113281, "logits/rejected": 0.7835027575492859, "logps/chosen": -214.0414581298828, "logps/rejected": -231.42832946777344, "loss": 0.4407, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8260923624038696, "rewards/margins": 1.0982822179794312, "rewards/rejected": -1.9243745803833008, "step": 4090 }, { "epoch": 3.2800000000000002, "grad_norm": 14.153593063354492, "learning_rate": 4.249657079102452e-06, "logits/chosen": 0.9150283932685852, "logits/rejected": 0.7976348996162415, "logps/chosen": -203.6976776123047, "logps/rejected": -214.48025512695312, "loss": 0.4579, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8042591214179993, "rewards/margins": 1.0714467763900757, "rewards/rejected": -1.8757059574127197, "step": 4100 }, { "epoch": 3.288, "grad_norm": 12.573663711547852, "learning_rate": 4.244663667448965e-06, "logits/chosen": 0.9341398477554321, "logits/rejected": 0.8945412039756775, "logps/chosen": -218.30126953125, "logps/rejected": -234.64599609375, "loss": 0.462, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2913658618927002, "rewards/margins": 1.1530601978302002, "rewards/rejected": -2.4444260597229004, "step": 4110 }, { "epoch": 3.296, "grad_norm": 9.752543449401855, "learning_rate": 4.239656650557733e-06, "logits/chosen": 0.9419944882392883, "logits/rejected": 0.9461938738822937, "logps/chosen": -205.2966766357422, "logps/rejected": -216.1801300048828, "loss": 0.4384, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.2133740186691284, "rewards/margins": 1.447343111038208, "rewards/rejected": -2.660717010498047, "step": 4120 }, { "epoch": 3.304, "grad_norm": 19.97236442565918, "learning_rate": 4.234636067474481e-06, "logits/chosen": 0.8762012720108032, "logits/rejected": 0.8730892539024353, "logps/chosen": -209.2789306640625, "logps/rejected": -225.71426391601562, "loss": 0.5437, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9765220880508423, "rewards/margins": 0.9421109557151794, "rewards/rejected": -1.9186328649520874, "step": 4130 }, { "epoch": 3.312, "grad_norm": 7.913999080657959, "learning_rate": 4.229601957350722e-06, "logits/chosen": 0.8649017214775085, "logits/rejected": 0.7728471159934998, "logps/chosen": -236.4957733154297, "logps/rejected": -252.7061767578125, "loss": 0.4567, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.455244779586792, "rewards/margins": 1.1092432737350464, "rewards/rejected": -2.564488172531128, "step": 4140 }, { "epoch": 3.32, "grad_norm": 12.79985523223877, "learning_rate": 4.224554359443459e-06, "logits/chosen": 0.9708512425422668, "logits/rejected": 0.8335834741592407, "logps/chosen": -202.499755859375, "logps/rejected": -231.90538024902344, "loss": 0.4786, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0350658893585205, "rewards/margins": 1.2157402038574219, "rewards/rejected": -2.2508060932159424, "step": 4150 }, { "epoch": 3.328, "grad_norm": 9.839788436889648, "learning_rate": 4.219493313114875e-06, "logits/chosen": 0.9519194960594177, "logits/rejected": 0.9104518294334412, "logps/chosen": -205.2983856201172, "logps/rejected": -224.93663024902344, "loss": 0.3966, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8869916796684265, "rewards/margins": 1.4354019165039062, "rewards/rejected": -2.3223936557769775, "step": 4160 }, { "epoch": 3.336, "grad_norm": 12.0596284866333, "learning_rate": 4.214418857832025e-06, "logits/chosen": 0.8763806223869324, "logits/rejected": 0.8765867352485657, "logps/chosen": -222.2722625732422, "logps/rejected": -257.39453125, "loss": 0.4153, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.6189930438995361, "rewards/margins": 1.1976702213287354, "rewards/rejected": -2.8166635036468506, "step": 4170 }, { "epoch": 3.344, "grad_norm": 8.58606243133545, "learning_rate": 4.209331033166532e-06, "logits/chosen": 0.9239550828933716, "logits/rejected": 0.8988777995109558, "logps/chosen": -198.4449920654297, "logps/rejected": -243.26878356933594, "loss": 0.4353, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5305871963500977, "rewards/margins": 1.0142645835876465, "rewards/rejected": -2.544851779937744, "step": 4180 }, { "epoch": 3.352, "grad_norm": 15.175363540649414, "learning_rate": 4.2042298787942735e-06, "logits/chosen": 0.843410313129425, "logits/rejected": 0.8987080454826355, "logps/chosen": -211.9822235107422, "logps/rejected": -244.9770965576172, "loss": 0.4034, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.0648707151412964, "rewards/margins": 1.016554355621338, "rewards/rejected": -2.081425189971924, "step": 4190 }, { "epoch": 3.36, "grad_norm": 8.164642333984375, "learning_rate": 4.1991154344950755e-06, "logits/chosen": 0.9261566996574402, "logits/rejected": 0.897598385810852, "logps/chosen": -202.40428161621094, "logps/rejected": -241.23191833496094, "loss": 0.3954, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.40160098671913147, "rewards/margins": 1.1215957403182983, "rewards/rejected": -1.5231966972351074, "step": 4200 }, { "epoch": 3.368, "grad_norm": 10.852888107299805, "learning_rate": 4.193987740152404e-06, "logits/chosen": 0.955554187297821, "logits/rejected": 0.9004291892051697, "logps/chosen": -184.4699249267578, "logps/rejected": -220.79690551757812, "loss": 0.3409, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -0.6415241956710815, "rewards/margins": 1.315872073173523, "rewards/rejected": -1.957396388053894, "step": 4210 }, { "epoch": 3.376, "grad_norm": 18.212907791137695, "learning_rate": 4.188846835753047e-06, "logits/chosen": 0.8005386590957642, "logits/rejected": 0.8098655939102173, "logps/chosen": -215.5693359375, "logps/rejected": -231.4944610595703, "loss": 0.3963, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9665080308914185, "rewards/margins": 1.293958306312561, "rewards/rejected": -2.2604663372039795, "step": 4220 }, { "epoch": 3.384, "grad_norm": 10.81216812133789, "learning_rate": 4.183692761386813e-06, "logits/chosen": 0.9972082376480103, "logits/rejected": 0.9668437242507935, "logps/chosen": -207.8662872314453, "logps/rejected": -236.48428344726562, "loss": 0.4591, "rewards/accuracies": 0.75, "rewards/chosen": -0.5786136984825134, "rewards/margins": 1.239148497581482, "rewards/rejected": -1.8177622556686401, "step": 4230 }, { "epoch": 3.392, "grad_norm": 12.289898872375488, "learning_rate": 4.178525557246207e-06, "logits/chosen": 0.8920055627822876, "logits/rejected": 0.8146273493766785, "logps/chosen": -197.36410522460938, "logps/rejected": -226.280029296875, "loss": 0.4908, "rewards/accuracies": 0.75, "rewards/chosen": -1.0728448629379272, "rewards/margins": 1.341575264930725, "rewards/rejected": -2.4144201278686523, "step": 4240 }, { "epoch": 3.4, "grad_norm": 10.308980941772461, "learning_rate": 4.173345263626125e-06, "logits/chosen": 0.9153737425804138, "logits/rejected": 0.9260154962539673, "logps/chosen": -215.0824737548828, "logps/rejected": -243.21629333496094, "loss": 0.479, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.110467791557312, "rewards/margins": 1.1035948991775513, "rewards/rejected": -2.214062452316284, "step": 4250 }, { "epoch": 3.408, "grad_norm": 9.071174621582031, "learning_rate": 4.168151920923536e-06, "logits/chosen": 0.8960357904434204, "logits/rejected": 0.8135896921157837, "logps/chosen": -216.384765625, "logps/rejected": -231.6706085205078, "loss": 0.4235, "rewards/accuracies": 0.8125, "rewards/chosen": -0.950639545917511, "rewards/margins": 1.0974748134613037, "rewards/rejected": -2.048114538192749, "step": 4260 }, { "epoch": 3.416, "grad_norm": 7.223389148712158, "learning_rate": 4.162945569637174e-06, "logits/chosen": 0.7265322804450989, "logits/rejected": 0.7518787384033203, "logps/chosen": -207.7421112060547, "logps/rejected": -234.2729034423828, "loss": 0.4978, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4523930549621582, "rewards/margins": 1.2414056062698364, "rewards/rejected": -2.693798542022705, "step": 4270 }, { "epoch": 3.424, "grad_norm": 12.135030746459961, "learning_rate": 4.157726250367208e-06, "logits/chosen": 0.8626713752746582, "logits/rejected": 0.8069788217544556, "logps/chosen": -228.7198944091797, "logps/rejected": -217.19143676757812, "loss": 0.412, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.6234177350997925, "rewards/margins": 1.184737205505371, "rewards/rejected": -2.808155059814453, "step": 4280 }, { "epoch": 3.432, "grad_norm": 10.609066009521484, "learning_rate": 4.152494003814939e-06, "logits/chosen": 0.93280029296875, "logits/rejected": 0.8937816619873047, "logps/chosen": -212.73155212402344, "logps/rejected": -226.97402954101562, "loss": 0.5399, "rewards/accuracies": 0.75, "rewards/chosen": -1.471534013748169, "rewards/margins": 0.9502825736999512, "rewards/rejected": -2.42181658744812, "step": 4290 }, { "epoch": 3.44, "grad_norm": 12.865897178649902, "learning_rate": 4.147248870782477e-06, "logits/chosen": 0.9401634335517883, "logits/rejected": 0.8570839166641235, "logps/chosen": -236.3987274169922, "logps/rejected": -248.67503356933594, "loss": 0.4249, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5956907272338867, "rewards/margins": 1.3058620691299438, "rewards/rejected": -2.901552677154541, "step": 4300 }, { "epoch": 3.448, "grad_norm": 10.754268646240234, "learning_rate": 4.141990892172424e-06, "logits/chosen": 0.9375637173652649, "logits/rejected": 0.8381093144416809, "logps/chosen": -232.57603454589844, "logps/rejected": -257.6446838378906, "loss": 0.4866, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8424854278564453, "rewards/margins": 1.1537357568740845, "rewards/rejected": -2.9962213039398193, "step": 4310 }, { "epoch": 3.456, "grad_norm": 5.4626054763793945, "learning_rate": 4.136720108987552e-06, "logits/chosen": 1.1030884981155396, "logits/rejected": 0.9064838290214539, "logps/chosen": -203.1874542236328, "logps/rejected": -221.27049255371094, "loss": 0.3856, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.246989130973816, "rewards/margins": 1.3378962278366089, "rewards/rejected": -2.584885358810425, "step": 4320 }, { "epoch": 3.464, "grad_norm": 12.412184715270996, "learning_rate": 4.131436562330488e-06, "logits/chosen": 0.9628559350967407, "logits/rejected": 0.9677176475524902, "logps/chosen": -211.6782684326172, "logps/rejected": -234.12564086914062, "loss": 0.4181, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5418589115142822, "rewards/margins": 1.1167868375778198, "rewards/rejected": -2.6586456298828125, "step": 4330 }, { "epoch": 3.472, "grad_norm": 15.786921501159668, "learning_rate": 4.126140293403389e-06, "logits/chosen": 0.98765629529953, "logits/rejected": 0.9222535490989685, "logps/chosen": -210.3227081298828, "logps/rejected": -228.1774444580078, "loss": 0.4448, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.6453129053115845, "rewards/margins": 1.0853195190429688, "rewards/rejected": -2.7306325435638428, "step": 4340 }, { "epoch": 3.48, "grad_norm": 10.291638374328613, "learning_rate": 4.1208313435076255e-06, "logits/chosen": 0.9463958740234375, "logits/rejected": 0.9481539130210876, "logps/chosen": -212.87197875976562, "logps/rejected": -221.46165466308594, "loss": 0.5025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4104286432266235, "rewards/margins": 1.028719425201416, "rewards/rejected": -2.43914794921875, "step": 4350 }, { "epoch": 3.488, "grad_norm": 18.95135498046875, "learning_rate": 4.115509754043454e-06, "logits/chosen": 0.9782568216323853, "logits/rejected": 0.9336418509483337, "logps/chosen": -213.4448699951172, "logps/rejected": -235.00161743164062, "loss": 0.4065, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11874504387378693, "rewards/margins": 1.4726828336715698, "rewards/rejected": -1.5914280414581299, "step": 4360 }, { "epoch": 3.496, "grad_norm": 15.36201000213623, "learning_rate": 4.1101755665097e-06, "logits/chosen": 1.0215367078781128, "logits/rejected": 0.9443756937980652, "logps/chosen": -210.01756286621094, "logps/rejected": -214.49449157714844, "loss": 0.4452, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8719436526298523, "rewards/margins": 1.059888482093811, "rewards/rejected": -1.931832194328308, "step": 4370 }, { "epoch": 3.504, "grad_norm": 14.627403259277344, "learning_rate": 4.104828822503427e-06, "logits/chosen": 1.0366023778915405, "logits/rejected": 0.9075769782066345, "logps/chosen": -215.0442657470703, "logps/rejected": -228.04214477539062, "loss": 0.4347, "rewards/accuracies": 0.8125, "rewards/chosen": -0.925746738910675, "rewards/margins": 1.1247776746749878, "rewards/rejected": -2.0505244731903076, "step": 4380 }, { "epoch": 3.512, "grad_norm": 12.368705749511719, "learning_rate": 4.09946956371962e-06, "logits/chosen": 0.8762325644493103, "logits/rejected": 0.9048509001731873, "logps/chosen": -215.4369659423828, "logps/rejected": -259.8838806152344, "loss": 0.4074, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.2534805536270142, "rewards/margins": 1.3236781358718872, "rewards/rejected": -2.5771586894989014, "step": 4390 }, { "epoch": 3.52, "grad_norm": 11.062219619750977, "learning_rate": 4.094097831950855e-06, "logits/chosen": 0.9681398272514343, "logits/rejected": 0.8761087656021118, "logps/chosen": -194.627685546875, "logps/rejected": -231.01333618164062, "loss": 0.4311, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7227877974510193, "rewards/margins": 1.3572759628295898, "rewards/rejected": -2.080063819885254, "step": 4400 }, { "epoch": 3.528, "grad_norm": 13.982263565063477, "learning_rate": 4.0887136690869774e-06, "logits/chosen": 0.9537173509597778, "logits/rejected": 0.9426767230033875, "logps/chosen": -201.9144287109375, "logps/rejected": -229.7262420654297, "loss": 0.3984, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8084962964057922, "rewards/margins": 1.3087875843048096, "rewards/rejected": -2.117284059524536, "step": 4410 }, { "epoch": 3.536, "grad_norm": 12.993025779724121, "learning_rate": 4.0833171171147675e-06, "logits/chosen": 0.8449978828430176, "logits/rejected": 0.8342208862304688, "logps/chosen": -216.42979431152344, "logps/rejected": -257.1995849609375, "loss": 0.3506, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0627737045288086, "rewards/margins": 1.590333342552185, "rewards/rejected": -2.653106927871704, "step": 4420 }, { "epoch": 3.544, "grad_norm": 16.3622989654541, "learning_rate": 4.077908218117625e-06, "logits/chosen": 1.0056699514389038, "logits/rejected": 0.9356996417045593, "logps/chosen": -223.9595947265625, "logps/rejected": -241.6196746826172, "loss": 0.3655, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1103066205978394, "rewards/margins": 1.436450481414795, "rewards/rejected": -2.546757459640503, "step": 4430 }, { "epoch": 3.552, "grad_norm": 4.724842548370361, "learning_rate": 4.072487014275228e-06, "logits/chosen": 0.9575614929199219, "logits/rejected": 0.7959676384925842, "logps/chosen": -206.49546813964844, "logps/rejected": -259.52972412109375, "loss": 0.4052, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4658904075622559, "rewards/margins": 1.5798839330673218, "rewards/rejected": -3.045774221420288, "step": 4440 }, { "epoch": 3.56, "grad_norm": 11.298309326171875, "learning_rate": 4.067053547863215e-06, "logits/chosen": 0.9113892912864685, "logits/rejected": 0.837977409362793, "logps/chosen": -218.4698486328125, "logps/rejected": -254.52427673339844, "loss": 0.4225, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5362995862960815, "rewards/margins": 1.178178071975708, "rewards/rejected": -2.7144775390625, "step": 4450 }, { "epoch": 3.568, "grad_norm": 6.332326889038086, "learning_rate": 4.061607861252848e-06, "logits/chosen": 0.870576024055481, "logits/rejected": 0.8328754305839539, "logps/chosen": -217.90184020996094, "logps/rejected": -243.92466735839844, "loss": 0.4866, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8576223850250244, "rewards/margins": 1.264044165611267, "rewards/rejected": -3.121666669845581, "step": 4460 }, { "epoch": 3.576, "grad_norm": 19.035297393798828, "learning_rate": 4.056149996910683e-06, "logits/chosen": 0.8594030737876892, "logits/rejected": 0.7456435561180115, "logps/chosen": -213.825439453125, "logps/rejected": -235.2193145751953, "loss": 0.4798, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.9178558588027954, "rewards/margins": 1.1325775384902954, "rewards/rejected": -3.050433397293091, "step": 4470 }, { "epoch": 3.584, "grad_norm": 7.088034152984619, "learning_rate": 4.050679997398247e-06, "logits/chosen": 0.7786403894424438, "logits/rejected": 0.806538999080658, "logps/chosen": -202.5958709716797, "logps/rejected": -247.24156188964844, "loss": 0.4065, "rewards/accuracies": 0.8125, "rewards/chosen": -1.560505747795105, "rewards/margins": 1.615809440612793, "rewards/rejected": -3.1763153076171875, "step": 4480 }, { "epoch": 3.592, "grad_norm": 7.6185078620910645, "learning_rate": 4.045197905371691e-06, "logits/chosen": 0.8927198648452759, "logits/rejected": 0.8568658232688904, "logps/chosen": -200.9806671142578, "logps/rejected": -230.39976501464844, "loss": 0.4441, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6182069778442383, "rewards/margins": 1.4453877210617065, "rewards/rejected": -3.0635945796966553, "step": 4490 }, { "epoch": 3.6, "grad_norm": 16.420970916748047, "learning_rate": 4.039703763581472e-06, "logits/chosen": 0.8568277359008789, "logits/rejected": 0.7848814725875854, "logps/chosen": -235.65513610839844, "logps/rejected": -240.30740356445312, "loss": 0.463, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -2.157593250274658, "rewards/margins": 1.1094763278961182, "rewards/rejected": -3.2670693397521973, "step": 4500 }, { "epoch": 3.608, "grad_norm": 9.267051696777344, "learning_rate": 4.03419761487201e-06, "logits/chosen": 1.004172921180725, "logits/rejected": 0.8293590545654297, "logps/chosen": -220.4943389892578, "logps/rejected": -238.46202087402344, "loss": 0.4328, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6261404752731323, "rewards/margins": 1.2516275644302368, "rewards/rejected": -2.877768039703369, "step": 4510 }, { "epoch": 3.616, "grad_norm": 8.903371810913086, "learning_rate": 4.0286795021813595e-06, "logits/chosen": 0.7373332977294922, "logits/rejected": 0.7830902338027954, "logps/chosen": -206.95864868164062, "logps/rejected": -238.60684204101562, "loss": 0.4642, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.362304449081421, "rewards/margins": 1.3245913982391357, "rewards/rejected": -2.6868958473205566, "step": 4520 }, { "epoch": 3.624, "grad_norm": 9.160982131958008, "learning_rate": 4.023149468540871e-06, "logits/chosen": 0.8418847918510437, "logits/rejected": 0.8112041354179382, "logps/chosen": -207.1737060546875, "logps/rejected": -238.65518188476562, "loss": 0.5013, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8721437454223633, "rewards/margins": 1.152505874633789, "rewards/rejected": -3.0246498584747314, "step": 4530 }, { "epoch": 3.632, "grad_norm": 8.608299255371094, "learning_rate": 4.0176075570748596e-06, "logits/chosen": 0.8552350997924805, "logits/rejected": 0.8494642376899719, "logps/chosen": -219.12474060058594, "logps/rejected": -222.3189239501953, "loss": 0.4387, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.2666515111923218, "rewards/margins": 1.1172685623168945, "rewards/rejected": -2.3839199542999268, "step": 4540 }, { "epoch": 3.64, "grad_norm": 11.497233390808105, "learning_rate": 4.012053811000262e-06, "logits/chosen": 0.919759213924408, "logits/rejected": 0.9661205410957336, "logps/chosen": -220.15342712402344, "logps/rejected": -245.29685974121094, "loss": 0.3889, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.2769368886947632, "rewards/margins": 1.3770891427993774, "rewards/rejected": -2.6540262699127197, "step": 4550 }, { "epoch": 3.648, "grad_norm": 12.795794486999512, "learning_rate": 4.006488273626307e-06, "logits/chosen": 0.8910170793533325, "logits/rejected": 0.8077157139778137, "logps/chosen": -208.62158203125, "logps/rejected": -234.9666290283203, "loss": 0.424, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1060861349105835, "rewards/margins": 1.2906891107559204, "rewards/rejected": -2.396775007247925, "step": 4560 }, { "epoch": 3.656, "grad_norm": 19.536590576171875, "learning_rate": 4.000910988354172e-06, "logits/chosen": 0.9590219855308533, "logits/rejected": 0.9733330011367798, "logps/chosen": -199.6140899658203, "logps/rejected": -206.4640655517578, "loss": 0.4468, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.5966355204582214, "rewards/margins": 1.0923033952713013, "rewards/rejected": -1.6889389753341675, "step": 4570 }, { "epoch": 3.664, "grad_norm": 7.018489837646484, "learning_rate": 3.995321998676648e-06, "logits/chosen": 0.9202627539634705, "logits/rejected": 0.8594157099723816, "logps/chosen": -203.9952850341797, "logps/rejected": -229.83840942382812, "loss": 0.4649, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.984645664691925, "rewards/margins": 1.0351135730743408, "rewards/rejected": -2.0197594165802, "step": 4580 }, { "epoch": 3.672, "grad_norm": 10.823237419128418, "learning_rate": 3.989721348177801e-06, "logits/chosen": 0.9117226004600525, "logits/rejected": 0.8806440234184265, "logps/chosen": -222.4832763671875, "logps/rejected": -247.8332061767578, "loss": 0.3441, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.3591053485870361, "rewards/margins": 1.4962059259414673, "rewards/rejected": -2.855311155319214, "step": 4590 }, { "epoch": 3.68, "grad_norm": 14.88288402557373, "learning_rate": 3.984109080532627e-06, "logits/chosen": 0.9188446998596191, "logits/rejected": 0.8930045366287231, "logps/chosen": -190.447509765625, "logps/rejected": -212.60386657714844, "loss": 0.4098, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.0938667058944702, "rewards/margins": 1.1657882928848267, "rewards/rejected": -2.259654998779297, "step": 4600 }, { "epoch": 3.6879999999999997, "grad_norm": 15.50723934173584, "learning_rate": 3.978485239506717e-06, "logits/chosen": 0.8951994180679321, "logits/rejected": 0.9232072830200195, "logps/chosen": -222.2184295654297, "logps/rejected": -248.290771484375, "loss": 0.4084, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0719726085662842, "rewards/margins": 1.3165521621704102, "rewards/rejected": -2.3885247707366943, "step": 4610 }, { "epoch": 3.6959999999999997, "grad_norm": 11.021881103515625, "learning_rate": 3.972849868955913e-06, "logits/chosen": 0.9283590316772461, "logits/rejected": 0.8612028360366821, "logps/chosen": -208.7410888671875, "logps/rejected": -236.5598907470703, "loss": 0.2934, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6821409463882446, "rewards/margins": 1.696975588798523, "rewards/rejected": -2.3791165351867676, "step": 4620 }, { "epoch": 3.7039999999999997, "grad_norm": 7.1401777267456055, "learning_rate": 3.967203012825965e-06, "logits/chosen": 0.8836082816123962, "logits/rejected": 0.8645914196968079, "logps/chosen": -213.68142700195312, "logps/rejected": -251.0132293701172, "loss": 0.4014, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4805501699447632, "rewards/margins": 1.6087449789047241, "rewards/rejected": -3.0892951488494873, "step": 4630 }, { "epoch": 3.7119999999999997, "grad_norm": 15.971772193908691, "learning_rate": 3.961544715152195e-06, "logits/chosen": 0.9219527244567871, "logits/rejected": 0.8754696249961853, "logps/chosen": -201.8592987060547, "logps/rejected": -226.51504516601562, "loss": 0.4237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1449387073516846, "rewards/margins": 1.4969040155410767, "rewards/rejected": -2.6418426036834717, "step": 4640 }, { "epoch": 3.7199999999999998, "grad_norm": 3.4029507637023926, "learning_rate": 3.955875020059141e-06, "logits/chosen": 0.9539863467216492, "logits/rejected": 0.9481542706489563, "logps/chosen": -190.33999633789062, "logps/rejected": -230.81553649902344, "loss": 0.4776, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0588791370391846, "rewards/margins": 1.255807876586914, "rewards/rejected": -2.3146870136260986, "step": 4650 }, { "epoch": 3.7279999999999998, "grad_norm": 9.8549165725708, "learning_rate": 3.950193971760227e-06, "logits/chosen": 0.9642109274864197, "logits/rejected": 0.9821377992630005, "logps/chosen": -210.73709106445312, "logps/rejected": -215.7576904296875, "loss": 0.4352, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1254199743270874, "rewards/margins": 1.1262983083724976, "rewards/rejected": -2.251718282699585, "step": 4660 }, { "epoch": 3.7359999999999998, "grad_norm": 13.556317329406738, "learning_rate": 3.944501614557408e-06, "logits/chosen": 1.0108891725540161, "logits/rejected": 0.903409481048584, "logps/chosen": -189.86575317382812, "logps/rejected": -216.7268524169922, "loss": 0.3932, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1368598937988281, "rewards/margins": 1.453155517578125, "rewards/rejected": -2.590015172958374, "step": 4670 }, { "epoch": 3.7439999999999998, "grad_norm": 12.027454376220703, "learning_rate": 3.938797992840828e-06, "logits/chosen": 0.9133999943733215, "logits/rejected": 0.8714101910591125, "logps/chosen": -202.46926879882812, "logps/rejected": -219.1940460205078, "loss": 0.504, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0332410335540771, "rewards/margins": 1.1814830303192139, "rewards/rejected": -2.214724063873291, "step": 4680 }, { "epoch": 3.752, "grad_norm": 7.450799465179443, "learning_rate": 3.933083151088476e-06, "logits/chosen": 0.9655442237854004, "logits/rejected": 0.9951929450035095, "logps/chosen": -216.5625762939453, "logps/rejected": -226.92529296875, "loss": 0.4698, "rewards/accuracies": 0.75, "rewards/chosen": -1.4813133478164673, "rewards/margins": 1.0279206037521362, "rewards/rejected": -2.5092339515686035, "step": 4690 }, { "epoch": 3.76, "grad_norm": 10.197044372558594, "learning_rate": 3.927357133865836e-06, "logits/chosen": 0.8014942407608032, "logits/rejected": 0.7166596055030823, "logps/chosen": -206.07383728027344, "logps/rejected": -233.2744598388672, "loss": 0.4263, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1761497259140015, "rewards/margins": 1.4023005962371826, "rewards/rejected": -2.5784504413604736, "step": 4700 }, { "epoch": 3.768, "grad_norm": 12.484579086303711, "learning_rate": 3.92161998582554e-06, "logits/chosen": 0.9144058227539062, "logits/rejected": 0.8918857574462891, "logps/chosen": -210.94981384277344, "logps/rejected": -232.71836853027344, "loss": 0.4134, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.373250961303711, "rewards/margins": 1.1428226232528687, "rewards/rejected": -2.516073703765869, "step": 4710 }, { "epoch": 3.776, "grad_norm": 10.925464630126953, "learning_rate": 3.9158717517070216e-06, "logits/chosen": 0.9928507804870605, "logits/rejected": 0.9784805178642273, "logps/chosen": -184.71435546875, "logps/rejected": -216.792724609375, "loss": 0.3318, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.1705821752548218, "rewards/margins": 1.615303874015808, "rewards/rejected": -2.78588604927063, "step": 4720 }, { "epoch": 3.784, "grad_norm": 12.400118827819824, "learning_rate": 3.9101124763361645e-06, "logits/chosen": 0.9748711585998535, "logits/rejected": 0.9293093681335449, "logps/chosen": -217.0461883544922, "logps/rejected": -227.57286071777344, "loss": 0.4629, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5834509134292603, "rewards/margins": 1.2847626209259033, "rewards/rejected": -2.868213415145874, "step": 4730 }, { "epoch": 3.792, "grad_norm": 11.517487525939941, "learning_rate": 3.904342204624955e-06, "logits/chosen": 0.9611164331436157, "logits/rejected": 0.9418201446533203, "logps/chosen": -217.9922637939453, "logps/rejected": -231.400634765625, "loss": 0.4182, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1144764423370361, "rewards/margins": 1.261598825454712, "rewards/rejected": -2.376075029373169, "step": 4740 }, { "epoch": 3.8, "grad_norm": 18.023473739624023, "learning_rate": 3.8985609815711315e-06, "logits/chosen": 1.0194095373153687, "logits/rejected": 0.9130865931510925, "logps/chosen": -217.2231903076172, "logps/rejected": -231.2641143798828, "loss": 0.4554, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4849942922592163, "rewards/margins": 1.1392329931259155, "rewards/rejected": -2.624227285385132, "step": 4750 }, { "epoch": 3.808, "grad_norm": 17.991451263427734, "learning_rate": 3.892768852257831e-06, "logits/chosen": 0.9631202816963196, "logits/rejected": 0.8651983141899109, "logps/chosen": -215.14443969726562, "logps/rejected": -251.19363403320312, "loss": 0.492, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.984946608543396, "rewards/margins": 1.1132451295852661, "rewards/rejected": -3.098191499710083, "step": 4760 }, { "epoch": 3.816, "grad_norm": 10.07773208618164, "learning_rate": 3.886965861853243e-06, "logits/chosen": 0.9928179979324341, "logits/rejected": 0.9808389544487, "logps/chosen": -209.1436309814453, "logps/rejected": -228.26318359375, "loss": 0.3544, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.264801263809204, "rewards/margins": 1.3731207847595215, "rewards/rejected": -2.6379220485687256, "step": 4770 }, { "epoch": 3.824, "grad_norm": 21.519670486450195, "learning_rate": 3.881152055610253e-06, "logits/chosen": 0.9541469812393188, "logits/rejected": 0.8962286114692688, "logps/chosen": -216.61123657226562, "logps/rejected": -220.80076599121094, "loss": 0.5935, "rewards/accuracies": 0.7125000357627869, "rewards/chosen": -1.6538161039352417, "rewards/margins": 0.6947936415672302, "rewards/rejected": -2.348609685897827, "step": 4780 }, { "epoch": 3.832, "grad_norm": 12.259000778198242, "learning_rate": 3.875327478866089e-06, "logits/chosen": 0.8635460734367371, "logits/rejected": 0.8118668794631958, "logps/chosen": -233.24827575683594, "logps/rejected": -245.0417022705078, "loss": 0.4181, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2450588941574097, "rewards/margins": 1.115803599357605, "rewards/rejected": -2.3608624935150146, "step": 4790 }, { "epoch": 3.84, "grad_norm": 14.562280654907227, "learning_rate": 3.869492177041971e-06, "logits/chosen": 0.9073505401611328, "logits/rejected": 0.872600257396698, "logps/chosen": -231.16311645507812, "logps/rejected": -224.2497100830078, "loss": 0.5217, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0939778089523315, "rewards/margins": 0.7748345732688904, "rewards/rejected": -1.8688122034072876, "step": 4800 }, { "epoch": 3.848, "grad_norm": 14.074769973754883, "learning_rate": 3.863646195642754e-06, "logits/chosen": 0.9319748282432556, "logits/rejected": 0.8828820586204529, "logps/chosen": -201.0325164794922, "logps/rejected": -209.9548797607422, "loss": 0.4524, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -0.7232056856155396, "rewards/margins": 1.1827844381332397, "rewards/rejected": -1.9059902429580688, "step": 4810 }, { "epoch": 3.856, "grad_norm": 13.172402381896973, "learning_rate": 3.857789580256576e-06, "logits/chosen": 0.8821210861206055, "logits/rejected": 0.8552846312522888, "logps/chosen": -204.34593200683594, "logps/rejected": -239.4946746826172, "loss": 0.5189, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5751923322677612, "rewards/margins": 0.9332385063171387, "rewards/rejected": -2.5084309577941895, "step": 4820 }, { "epoch": 3.864, "grad_norm": 12.942428588867188, "learning_rate": 3.8519223765544985e-06, "logits/chosen": 0.9515409469604492, "logits/rejected": 0.8108540773391724, "logps/chosen": -196.3207550048828, "logps/rejected": -233.15110778808594, "loss": 0.3708, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2026864290237427, "rewards/margins": 1.3849681615829468, "rewards/rejected": -2.5876548290252686, "step": 4830 }, { "epoch": 3.872, "grad_norm": 16.509870529174805, "learning_rate": 3.8460446302901575e-06, "logits/chosen": 0.9374640583992004, "logits/rejected": 0.8481482863426208, "logps/chosen": -200.82627868652344, "logps/rejected": -220.869873046875, "loss": 0.4426, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4088255167007446, "rewards/margins": 1.1208895444869995, "rewards/rejected": -2.529715061187744, "step": 4840 }, { "epoch": 3.88, "grad_norm": 6.809768199920654, "learning_rate": 3.840156387299397e-06, "logits/chosen": 0.8348701596260071, "logits/rejected": 0.8732596635818481, "logps/chosen": -216.68138122558594, "logps/rejected": -247.988037109375, "loss": 0.4629, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.482513189315796, "rewards/margins": 1.2046746015548706, "rewards/rejected": -2.687187910079956, "step": 4850 }, { "epoch": 3.888, "grad_norm": 10.955533981323242, "learning_rate": 3.8342576934999184e-06, "logits/chosen": 0.8893832564353943, "logits/rejected": 0.8247093558311462, "logps/chosen": -204.13548278808594, "logps/rejected": -241.85671997070312, "loss": 0.4514, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.217315435409546, "rewards/margins": 1.1432523727416992, "rewards/rejected": -2.360567808151245, "step": 4860 }, { "epoch": 3.896, "grad_norm": 11.238479614257812, "learning_rate": 3.828348594890923e-06, "logits/chosen": 0.8213216662406921, "logits/rejected": 0.8109332919120789, "logps/chosen": -209.7275848388672, "logps/rejected": -236.1708526611328, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2727683782577515, "rewards/margins": 1.410487174987793, "rewards/rejected": -2.683255910873413, "step": 4870 }, { "epoch": 3.904, "grad_norm": 12.693486213684082, "learning_rate": 3.822429137552747e-06, "logits/chosen": 0.9051896929740906, "logits/rejected": 0.8351686596870422, "logps/chosen": -212.6349639892578, "logps/rejected": -205.12376403808594, "loss": 0.5046, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.834870457649231, "rewards/margins": 0.9832606315612793, "rewards/rejected": -1.8181308507919312, "step": 4880 }, { "epoch": 3.912, "grad_norm": 6.716438293457031, "learning_rate": 3.816499367646508e-06, "logits/chosen": 1.0042909383773804, "logits/rejected": 0.9129732251167297, "logps/chosen": -212.87109375, "logps/rejected": -230.78567504882812, "loss": 0.535, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2843660116195679, "rewards/margins": 1.0389277935028076, "rewards/rejected": -2.323293924331665, "step": 4890 }, { "epoch": 3.92, "grad_norm": 10.39694595336914, "learning_rate": 3.8105593314137434e-06, "logits/chosen": 0.9733989834785461, "logits/rejected": 0.9892801642417908, "logps/chosen": -209.4358367919922, "logps/rejected": -221.290771484375, "loss": 0.3994, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.1634297370910645, "rewards/margins": 1.246660590171814, "rewards/rejected": -2.410090208053589, "step": 4900 }, { "epoch": 3.928, "grad_norm": 16.674728393554688, "learning_rate": 3.804609075176049e-06, "logits/chosen": 0.7904636859893799, "logits/rejected": 0.7977269291877747, "logps/chosen": -202.6865692138672, "logps/rejected": -237.93357849121094, "loss": 0.4082, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5116066932678223, "rewards/margins": 1.3030771017074585, "rewards/rejected": -2.8146839141845703, "step": 4910 }, { "epoch": 3.936, "grad_norm": 11.239212036132812, "learning_rate": 3.7986486453347183e-06, "logits/chosen": 0.9851727485656738, "logits/rejected": 0.8940200805664062, "logps/chosen": -206.59951782226562, "logps/rejected": -215.1295623779297, "loss": 0.4914, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3703104257583618, "rewards/margins": 0.929050624370575, "rewards/rejected": -2.299360990524292, "step": 4920 }, { "epoch": 3.944, "grad_norm": 14.794110298156738, "learning_rate": 3.7926780883703794e-06, "logits/chosen": 0.9150724411010742, "logits/rejected": 0.8797799944877625, "logps/chosen": -209.21240234375, "logps/rejected": -228.32688903808594, "loss": 0.4379, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8193817138671875, "rewards/margins": 1.1322726011276245, "rewards/rejected": -1.9516544342041016, "step": 4930 }, { "epoch": 3.952, "grad_norm": 9.821282386779785, "learning_rate": 3.7866974508426355e-06, "logits/chosen": 0.8630434274673462, "logits/rejected": 0.9651856422424316, "logps/chosen": -226.81321716308594, "logps/rejected": -220.50515747070312, "loss": 0.402, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.8245296478271484, "rewards/margins": 1.2236753702163696, "rewards/rejected": -2.0482048988342285, "step": 4940 }, { "epoch": 3.96, "grad_norm": 11.387616157531738, "learning_rate": 3.7807067793897006e-06, "logits/chosen": 0.8870008587837219, "logits/rejected": 0.8385466933250427, "logps/chosen": -230.2690887451172, "logps/rejected": -249.96641540527344, "loss": 0.5202, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0625405311584473, "rewards/margins": 0.9815382361412048, "rewards/rejected": -2.044078826904297, "step": 4950 }, { "epoch": 3.968, "grad_norm": 10.633583068847656, "learning_rate": 3.7747061207280322e-06, "logits/chosen": 0.867588222026825, "logits/rejected": 0.78270423412323, "logps/chosen": -230.1920928955078, "logps/rejected": -261.4850158691406, "loss": 0.4374, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1695181131362915, "rewards/margins": 1.3066359758377075, "rewards/rejected": -2.476154088973999, "step": 4960 }, { "epoch": 3.976, "grad_norm": 11.73788070678711, "learning_rate": 3.7686955216519733e-06, "logits/chosen": 0.8684186339378357, "logits/rejected": 0.9026303291320801, "logps/chosen": -202.72027587890625, "logps/rejected": -224.8450164794922, "loss": 0.3963, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.1008380651474, "rewards/margins": 1.233585238456726, "rewards/rejected": -2.334423065185547, "step": 4970 }, { "epoch": 3.984, "grad_norm": 12.70168685913086, "learning_rate": 3.7626750290333824e-06, "logits/chosen": 0.9102399945259094, "logits/rejected": 0.789924681186676, "logps/chosen": -217.3763885498047, "logps/rejected": -246.22720336914062, "loss": 0.431, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.2897388935089111, "rewards/margins": 1.2651302814483643, "rewards/rejected": -2.5548691749572754, "step": 4980 }, { "epoch": 3.992, "grad_norm": 10.833782196044922, "learning_rate": 3.7566446898212704e-06, "logits/chosen": 0.8599758148193359, "logits/rejected": 0.8142706751823425, "logps/chosen": -215.34092712402344, "logps/rejected": -204.10824584960938, "loss": 0.4499, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1187313795089722, "rewards/margins": 1.1984655857086182, "rewards/rejected": -2.317196846008301, "step": 4990 }, { "epoch": 4.0, "grad_norm": 13.4774169921875, "learning_rate": 3.7506045510414337e-06, "logits/chosen": 1.0011329650878906, "logits/rejected": 0.9479683041572571, "logps/chosen": -199.9625701904297, "logps/rejected": -239.7847137451172, "loss": 0.3563, "rewards/accuracies": 0.875, "rewards/chosen": -1.0051459074020386, "rewards/margins": 1.3487907648086548, "rewards/rejected": -2.3539369106292725, "step": 5000 }, { "epoch": 4.008, "grad_norm": 8.692243576049805, "learning_rate": 3.7445546597960882e-06, "logits/chosen": 1.0498260259628296, "logits/rejected": 1.0109456777572632, "logps/chosen": -207.2587432861328, "logps/rejected": -230.1753692626953, "loss": 0.3541, "rewards/accuracies": 0.875, "rewards/chosen": -0.9930923581123352, "rewards/margins": 1.3766816854476929, "rewards/rejected": -2.3697738647460938, "step": 5010 }, { "epoch": 4.016, "grad_norm": 14.994426727294922, "learning_rate": 3.7384950632635e-06, "logits/chosen": 1.0372326374053955, "logits/rejected": 0.9997326135635376, "logps/chosen": -206.49755859375, "logps/rejected": -229.08706665039062, "loss": 0.4148, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0184519290924072, "rewards/margins": 1.1713188886642456, "rewards/rejected": -2.1897706985473633, "step": 5020 }, { "epoch": 4.024, "grad_norm": 12.80656909942627, "learning_rate": 3.732425808697622e-06, "logits/chosen": 1.04145085811615, "logits/rejected": 0.8659443855285645, "logps/chosen": -207.9678192138672, "logps/rejected": -230.5495147705078, "loss": 0.3428, "rewards/accuracies": 0.875, "rewards/chosen": -0.694399356842041, "rewards/margins": 1.4684067964553833, "rewards/rejected": -2.162806272506714, "step": 5030 }, { "epoch": 4.032, "grad_norm": 7.541386604309082, "learning_rate": 3.726346943427719e-06, "logits/chosen": 0.9582153558731079, "logits/rejected": 0.867928683757782, "logps/chosen": -208.20193481445312, "logps/rejected": -234.29641723632812, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2386597394943237, "rewards/margins": 1.2774535417556763, "rewards/rejected": -2.516113042831421, "step": 5040 }, { "epoch": 4.04, "grad_norm": 12.571610450744629, "learning_rate": 3.720258514858004e-06, "logits/chosen": 1.0318713188171387, "logits/rejected": 0.8940740823745728, "logps/chosen": -195.3436279296875, "logps/rejected": -235.5468292236328, "loss": 0.3239, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7823795676231384, "rewards/margins": 1.5767382383346558, "rewards/rejected": -2.3591177463531494, "step": 5050 }, { "epoch": 4.048, "grad_norm": 14.773040771484375, "learning_rate": 3.714160570467266e-06, "logits/chosen": 0.9699028134346008, "logits/rejected": 0.9142343401908875, "logps/chosen": -218.6599578857422, "logps/rejected": -241.7670440673828, "loss": 0.3705, "rewards/accuracies": 0.8125, "rewards/chosen": -1.231390357017517, "rewards/margins": 1.3999671936035156, "rewards/rejected": -2.6313576698303223, "step": 5060 }, { "epoch": 4.056, "grad_norm": 8.311098098754883, "learning_rate": 3.7080531578085e-06, "logits/chosen": 0.880449116230011, "logits/rejected": 0.8501182794570923, "logps/chosen": -207.9825897216797, "logps/rejected": -245.84878540039062, "loss": 0.3206, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.1131296157836914, "rewards/margins": 1.682613730430603, "rewards/rejected": -2.795743465423584, "step": 5070 }, { "epoch": 4.064, "grad_norm": 13.896095275878906, "learning_rate": 3.701936324508537e-06, "logits/chosen": 1.1272010803222656, "logits/rejected": 0.9481062293052673, "logps/chosen": -211.05064392089844, "logps/rejected": -214.7148895263672, "loss": 0.3481, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.8524333834648132, "rewards/margins": 1.6547110080718994, "rewards/rejected": -2.5071444511413574, "step": 5080 }, { "epoch": 4.072, "grad_norm": 11.393054008483887, "learning_rate": 3.6958101182676725e-06, "logits/chosen": 0.9813786745071411, "logits/rejected": 0.962406575679779, "logps/chosen": -205.95632934570312, "logps/rejected": -231.1307373046875, "loss": 0.3552, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.020094633102417, "rewards/margins": 1.620416283607483, "rewards/rejected": -2.6405110359191895, "step": 5090 }, { "epoch": 4.08, "grad_norm": 16.073274612426758, "learning_rate": 3.6896745868592924e-06, "logits/chosen": 0.980717658996582, "logits/rejected": 0.9125275611877441, "logps/chosen": -199.70022583007812, "logps/rejected": -237.94412231445312, "loss": 0.3716, "rewards/accuracies": 0.875, "rewards/chosen": -1.2604044675827026, "rewards/margins": 1.5706150531768799, "rewards/rejected": -2.831019639968872, "step": 5100 }, { "epoch": 4.088, "grad_norm": 15.651266098022461, "learning_rate": 3.683529778129504e-06, "logits/chosen": 1.0326038599014282, "logits/rejected": 0.9012963175773621, "logps/chosen": -218.704833984375, "logps/rejected": -238.8866424560547, "loss": 0.3246, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.407049536705017, "rewards/margins": 1.6295727491378784, "rewards/rejected": -3.0366222858428955, "step": 5110 }, { "epoch": 4.096, "grad_norm": 9.559797286987305, "learning_rate": 3.677375739996759e-06, "logits/chosen": 0.9783565402030945, "logits/rejected": 0.9558612704277039, "logps/chosen": -231.9866485595703, "logps/rejected": -237.6753387451172, "loss": 0.3427, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2938991785049438, "rewards/margins": 1.4221700429916382, "rewards/rejected": -2.716069221496582, "step": 5120 }, { "epoch": 4.104, "grad_norm": 20.652666091918945, "learning_rate": 3.6712125204514836e-06, "logits/chosen": 1.0956971645355225, "logits/rejected": 1.1476175785064697, "logps/chosen": -198.1278533935547, "logps/rejected": -215.04112243652344, "loss": 0.3917, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.28689682483673096, "rewards/margins": 1.569340467453003, "rewards/rejected": -1.8562372922897339, "step": 5130 }, { "epoch": 4.112, "grad_norm": 20.57211685180664, "learning_rate": 3.6650401675557025e-06, "logits/chosen": 1.0987768173217773, "logits/rejected": 0.8933493494987488, "logps/chosen": -203.9111785888672, "logps/rejected": -229.03076171875, "loss": 0.3811, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.243438720703125, "rewards/margins": 1.4107178449630737, "rewards/rejected": -2.654156446456909, "step": 5140 }, { "epoch": 4.12, "grad_norm": 5.7946271896362305, "learning_rate": 3.658858729442662e-06, "logits/chosen": 0.9748809933662415, "logits/rejected": 0.8560425043106079, "logps/chosen": -199.7461395263672, "logps/rejected": -234.7223663330078, "loss": 0.3288, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.376451015472412, "rewards/margins": 1.6137140989303589, "rewards/rejected": -2.9901649951934814, "step": 5150 }, { "epoch": 4.128, "grad_norm": 6.650130271911621, "learning_rate": 3.65266825431646e-06, "logits/chosen": 0.9371269345283508, "logits/rejected": 1.002604603767395, "logps/chosen": -216.43150329589844, "logps/rejected": -230.34336853027344, "loss": 0.3251, "rewards/accuracies": 0.875, "rewards/chosen": -1.1472088098526, "rewards/margins": 1.6432514190673828, "rewards/rejected": -2.7904601097106934, "step": 5160 }, { "epoch": 4.136, "grad_norm": 24.08129119873047, "learning_rate": 3.646468790451663e-06, "logits/chosen": 0.8831321597099304, "logits/rejected": 0.776918888092041, "logps/chosen": -213.083740234375, "logps/rejected": -230.79202270507812, "loss": 0.4005, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3220943212509155, "rewards/margins": 1.3914340734481812, "rewards/rejected": -2.7135283946990967, "step": 5170 }, { "epoch": 4.144, "grad_norm": 7.815832138061523, "learning_rate": 3.6402603861929374e-06, "logits/chosen": 1.0589165687561035, "logits/rejected": 0.9391776919364929, "logps/chosen": -221.99449157714844, "logps/rejected": -242.86216735839844, "loss": 0.2609, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1631277799606323, "rewards/margins": 1.766058325767517, "rewards/rejected": -2.9291858673095703, "step": 5180 }, { "epoch": 4.152, "grad_norm": 13.720664978027344, "learning_rate": 3.6340430899546656e-06, "logits/chosen": 0.9697431921958923, "logits/rejected": 0.9235960841178894, "logps/chosen": -223.24876403808594, "logps/rejected": -240.31167602539062, "loss": 0.4369, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2503145933151245, "rewards/margins": 1.2561858892440796, "rewards/rejected": -2.506500482559204, "step": 5190 }, { "epoch": 4.16, "grad_norm": 16.428617477416992, "learning_rate": 3.6278169502205734e-06, "logits/chosen": 1.0490871667861938, "logits/rejected": 0.9422445297241211, "logps/chosen": -211.8967742919922, "logps/rejected": -263.45782470703125, "loss": 0.3235, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.041167140007019, "rewards/margins": 1.970610499382019, "rewards/rejected": -3.011777639389038, "step": 5200 }, { "epoch": 4.168, "grad_norm": 28.398910522460938, "learning_rate": 3.6215820155433486e-06, "logits/chosen": 0.9427713751792908, "logits/rejected": 0.9313327670097351, "logps/chosen": -243.6337890625, "logps/rejected": -252.7070770263672, "loss": 0.3734, "rewards/accuracies": 0.875, "rewards/chosen": -0.9518672227859497, "rewards/margins": 1.5367608070373535, "rewards/rejected": -2.4886281490325928, "step": 5210 }, { "epoch": 4.176, "grad_norm": 14.895367622375488, "learning_rate": 3.615338334544265e-06, "logits/chosen": 0.9828926920890808, "logits/rejected": 0.9496780633926392, "logps/chosen": -203.6242218017578, "logps/rejected": -224.4546661376953, "loss": 0.3542, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.7610888481140137, "rewards/margins": 1.4806041717529297, "rewards/rejected": -2.2416930198669434, "step": 5220 }, { "epoch": 4.184, "grad_norm": 19.1573543548584, "learning_rate": 3.6090859559128e-06, "logits/chosen": 0.9076266288757324, "logits/rejected": 0.9045122265815735, "logps/chosen": -212.6093292236328, "logps/rejected": -223.61474609375, "loss": 0.3464, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.48652344942092896, "rewards/margins": 1.6024211645126343, "rewards/rejected": -2.088944435119629, "step": 5230 }, { "epoch": 4.192, "grad_norm": 7.1050310134887695, "learning_rate": 3.6028249284062593e-06, "logits/chosen": 0.9402229189872742, "logits/rejected": 0.9130407571792603, "logps/chosen": -215.83328247070312, "logps/rejected": -237.27294921875, "loss": 0.3177, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.442810297012329, "rewards/margins": 1.6023463010787964, "rewards/rejected": -3.045156478881836, "step": 5240 }, { "epoch": 4.2, "grad_norm": 13.212199211120605, "learning_rate": 3.5965553008493924e-06, "logits/chosen": 0.9931579828262329, "logits/rejected": 0.8333091139793396, "logps/chosen": -216.7156219482422, "logps/rejected": -227.3023681640625, "loss": 0.4671, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2548701763153076, "rewards/margins": 1.3508440256118774, "rewards/rejected": -2.6057140827178955, "step": 5250 }, { "epoch": 4.208, "grad_norm": 15.673922538757324, "learning_rate": 3.590277122134015e-06, "logits/chosen": 0.925972580909729, "logits/rejected": 0.9485855102539062, "logps/chosen": -203.7695770263672, "logps/rejected": -233.1253204345703, "loss": 0.2825, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1053215265274048, "rewards/margins": 1.8811248540878296, "rewards/rejected": -2.9864463806152344, "step": 5260 }, { "epoch": 4.216, "grad_norm": 18.827274322509766, "learning_rate": 3.5839904412186254e-06, "logits/chosen": 0.959101676940918, "logits/rejected": 1.012556552886963, "logps/chosen": -203.8087615966797, "logps/rejected": -251.2672119140625, "loss": 0.3625, "rewards/accuracies": 0.875, "rewards/chosen": -0.9612110257148743, "rewards/margins": 1.7676079273223877, "rewards/rejected": -2.7288191318511963, "step": 5270 }, { "epoch": 4.224, "grad_norm": 10.582806587219238, "learning_rate": 3.577695307128024e-06, "logits/chosen": 0.8980388641357422, "logits/rejected": 0.7367168664932251, "logps/chosen": -219.5625, "logps/rejected": -244.0664520263672, "loss": 0.2701, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9930900931358337, "rewards/margins": 2.069688558578491, "rewards/rejected": -3.0627784729003906, "step": 5280 }, { "epoch": 4.232, "grad_norm": 17.180286407470703, "learning_rate": 3.571391768952932e-06, "logits/chosen": 0.8705675005912781, "logits/rejected": 0.8819023966789246, "logps/chosen": -189.57054138183594, "logps/rejected": -228.0927734375, "loss": 0.3455, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.9885832667350769, "rewards/margins": 1.6450799703598022, "rewards/rejected": -2.6336631774902344, "step": 5290 }, { "epoch": 4.24, "grad_norm": 13.662627220153809, "learning_rate": 3.5650798758496053e-06, "logits/chosen": 0.880375325679779, "logits/rejected": 0.8753491640090942, "logps/chosen": -213.9600067138672, "logps/rejected": -244.53857421875, "loss": 0.3318, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6102064847946167, "rewards/margins": 1.6889526844024658, "rewards/rejected": -3.299158811569214, "step": 5300 }, { "epoch": 4.248, "grad_norm": 19.561792373657227, "learning_rate": 3.558759677039455e-06, "logits/chosen": 0.8705887794494629, "logits/rejected": 0.8816524744033813, "logps/chosen": -213.3289031982422, "logps/rejected": -243.65737915039062, "loss": 0.3461, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5360292196273804, "rewards/margins": 1.6422207355499268, "rewards/rejected": -3.1782500743865967, "step": 5310 }, { "epoch": 4.256, "grad_norm": 16.225622177124023, "learning_rate": 3.552431221808661e-06, "logits/chosen": 0.988519012928009, "logits/rejected": 0.824196457862854, "logps/chosen": -195.06642150878906, "logps/rejected": -229.2406768798828, "loss": 0.3154, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1573065519332886, "rewards/margins": 1.9272657632827759, "rewards/rejected": -3.0845723152160645, "step": 5320 }, { "epoch": 4.264, "grad_norm": 11.897854804992676, "learning_rate": 3.5460945595077874e-06, "logits/chosen": 0.9042286276817322, "logits/rejected": 0.8848081827163696, "logps/chosen": -203.17344665527344, "logps/rejected": -256.4823303222656, "loss": 0.3054, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.392894983291626, "rewards/margins": 1.7944774627685547, "rewards/rejected": -3.1873722076416016, "step": 5330 }, { "epoch": 4.272, "grad_norm": 13.320012092590332, "learning_rate": 3.539749739551401e-06, "logits/chosen": 0.9267520904541016, "logits/rejected": 0.9451652765274048, "logps/chosen": -191.97021484375, "logps/rejected": -214.5209503173828, "loss": 0.2868, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.6399614810943604, "rewards/margins": 1.7933428287506104, "rewards/rejected": -2.4333043098449707, "step": 5340 }, { "epoch": 4.28, "grad_norm": 10.956954002380371, "learning_rate": 3.533396811417682e-06, "logits/chosen": 0.9412531852722168, "logits/rejected": 0.9464859366416931, "logps/chosen": -192.187255859375, "logps/rejected": -215.4243621826172, "loss": 0.3242, "rewards/accuracies": 0.875, "rewards/chosen": -0.8624544143676758, "rewards/margins": 1.6475841999053955, "rewards/rejected": -2.5100386142730713, "step": 5350 }, { "epoch": 4.288, "grad_norm": 18.71451187133789, "learning_rate": 3.527035824648039e-06, "logits/chosen": 0.8915054202079773, "logits/rejected": 0.7794849276542664, "logps/chosen": -192.20079040527344, "logps/rejected": -232.98291015625, "loss": 0.3807, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0030672550201416, "rewards/margins": 1.811819076538086, "rewards/rejected": -2.8148863315582275, "step": 5360 }, { "epoch": 4.296, "grad_norm": 12.68542194366455, "learning_rate": 3.520666828846726e-06, "logits/chosen": 0.7684556841850281, "logits/rejected": 0.7673687934875488, "logps/chosen": -211.497314453125, "logps/rejected": -240.1795654296875, "loss": 0.3209, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4810975790023804, "rewards/margins": 1.700387954711914, "rewards/rejected": -3.181485652923584, "step": 5370 }, { "epoch": 4.304, "grad_norm": 6.8565144538879395, "learning_rate": 3.5142898736804516e-06, "logits/chosen": 0.9116408228874207, "logits/rejected": 0.796613335609436, "logps/chosen": -211.6884765625, "logps/rejected": -243.06777954101562, "loss": 0.3276, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.3549237251281738, "rewards/margins": 1.6243526935577393, "rewards/rejected": -2.979276418685913, "step": 5380 }, { "epoch": 4.312, "grad_norm": 9.291360855102539, "learning_rate": 3.5079050088779927e-06, "logits/chosen": 0.8643763661384583, "logits/rejected": 0.8873327374458313, "logps/chosen": -180.9557342529297, "logps/rejected": -216.88780212402344, "loss": 0.3089, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.6550284624099731, "rewards/margins": 1.8085215091705322, "rewards/rejected": -2.463550090789795, "step": 5390 }, { "epoch": 4.32, "grad_norm": 10.52054500579834, "learning_rate": 3.501512284229807e-06, "logits/chosen": 0.8750426173210144, "logits/rejected": 0.7605211734771729, "logps/chosen": -212.87950134277344, "logps/rejected": -263.2989196777344, "loss": 0.2426, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4775450229644775, "rewards/margins": 2.0750772953033447, "rewards/rejected": -3.552621841430664, "step": 5400 }, { "epoch": 4.328, "grad_norm": 16.574983596801758, "learning_rate": 3.4951117495876473e-06, "logits/chosen": 0.9197494387626648, "logits/rejected": 0.8915640115737915, "logps/chosen": -215.3748016357422, "logps/rejected": -229.8962860107422, "loss": 0.3404, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7154279947280884, "rewards/margins": 1.5610477924346924, "rewards/rejected": -3.276475667953491, "step": 5410 }, { "epoch": 4.336, "grad_norm": 15.055716514587402, "learning_rate": 3.4887034548641673e-06, "logits/chosen": 0.9449960589408875, "logits/rejected": 0.9040482640266418, "logps/chosen": -217.7838897705078, "logps/rejected": -268.0353088378906, "loss": 0.3165, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7793148756027222, "rewards/margins": 1.969291090965271, "rewards/rejected": -3.748605728149414, "step": 5420 }, { "epoch": 4.344, "grad_norm": 7.452909469604492, "learning_rate": 3.482287450032536e-06, "logits/chosen": 0.8897709250450134, "logits/rejected": 0.8994399905204773, "logps/chosen": -212.1781768798828, "logps/rejected": -253.2353973388672, "loss": 0.3063, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.805542230606079, "rewards/margins": 1.7644466161727905, "rewards/rejected": -3.569988965988159, "step": 5430 }, { "epoch": 4.352, "grad_norm": 19.223543167114258, "learning_rate": 3.47586378512605e-06, "logits/chosen": 1.0354408025741577, "logits/rejected": 0.9919587969779968, "logps/chosen": -216.69546508789062, "logps/rejected": -250.68069458007812, "loss": 0.3631, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.8229974508285522, "rewards/margins": 1.5450613498687744, "rewards/rejected": -3.368058919906616, "step": 5440 }, { "epoch": 4.36, "grad_norm": 13.662184715270996, "learning_rate": 3.4694325102377356e-06, "logits/chosen": 0.972303569316864, "logits/rejected": 0.8980138897895813, "logps/chosen": -206.581787109375, "logps/rejected": -222.6265106201172, "loss": 0.4966, "rewards/accuracies": 0.75, "rewards/chosen": -1.6232937574386597, "rewards/margins": 1.1546478271484375, "rewards/rejected": -2.7779414653778076, "step": 5450 }, { "epoch": 4.368, "grad_norm": 6.83968448638916, "learning_rate": 3.462993675519968e-06, "logits/chosen": 0.8973142504692078, "logits/rejected": 1.0178273916244507, "logps/chosen": -226.5254364013672, "logps/rejected": -241.5300750732422, "loss": 0.3852, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1246910095214844, "rewards/margins": 1.604926347732544, "rewards/rejected": -3.7296173572540283, "step": 5460 }, { "epoch": 4.376, "grad_norm": 13.287477493286133, "learning_rate": 3.4565473311840735e-06, "logits/chosen": 1.0390936136245728, "logits/rejected": 0.9516915678977966, "logps/chosen": -214.5906982421875, "logps/rejected": -220.36778259277344, "loss": 0.2884, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.473663330078125, "rewards/margins": 1.6746243238449097, "rewards/rejected": -3.148287534713745, "step": 5470 }, { "epoch": 4.384, "grad_norm": 8.79075813293457, "learning_rate": 3.4500935274999414e-06, "logits/chosen": 0.8991445899009705, "logits/rejected": 0.7987154126167297, "logps/chosen": -221.56704711914062, "logps/rejected": -226.0802764892578, "loss": 0.3229, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3047336339950562, "rewards/margins": 1.384783148765564, "rewards/rejected": -2.68951678276062, "step": 5480 }, { "epoch": 4.392, "grad_norm": 18.390661239624023, "learning_rate": 3.443632314795627e-06, "logits/chosen": 1.0096405744552612, "logits/rejected": 0.9392326474189758, "logps/chosen": -209.8367156982422, "logps/rejected": -223.6701202392578, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -1.4388588666915894, "rewards/margins": 1.779291033744812, "rewards/rejected": -3.2181496620178223, "step": 5490 }, { "epoch": 4.4, "grad_norm": 14.75433349609375, "learning_rate": 3.4371637434569664e-06, "logits/chosen": 0.9798237085342407, "logits/rejected": 0.8885357975959778, "logps/chosen": -215.8732147216797, "logps/rejected": -250.0683135986328, "loss": 0.2836, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.3270418643951416, "rewards/margins": 1.8547767400741577, "rewards/rejected": -3.181818723678589, "step": 5500 }, { "epoch": 4.408, "grad_norm": 7.223197937011719, "learning_rate": 3.430687863927178e-06, "logits/chosen": 0.9878398776054382, "logits/rejected": 0.907991886138916, "logps/chosen": -220.220947265625, "logps/rejected": -232.602294921875, "loss": 0.3854, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4777723550796509, "rewards/margins": 1.486155390739441, "rewards/rejected": -2.963927745819092, "step": 5510 }, { "epoch": 4.416, "grad_norm": 18.754655838012695, "learning_rate": 3.4242047267064714e-06, "logits/chosen": 1.0766704082489014, "logits/rejected": 1.0072535276412964, "logps/chosen": -218.5634307861328, "logps/rejected": -240.5037841796875, "loss": 0.3645, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6403402090072632, "rewards/margins": 1.6216999292373657, "rewards/rejected": -3.26203989982605, "step": 5520 }, { "epoch": 4.424, "grad_norm": 16.398393630981445, "learning_rate": 3.4177143823516523e-06, "logits/chosen": 0.9290235638618469, "logits/rejected": 0.8646947145462036, "logps/chosen": -221.277099609375, "logps/rejected": -243.0144805908203, "loss": 0.3467, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.3182934522628784, "rewards/margins": 1.7082767486572266, "rewards/rejected": -3.0265700817108154, "step": 5530 }, { "epoch": 4.432, "grad_norm": 10.032676696777344, "learning_rate": 3.4112168814757307e-06, "logits/chosen": 0.9694162607192993, "logits/rejected": 0.9398545622825623, "logps/chosen": -205.362060546875, "logps/rejected": -224.15017700195312, "loss": 0.2903, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2131686210632324, "rewards/margins": 1.873663306236267, "rewards/rejected": -3.086832046508789, "step": 5540 }, { "epoch": 4.44, "grad_norm": 21.09775733947754, "learning_rate": 3.4047122747475227e-06, "logits/chosen": 0.9186191558837891, "logits/rejected": 0.763767421245575, "logps/chosen": -226.12075805664062, "logps/rejected": -264.490966796875, "loss": 0.3921, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8982375860214233, "rewards/margins": 1.5824882984161377, "rewards/rejected": -3.4807260036468506, "step": 5550 }, { "epoch": 4.448, "grad_norm": 25.094377517700195, "learning_rate": 3.3982006128912587e-06, "logits/chosen": 0.8753210306167603, "logits/rejected": 0.8916776776313782, "logps/chosen": -205.23233032226562, "logps/rejected": -227.34988403320312, "loss": 0.433, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5061285495758057, "rewards/margins": 1.4716495275497437, "rewards/rejected": -2.9777779579162598, "step": 5560 }, { "epoch": 4.456, "grad_norm": 19.3261661529541, "learning_rate": 3.391681946686186e-06, "logits/chosen": 0.8373166918754578, "logits/rejected": 0.819493293762207, "logps/chosen": -207.85977172851562, "logps/rejected": -245.46836853027344, "loss": 0.3845, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.96077561378479, "rewards/margins": 1.5056307315826416, "rewards/rejected": -3.4664063453674316, "step": 5570 }, { "epoch": 4.464, "grad_norm": 17.59246253967285, "learning_rate": 3.385156326966173e-06, "logits/chosen": 0.9824124574661255, "logits/rejected": 0.9056622385978699, "logps/chosen": -212.71084594726562, "logps/rejected": -223.310791015625, "loss": 0.3712, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4070976972579956, "rewards/margins": 1.7284908294677734, "rewards/rejected": -3.1355881690979004, "step": 5580 }, { "epoch": 4.4719999999999995, "grad_norm": 10.986981391906738, "learning_rate": 3.3786238046193125e-06, "logits/chosen": 0.9522453546524048, "logits/rejected": 0.8866806030273438, "logps/chosen": -206.303955078125, "logps/rejected": -239.8437042236328, "loss": 0.4034, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8517950773239136, "rewards/margins": 1.48299241065979, "rewards/rejected": -3.3347878456115723, "step": 5590 }, { "epoch": 4.48, "grad_norm": 10.576725006103516, "learning_rate": 3.372084430587528e-06, "logits/chosen": 0.8120132684707642, "logits/rejected": 0.8881155252456665, "logps/chosen": -224.1233367919922, "logps/rejected": -252.2123260498047, "loss": 0.3466, "rewards/accuracies": 0.875, "rewards/chosen": -1.8974634408950806, "rewards/margins": 1.5087515115737915, "rewards/rejected": -3.406214952468872, "step": 5600 }, { "epoch": 4.4879999999999995, "grad_norm": 7.61484432220459, "learning_rate": 3.365538255866169e-06, "logits/chosen": 0.9328517913818359, "logits/rejected": 0.832955539226532, "logps/chosen": -224.76219177246094, "logps/rejected": -263.3474426269531, "loss": 0.3206, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.6335548162460327, "rewards/margins": 2.0621376037597656, "rewards/rejected": -3.695692539215088, "step": 5610 }, { "epoch": 4.496, "grad_norm": 7.165886878967285, "learning_rate": 3.3589853315036227e-06, "logits/chosen": 0.9069485068321228, "logits/rejected": 0.8654775619506836, "logps/chosen": -231.8223876953125, "logps/rejected": -247.6749267578125, "loss": 0.3769, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6844438314437866, "rewards/margins": 1.5118721723556519, "rewards/rejected": -3.1963162422180176, "step": 5620 }, { "epoch": 4.504, "grad_norm": 12.06303596496582, "learning_rate": 3.3524257086009105e-06, "logits/chosen": 0.8854274153709412, "logits/rejected": 0.9237664341926575, "logps/chosen": -219.48692321777344, "logps/rejected": -252.7774200439453, "loss": 0.2831, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.6034126281738281, "rewards/margins": 1.9384158849716187, "rewards/rejected": -3.541828155517578, "step": 5630 }, { "epoch": 4.5120000000000005, "grad_norm": 10.347089767456055, "learning_rate": 3.3458594383112868e-06, "logits/chosen": 0.9743103981018066, "logits/rejected": 0.820307731628418, "logps/chosen": -192.3817138671875, "logps/rejected": -223.5673065185547, "loss": 0.3259, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.055615782737732, "rewards/margins": 1.9324188232421875, "rewards/rejected": -2.98803448677063, "step": 5640 }, { "epoch": 4.52, "grad_norm": 21.758241653442383, "learning_rate": 3.339286571839848e-06, "logits/chosen": 0.8464789390563965, "logits/rejected": 0.8443576693534851, "logps/chosen": -203.8209991455078, "logps/rejected": -246.0260772705078, "loss": 0.3542, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.057892084121704, "rewards/margins": 1.6661615371704102, "rewards/rejected": -2.7240536212921143, "step": 5650 }, { "epoch": 4.5280000000000005, "grad_norm": 15.415779113769531, "learning_rate": 3.332707160443128e-06, "logits/chosen": 0.9635658264160156, "logits/rejected": 0.8357729911804199, "logps/chosen": -222.7996826171875, "logps/rejected": -249.92446899414062, "loss": 0.2987, "rewards/accuracies": 0.875, "rewards/chosen": -1.3085647821426392, "rewards/margins": 1.8709415197372437, "rewards/rejected": -3.179506301879883, "step": 5660 }, { "epoch": 4.536, "grad_norm": 22.9267578125, "learning_rate": 3.3261212554286977e-06, "logits/chosen": 0.8950627446174622, "logits/rejected": 0.8704226613044739, "logps/chosen": -221.8727569580078, "logps/rejected": -254.3755645751953, "loss": 0.4041, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.043503761291504, "rewards/margins": 1.4210357666015625, "rewards/rejected": -3.4645392894744873, "step": 5670 }, { "epoch": 4.5440000000000005, "grad_norm": 31.6077880859375, "learning_rate": 3.319528908154766e-06, "logits/chosen": 0.8877006769180298, "logits/rejected": 0.8138026595115662, "logps/chosen": -252.089111328125, "logps/rejected": -247.1710968017578, "loss": 0.427, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.8856754302978516, "rewards/margins": 1.4032905101776123, "rewards/rejected": -3.288965940475464, "step": 5680 }, { "epoch": 4.552, "grad_norm": 19.946836471557617, "learning_rate": 3.3129301700297834e-06, "logits/chosen": 1.060017704963684, "logits/rejected": 1.0112991333007812, "logps/chosen": -202.8851776123047, "logps/rejected": -234.48912048339844, "loss": 0.3442, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6744213104248047, "rewards/margins": 1.476469874382019, "rewards/rejected": -3.1508913040161133, "step": 5690 }, { "epoch": 4.5600000000000005, "grad_norm": 9.337148666381836, "learning_rate": 3.306325092512034e-06, "logits/chosen": 0.9761505126953125, "logits/rejected": 0.9596638083457947, "logps/chosen": -208.33743286132812, "logps/rejected": -234.8901824951172, "loss": 0.3815, "rewards/accuracies": 0.875, "rewards/chosen": -1.9948211908340454, "rewards/margins": 1.56894850730896, "rewards/rejected": -3.563769578933716, "step": 5700 }, { "epoch": 4.568, "grad_norm": 13.035239219665527, "learning_rate": 3.2997137271092396e-06, "logits/chosen": 0.963702380657196, "logits/rejected": 0.872109055519104, "logps/chosen": -234.18614196777344, "logps/rejected": -254.7106475830078, "loss": 0.3564, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.163374900817871, "rewards/margins": 1.6218290328979492, "rewards/rejected": -3.7852039337158203, "step": 5710 }, { "epoch": 4.576, "grad_norm": 27.116188049316406, "learning_rate": 3.293096125378156e-06, "logits/chosen": 0.92717045545578, "logits/rejected": 0.908390462398529, "logps/chosen": -210.4567413330078, "logps/rejected": -253.227294921875, "loss": 0.3333, "rewards/accuracies": 0.875, "rewards/chosen": -1.5652707815170288, "rewards/margins": 1.8512943983078003, "rewards/rejected": -3.41656494140625, "step": 5720 }, { "epoch": 4.584, "grad_norm": 19.045427322387695, "learning_rate": 3.2864723389241697e-06, "logits/chosen": 0.9135765433311462, "logits/rejected": 0.859271228313446, "logps/chosen": -222.8280487060547, "logps/rejected": -242.25540161132812, "loss": 0.3374, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3668630123138428, "rewards/margins": 1.7538166046142578, "rewards/rejected": -3.1206796169281006, "step": 5730 }, { "epoch": 4.592, "grad_norm": 20.58856773376465, "learning_rate": 3.279842419400899e-06, "logits/chosen": 0.8949125409126282, "logits/rejected": 0.9091690182685852, "logps/chosen": -224.7333984375, "logps/rejected": -265.9559631347656, "loss": 0.4016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5353691577911377, "rewards/margins": 1.6605640649795532, "rewards/rejected": -3.1959331035614014, "step": 5740 }, { "epoch": 4.6, "grad_norm": 15.529658317565918, "learning_rate": 3.2732064185097885e-06, "logits/chosen": 1.0308302640914917, "logits/rejected": 0.9736013412475586, "logps/chosen": -207.3667449951172, "logps/rejected": -235.87290954589844, "loss": 0.3166, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.106737732887268, "rewards/margins": 1.6635363101959229, "rewards/rejected": -2.7702741622924805, "step": 5750 }, { "epoch": 4.608, "grad_norm": 13.790717124938965, "learning_rate": 3.2665643879997054e-06, "logits/chosen": 1.0450878143310547, "logits/rejected": 1.0168274641036987, "logps/chosen": -201.68594360351562, "logps/rejected": -224.7779541015625, "loss": 0.4748, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.329674243927002, "rewards/margins": 1.2952015399932861, "rewards/rejected": -2.624875783920288, "step": 5760 }, { "epoch": 4.616, "grad_norm": 19.334339141845703, "learning_rate": 3.259916379666538e-06, "logits/chosen": 0.9460241198539734, "logits/rejected": 0.902341365814209, "logps/chosen": -206.3498077392578, "logps/rejected": -234.02978515625, "loss": 0.3343, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -0.8321787714958191, "rewards/margins": 1.6867774724960327, "rewards/rejected": -2.518956422805786, "step": 5770 }, { "epoch": 4.624, "grad_norm": 11.740504264831543, "learning_rate": 3.2532624453527904e-06, "logits/chosen": 0.765145480632782, "logits/rejected": 0.8052225112915039, "logps/chosen": -215.4130401611328, "logps/rejected": -230.92808532714844, "loss": 0.3438, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -0.9054020047187805, "rewards/margins": 1.7861785888671875, "rewards/rejected": -2.6915805339813232, "step": 5780 }, { "epoch": 4.632, "grad_norm": 23.415552139282227, "learning_rate": 3.2466026369471804e-06, "logits/chosen": 0.8790031671524048, "logits/rejected": 0.8607792854309082, "logps/chosen": -205.34423828125, "logps/rejected": -228.1710968017578, "loss": 0.4049, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2571852207183838, "rewards/margins": 1.416163682937622, "rewards/rejected": -2.6733486652374268, "step": 5790 }, { "epoch": 4.64, "grad_norm": 17.48906707763672, "learning_rate": 3.2399370063842297e-06, "logits/chosen": 0.865296483039856, "logits/rejected": 0.8317980170249939, "logps/chosen": -233.6914520263672, "logps/rejected": -257.34161376953125, "loss": 0.4092, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9758745431900024, "rewards/margins": 1.5804134607315063, "rewards/rejected": -3.556288242340088, "step": 5800 }, { "epoch": 4.648, "grad_norm": 15.079739570617676, "learning_rate": 3.2332656056438663e-06, "logits/chosen": 0.9580942392349243, "logits/rejected": 0.8425354957580566, "logps/chosen": -206.0723419189453, "logps/rejected": -235.4949493408203, "loss": 0.4092, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.753932237625122, "rewards/margins": 1.4880059957504272, "rewards/rejected": -3.241938352584839, "step": 5810 }, { "epoch": 4.656, "grad_norm": 27.08865737915039, "learning_rate": 3.226588486751012e-06, "logits/chosen": 1.0157322883605957, "logits/rejected": 0.9247297644615173, "logps/chosen": -198.93539428710938, "logps/rejected": -235.2766876220703, "loss": 0.37, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.8956657648086548, "rewards/margins": 1.5729421377182007, "rewards/rejected": -3.4686081409454346, "step": 5820 }, { "epoch": 4.664, "grad_norm": 13.346570014953613, "learning_rate": 3.2199057017751822e-06, "logits/chosen": 1.0085042715072632, "logits/rejected": 0.9250786900520325, "logps/chosen": -216.81689453125, "logps/rejected": -223.42991638183594, "loss": 0.3252, "rewards/accuracies": 0.875, "rewards/chosen": -1.240125060081482, "rewards/margins": 1.7455374002456665, "rewards/rejected": -2.9856624603271484, "step": 5830 }, { "epoch": 4.672, "grad_norm": 13.748871803283691, "learning_rate": 3.2132173028300756e-06, "logits/chosen": 1.0077823400497437, "logits/rejected": 0.9494333267211914, "logps/chosen": -220.22952270507812, "logps/rejected": -239.5417938232422, "loss": 0.4106, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.271531343460083, "rewards/margins": 1.3013349771499634, "rewards/rejected": -2.572866201400757, "step": 5840 }, { "epoch": 4.68, "grad_norm": 11.705482482910156, "learning_rate": 3.2065233420731717e-06, "logits/chosen": 0.9431636929512024, "logits/rejected": 0.945948600769043, "logps/chosen": -211.1238555908203, "logps/rejected": -250.669189453125, "loss": 0.3788, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2726339101791382, "rewards/margins": 1.5259406566619873, "rewards/rejected": -2.798574447631836, "step": 5850 }, { "epoch": 4.688, "grad_norm": 19.19478416442871, "learning_rate": 3.1998238717053202e-06, "logits/chosen": 1.0462414026260376, "logits/rejected": 0.9334519505500793, "logps/chosen": -204.07395935058594, "logps/rejected": -229.44419860839844, "loss": 0.3411, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3902432918548584, "rewards/margins": 1.6846592426300049, "rewards/rejected": -3.0749025344848633, "step": 5860 }, { "epoch": 4.696, "grad_norm": 16.826845169067383, "learning_rate": 3.1931189439703383e-06, "logits/chosen": 0.9771106839179993, "logits/rejected": 0.8640087246894836, "logps/chosen": -211.6616668701172, "logps/rejected": -238.89736938476562, "loss": 0.3763, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.612330436706543, "rewards/margins": 1.4654382467269897, "rewards/rejected": -3.0777688026428223, "step": 5870 }, { "epoch": 4.704, "grad_norm": 18.69582176208496, "learning_rate": 3.186408611154597e-06, "logits/chosen": 0.8807933926582336, "logits/rejected": 0.8436470031738281, "logps/chosen": -199.52122497558594, "logps/rejected": -235.47007751464844, "loss": 0.3434, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1381405591964722, "rewards/margins": 1.7940791845321655, "rewards/rejected": -2.9322197437286377, "step": 5880 }, { "epoch": 4.712, "grad_norm": 13.82207202911377, "learning_rate": 3.1796929255866223e-06, "logits/chosen": 0.908224880695343, "logits/rejected": 0.8031226992607117, "logps/chosen": -226.13247680664062, "logps/rejected": -268.6899108886719, "loss": 0.2751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7007683515548706, "rewards/margins": 2.1727986335754395, "rewards/rejected": -3.8735668659210205, "step": 5890 }, { "epoch": 4.72, "grad_norm": 17.216737747192383, "learning_rate": 3.1729719396366765e-06, "logits/chosen": 0.8856326341629028, "logits/rejected": 0.8279878497123718, "logps/chosen": -211.522216796875, "logps/rejected": -239.03599548339844, "loss": 0.3723, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6209534406661987, "rewards/margins": 1.6722420454025269, "rewards/rejected": -3.2931954860687256, "step": 5900 }, { "epoch": 4.728, "grad_norm": 11.951440811157227, "learning_rate": 3.1662457057163603e-06, "logits/chosen": 0.9104948043823242, "logits/rejected": 0.80853271484375, "logps/chosen": -216.1103057861328, "logps/rejected": -228.3678741455078, "loss": 0.3353, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.3812806606292725, "rewards/margins": 1.7015780210494995, "rewards/rejected": -3.0828585624694824, "step": 5910 }, { "epoch": 4.736, "grad_norm": 18.6155948638916, "learning_rate": 3.1595142762781966e-06, "logits/chosen": 1.020864486694336, "logits/rejected": 0.9599675536155701, "logps/chosen": -213.35317993164062, "logps/rejected": -227.1990203857422, "loss": 0.3485, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3319700956344604, "rewards/margins": 1.7788978815078735, "rewards/rejected": -3.110867738723755, "step": 5920 }, { "epoch": 4.744, "grad_norm": 15.233264923095703, "learning_rate": 3.1527777038152237e-06, "logits/chosen": 0.99336177110672, "logits/rejected": 0.9045869708061218, "logps/chosen": -225.9204559326172, "logps/rejected": -246.9542999267578, "loss": 0.2477, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.912535548210144, "rewards/margins": 2.0346450805664062, "rewards/rejected": -3.947180986404419, "step": 5930 }, { "epoch": 4.752, "grad_norm": 8.648079872131348, "learning_rate": 3.1460360408605866e-06, "logits/chosen": 0.778138279914856, "logits/rejected": 0.8619017004966736, "logps/chosen": -206.7869873046875, "logps/rejected": -250.70361328125, "loss": 0.2783, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.4929323196411133, "rewards/margins": 1.780416488647461, "rewards/rejected": -3.2733490467071533, "step": 5940 }, { "epoch": 4.76, "grad_norm": 13.389790534973145, "learning_rate": 3.1392893399871294e-06, "logits/chosen": 0.8986393213272095, "logits/rejected": 0.8389616012573242, "logps/chosen": -233.67100524902344, "logps/rejected": -222.7420196533203, "loss": 0.427, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.2816507816314697, "rewards/margins": 1.3874958753585815, "rewards/rejected": -2.6691465377807617, "step": 5950 }, { "epoch": 4.768, "grad_norm": 13.679242134094238, "learning_rate": 3.132537653806978e-06, "logits/chosen": 0.9529990553855896, "logits/rejected": 0.9898123741149902, "logps/chosen": -206.0071258544922, "logps/rejected": -235.917236328125, "loss": 0.489, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.56574547290802, "rewards/margins": 1.2233296632766724, "rewards/rejected": -2.7890751361846924, "step": 5960 }, { "epoch": 4.776, "grad_norm": 25.058631896972656, "learning_rate": 3.1257810349711388e-06, "logits/chosen": 1.1274060010910034, "logits/rejected": 0.9430057406425476, "logps/chosen": -229.041259765625, "logps/rejected": -240.43399047851562, "loss": 0.3475, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.2477760314941406, "rewards/margins": 1.6116218566894531, "rewards/rejected": -2.8593976497650146, "step": 5970 }, { "epoch": 4.784, "grad_norm": 26.500873565673828, "learning_rate": 3.1190195361690833e-06, "logits/chosen": 1.0664939880371094, "logits/rejected": 0.9565810561180115, "logps/chosen": -206.330078125, "logps/rejected": -240.9081573486328, "loss": 0.3886, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.549399971961975, "rewards/margins": 1.5753132104873657, "rewards/rejected": -3.1247129440307617, "step": 5980 }, { "epoch": 4.792, "grad_norm": 8.312376976013184, "learning_rate": 3.1122532101283366e-06, "logits/chosen": 0.9260608553886414, "logits/rejected": 0.8606094717979431, "logps/chosen": -221.0975341796875, "logps/rejected": -267.6355285644531, "loss": 0.3165, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3144680261611938, "rewards/margins": 1.7448300123214722, "rewards/rejected": -3.059298038482666, "step": 5990 }, { "epoch": 4.8, "grad_norm": 9.584151268005371, "learning_rate": 3.1054821096140675e-06, "logits/chosen": 1.0778309106826782, "logits/rejected": 0.8670400977134705, "logps/chosen": -201.90811157226562, "logps/rejected": -216.34109497070312, "loss": 0.2893, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7194448709487915, "rewards/margins": 1.7727628946304321, "rewards/rejected": -2.4922077655792236, "step": 6000 }, { "epoch": 4.808, "grad_norm": 10.907683372497559, "learning_rate": 3.0987062874286805e-06, "logits/chosen": 0.895865261554718, "logits/rejected": 0.9385225176811218, "logps/chosen": -206.30361938476562, "logps/rejected": -236.09593200683594, "loss": 0.2717, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7000702023506165, "rewards/margins": 2.0624783039093018, "rewards/rejected": -2.7625484466552734, "step": 6010 }, { "epoch": 4.816, "grad_norm": 20.199901580810547, "learning_rate": 3.0919257964113962e-06, "logits/chosen": 0.9721388220787048, "logits/rejected": 0.9084480404853821, "logps/chosen": -205.85423278808594, "logps/rejected": -226.93325805664062, "loss": 0.4159, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0990513563156128, "rewards/margins": 1.5490108728408813, "rewards/rejected": -2.648062229156494, "step": 6020 }, { "epoch": 4.824, "grad_norm": 13.252838134765625, "learning_rate": 3.085140689437846e-06, "logits/chosen": 1.1252657175064087, "logits/rejected": 1.0671288967132568, "logps/chosen": -192.517333984375, "logps/rejected": -223.1140594482422, "loss": 0.285, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5657386779785156, "rewards/margins": 2.0705223083496094, "rewards/rejected": -2.636261224746704, "step": 6030 }, { "epoch": 4.832, "grad_norm": 16.6810359954834, "learning_rate": 3.0783510194196577e-06, "logits/chosen": 1.0133423805236816, "logits/rejected": 0.9432527422904968, "logps/chosen": -212.550537109375, "logps/rejected": -248.9680633544922, "loss": 0.3551, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.3131626844406128, "rewards/margins": 1.684877634048462, "rewards/rejected": -2.998040199279785, "step": 6040 }, { "epoch": 4.84, "grad_norm": 12.429515838623047, "learning_rate": 3.0715568393040405e-06, "logits/chosen": 0.997822105884552, "logits/rejected": 0.9970841407775879, "logps/chosen": -186.12039184570312, "logps/rejected": -226.0194854736328, "loss": 0.2044, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5419926047325134, "rewards/margins": 2.209963321685791, "rewards/rejected": -2.75195574760437, "step": 6050 }, { "epoch": 4.848, "grad_norm": 10.648442268371582, "learning_rate": 3.0647582020733773e-06, "logits/chosen": 0.8079277873039246, "logits/rejected": 0.8897015452384949, "logps/chosen": -228.3728485107422, "logps/rejected": -263.2770690917969, "loss": 0.2747, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.4080426692962646, "rewards/margins": 2.117032051086426, "rewards/rejected": -3.5250747203826904, "step": 6060 }, { "epoch": 4.856, "grad_norm": 34.41978073120117, "learning_rate": 3.0579551607448064e-06, "logits/chosen": 0.9691902995109558, "logits/rejected": 0.8717222213745117, "logps/chosen": -229.96450805664062, "logps/rejected": -267.5755920410156, "loss": 0.3608, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6460784673690796, "rewards/margins": 1.8111892938613892, "rewards/rejected": -3.4572677612304688, "step": 6070 }, { "epoch": 4.864, "grad_norm": 10.431986808776855, "learning_rate": 3.051147768369811e-06, "logits/chosen": 0.8802177309989929, "logits/rejected": 0.8142924308776855, "logps/chosen": -208.6748046875, "logps/rejected": -240.06729125976562, "loss": 0.2808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5522571802139282, "rewards/margins": 2.0125110149383545, "rewards/rejected": -3.5647683143615723, "step": 6080 }, { "epoch": 4.872, "grad_norm": 15.201266288757324, "learning_rate": 3.0443360780338034e-06, "logits/chosen": 0.8416620492935181, "logits/rejected": 0.8180571794509888, "logps/chosen": -210.61656188964844, "logps/rejected": -238.749267578125, "loss": 0.4195, "rewards/accuracies": 0.7750000357627869, "rewards/chosen": -1.1236116886138916, "rewards/margins": 1.5696989297866821, "rewards/rejected": -2.693310499191284, "step": 6090 }, { "epoch": 4.88, "grad_norm": 13.592103004455566, "learning_rate": 3.0375201428557135e-06, "logits/chosen": 0.9775605201721191, "logits/rejected": 0.9477388262748718, "logps/chosen": -211.87046813964844, "logps/rejected": -223.9567413330078, "loss": 0.3774, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3752092123031616, "rewards/margins": 1.6053650379180908, "rewards/rejected": -2.980574369430542, "step": 6100 }, { "epoch": 4.888, "grad_norm": 18.705928802490234, "learning_rate": 3.0307000159875733e-06, "logits/chosen": 1.0056161880493164, "logits/rejected": 1.0067530870437622, "logps/chosen": -217.2322235107422, "logps/rejected": -247.19590759277344, "loss": 0.3999, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5061392784118652, "rewards/margins": 1.4345601797103882, "rewards/rejected": -2.940699338912964, "step": 6110 }, { "epoch": 4.896, "grad_norm": 25.736547470092773, "learning_rate": 3.0238757506141013e-06, "logits/chosen": 0.9335775375366211, "logits/rejected": 0.8118970990180969, "logps/chosen": -203.6862030029297, "logps/rejected": -248.3121337890625, "loss": 0.3255, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.9543191194534302, "rewards/margins": 2.1874382495880127, "rewards/rejected": -3.1417574882507324, "step": 6120 }, { "epoch": 4.904, "grad_norm": 36.35403823852539, "learning_rate": 3.0170473999522914e-06, "logits/chosen": 0.7437816262245178, "logits/rejected": 0.7528760433197021, "logps/chosen": -203.16519165039062, "logps/rejected": -233.81167602539062, "loss": 0.3915, "rewards/accuracies": 0.875, "rewards/chosen": -1.3550660610198975, "rewards/margins": 1.738660454750061, "rewards/rejected": -3.093726396560669, "step": 6130 }, { "epoch": 4.912, "grad_norm": 21.394716262817383, "learning_rate": 3.010215017250993e-06, "logits/chosen": 0.8748787045478821, "logits/rejected": 0.8038078546524048, "logps/chosen": -190.148681640625, "logps/rejected": -229.1069793701172, "loss": 0.3333, "rewards/accuracies": 0.8375000357627869, "rewards/chosen": -1.0847818851470947, "rewards/margins": 1.952123999595642, "rewards/rejected": -3.0369060039520264, "step": 6140 }, { "epoch": 4.92, "grad_norm": 10.547325134277344, "learning_rate": 3.0033786557904982e-06, "logits/chosen": 0.9673800468444824, "logits/rejected": 0.8164333701133728, "logps/chosen": -215.92520141601562, "logps/rejected": -247.7021942138672, "loss": 0.385, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.3397104740142822, "rewards/margins": 1.9476795196533203, "rewards/rejected": -3.2873897552490234, "step": 6150 }, { "epoch": 4.928, "grad_norm": 8.210875511169434, "learning_rate": 2.996538368882127e-06, "logits/chosen": 1.024520754814148, "logits/rejected": 0.8471878170967102, "logps/chosen": -226.50840759277344, "logps/rejected": -246.0098114013672, "loss": 0.2765, "rewards/accuracies": 0.875, "rewards/chosen": -0.8774771690368652, "rewards/margins": 1.9889500141143799, "rewards/rejected": -2.866427183151245, "step": 6160 }, { "epoch": 4.936, "grad_norm": 11.18367862701416, "learning_rate": 2.9896942098678124e-06, "logits/chosen": 0.9735992550849915, "logits/rejected": 0.981691300868988, "logps/chosen": -192.818359375, "logps/rejected": -243.27273559570312, "loss": 0.3407, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7151866555213928, "rewards/margins": 1.898476481437683, "rewards/rejected": -2.6136631965637207, "step": 6170 }, { "epoch": 4.944, "grad_norm": 22.406173706054688, "learning_rate": 2.982846232119679e-06, "logits/chosen": 1.0046285390853882, "logits/rejected": 0.9979419708251953, "logps/chosen": -216.0959014892578, "logps/rejected": -244.8262176513672, "loss": 0.324, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.145372986793518, "rewards/margins": 1.6784679889678955, "rewards/rejected": -2.823841094970703, "step": 6180 }, { "epoch": 4.952, "grad_norm": 28.244096755981445, "learning_rate": 2.975994489039634e-06, "logits/chosen": 0.9947078824043274, "logits/rejected": 0.91530841588974, "logps/chosen": -208.3612823486328, "logps/rejected": -234.4157257080078, "loss": 0.3708, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6478198766708374, "rewards/margins": 1.7686901092529297, "rewards/rejected": -3.4165101051330566, "step": 6190 }, { "epoch": 4.96, "grad_norm": 18.052692413330078, "learning_rate": 2.9691390340589467e-06, "logits/chosen": 1.0676629543304443, "logits/rejected": 0.9271091818809509, "logps/chosen": -199.14598083496094, "logps/rejected": -222.794921875, "loss": 0.2456, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8989327549934387, "rewards/margins": 2.075803518295288, "rewards/rejected": -2.974736213684082, "step": 6200 }, { "epoch": 4.968, "grad_norm": 10.866455078125, "learning_rate": 2.9622799206378306e-06, "logits/chosen": 1.0321582555770874, "logits/rejected": 0.9904826283454895, "logps/chosen": -201.66383361816406, "logps/rejected": -241.1051788330078, "loss": 0.3023, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.0962049961090088, "rewards/margins": 1.7505791187286377, "rewards/rejected": -2.8467838764190674, "step": 6210 }, { "epoch": 4.976, "grad_norm": 14.606306076049805, "learning_rate": 2.955417202265032e-06, "logits/chosen": 1.0466164350509644, "logits/rejected": 0.9499098062515259, "logps/chosen": -206.0836639404297, "logps/rejected": -229.9392852783203, "loss": 0.3209, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0380445718765259, "rewards/margins": 1.6832345724105835, "rewards/rejected": -2.7212791442871094, "step": 6220 }, { "epoch": 4.984, "grad_norm": 27.189558029174805, "learning_rate": 2.948550932457407e-06, "logits/chosen": 0.9810224771499634, "logits/rejected": 0.8776025176048279, "logps/chosen": -220.196044921875, "logps/rejected": -241.2982940673828, "loss": 0.344, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.4501808881759644, "rewards/margins": 1.7209481000900269, "rewards/rejected": -3.171128988265991, "step": 6230 }, { "epoch": 4.992, "grad_norm": 13.859283447265625, "learning_rate": 2.9416811647595052e-06, "logits/chosen": 1.027130365371704, "logits/rejected": 0.9805015921592712, "logps/chosen": -207.3785858154297, "logps/rejected": -245.68896484375, "loss": 0.2176, "rewards/accuracies": 0.9625000357627869, "rewards/chosen": -1.786203384399414, "rewards/margins": 2.202639579772949, "rewards/rejected": -3.9888432025909424, "step": 6240 }, { "epoch": 5.0, "grad_norm": 27.184709548950195, "learning_rate": 2.9348079527431565e-06, "logits/chosen": 0.9774474501609802, "logits/rejected": 0.830977737903595, "logps/chosen": -207.9765625, "logps/rejected": -246.947021484375, "loss": 0.2871, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9358631372451782, "rewards/margins": 2.2705037593841553, "rewards/rejected": -4.206367492675781, "step": 6250 }, { "epoch": 5.008, "grad_norm": 9.864638328552246, "learning_rate": 2.927931350007048e-06, "logits/chosen": 0.9456281661987305, "logits/rejected": 0.8694238662719727, "logps/chosen": -221.7121124267578, "logps/rejected": -256.9190979003906, "loss": 0.2611, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0195796489715576, "rewards/margins": 2.1393039226531982, "rewards/rejected": -4.158883571624756, "step": 6260 }, { "epoch": 5.016, "grad_norm": 8.061872482299805, "learning_rate": 2.9210514101763116e-06, "logits/chosen": 0.9081168174743652, "logits/rejected": 0.8007159233093262, "logps/chosen": -208.5377197265625, "logps/rejected": -260.0176696777344, "loss": 0.2097, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.110689878463745, "rewards/margins": 2.2806320190429688, "rewards/rejected": -4.391321659088135, "step": 6270 }, { "epoch": 5.024, "grad_norm": 13.147801399230957, "learning_rate": 2.9141681869020973e-06, "logits/chosen": 0.8848690390586853, "logits/rejected": 0.9038190245628357, "logps/chosen": -234.75083923339844, "logps/rejected": -270.18408203125, "loss": 0.2374, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0579333305358887, "rewards/margins": 2.2107951641082764, "rewards/rejected": -4.268728733062744, "step": 6280 }, { "epoch": 5.032, "grad_norm": 22.0900936126709, "learning_rate": 2.907281733861164e-06, "logits/chosen": 0.8925933241844177, "logits/rejected": 0.8973062634468079, "logps/chosen": -206.32763671875, "logps/rejected": -228.326904296875, "loss": 0.3404, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4018090963363647, "rewards/margins": 1.735507607460022, "rewards/rejected": -3.1373164653778076, "step": 6290 }, { "epoch": 5.04, "grad_norm": 8.617568016052246, "learning_rate": 2.900392104755455e-06, "logits/chosen": 0.8251405954360962, "logits/rejected": 0.7466452121734619, "logps/chosen": -231.0389862060547, "logps/rejected": -279.979248046875, "loss": 0.261, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.3339290618896484, "rewards/margins": 2.2454123497009277, "rewards/rejected": -4.579341411590576, "step": 6300 }, { "epoch": 5.048, "grad_norm": 10.349010467529297, "learning_rate": 2.8934993533116827e-06, "logits/chosen": 0.8101547360420227, "logits/rejected": 0.8393245935440063, "logps/chosen": -218.5235137939453, "logps/rejected": -276.7334899902344, "loss": 0.2239, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8703062534332275, "rewards/margins": 2.4195773601531982, "rewards/rejected": -4.289883613586426, "step": 6310 }, { "epoch": 5.056, "grad_norm": 6.812648296356201, "learning_rate": 2.8866035332809083e-06, "logits/chosen": 0.9793133735656738, "logits/rejected": 0.9104774594306946, "logps/chosen": -209.8477783203125, "logps/rejected": -235.04031372070312, "loss": 0.2581, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.3933300971984863, "rewards/margins": 2.483262777328491, "rewards/rejected": -3.8765931129455566, "step": 6320 }, { "epoch": 5.064, "grad_norm": 13.762728691101074, "learning_rate": 2.879704698438121e-06, "logits/chosen": 0.9087037444114685, "logits/rejected": 0.890673816204071, "logps/chosen": -208.9591064453125, "logps/rejected": -263.70452880859375, "loss": 0.2147, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8164809942245483, "rewards/margins": 2.4334583282470703, "rewards/rejected": -4.24993896484375, "step": 6330 }, { "epoch": 5.072, "grad_norm": 27.724454879760742, "learning_rate": 2.8728029025818206e-06, "logits/chosen": 0.9317230582237244, "logits/rejected": 0.7435727119445801, "logps/chosen": -212.3452911376953, "logps/rejected": -262.7144470214844, "loss": 0.2844, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7303894758224487, "rewards/margins": 2.263822317123413, "rewards/rejected": -3.9942119121551514, "step": 6340 }, { "epoch": 5.08, "grad_norm": 23.111251831054688, "learning_rate": 2.865898199533597e-06, "logits/chosen": 0.9201246500015259, "logits/rejected": 0.8896446228027344, "logps/chosen": -209.3739013671875, "logps/rejected": -240.9625244140625, "loss": 0.2962, "rewards/accuracies": 0.875, "rewards/chosen": -1.9742488861083984, "rewards/margins": 1.8627338409423828, "rewards/rejected": -3.8369832038879395, "step": 6350 }, { "epoch": 5.088, "grad_norm": 24.608367919921875, "learning_rate": 2.8589906431377133e-06, "logits/chosen": 1.0965173244476318, "logits/rejected": 0.9428312182426453, "logps/chosen": -223.93748474121094, "logps/rejected": -240.9108428955078, "loss": 0.2826, "rewards/accuracies": 0.875, "rewards/chosen": -1.867902398109436, "rewards/margins": 2.019596815109253, "rewards/rejected": -3.8874988555908203, "step": 6360 }, { "epoch": 5.096, "grad_norm": 14.740994453430176, "learning_rate": 2.8520802872606803e-06, "logits/chosen": 0.9774947166442871, "logits/rejected": 0.9799107909202576, "logps/chosen": -204.7219696044922, "logps/rejected": -231.45262145996094, "loss": 0.3067, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.5889536142349243, "rewards/margins": 2.0691580772399902, "rewards/rejected": -3.658111572265625, "step": 6370 }, { "epoch": 5.104, "grad_norm": 9.468008995056152, "learning_rate": 2.8451671857908414e-06, "logits/chosen": 0.9921843409538269, "logits/rejected": 0.9002591371536255, "logps/chosen": -198.7250213623047, "logps/rejected": -225.64834594726562, "loss": 0.2472, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3202179670333862, "rewards/margins": 2.2721245288848877, "rewards/rejected": -3.5923423767089844, "step": 6380 }, { "epoch": 5.112, "grad_norm": 14.54666805267334, "learning_rate": 2.8382513926379508e-06, "logits/chosen": 0.9203723073005676, "logits/rejected": 0.8950970768928528, "logps/chosen": -196.84657287597656, "logps/rejected": -230.43809509277344, "loss": 0.2501, "rewards/accuracies": 0.9625000357627869, "rewards/chosen": -1.5130068063735962, "rewards/margins": 2.144577741622925, "rewards/rejected": -3.6575844287872314, "step": 6390 }, { "epoch": 5.12, "grad_norm": 27.838401794433594, "learning_rate": 2.831332961732754e-06, "logits/chosen": 1.0219676494598389, "logits/rejected": 0.9032443165779114, "logps/chosen": -212.4551544189453, "logps/rejected": -241.2015838623047, "loss": 0.2542, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7502228021621704, "rewards/margins": 2.144639015197754, "rewards/rejected": -3.894861936569214, "step": 6400 }, { "epoch": 5.128, "grad_norm": 23.901247024536133, "learning_rate": 2.8244119470265628e-06, "logits/chosen": 0.9221467971801758, "logits/rejected": 0.844575822353363, "logps/chosen": -216.648681640625, "logps/rejected": -226.3651123046875, "loss": 0.3867, "rewards/accuracies": 0.875, "rewards/chosen": -1.8910839557647705, "rewards/margins": 2.042074203491211, "rewards/rejected": -3.9331586360931396, "step": 6410 }, { "epoch": 5.136, "grad_norm": 28.365636825561523, "learning_rate": 2.817488402490841e-06, "logits/chosen": 0.9146007895469666, "logits/rejected": 0.9036192297935486, "logps/chosen": -217.9712371826172, "logps/rejected": -246.8628387451172, "loss": 0.2528, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.148552417755127, "rewards/margins": 2.226809024810791, "rewards/rejected": -4.375361919403076, "step": 6420 }, { "epoch": 5.144, "grad_norm": 11.069653511047363, "learning_rate": 2.8105623821167804e-06, "logits/chosen": 0.9730092883110046, "logits/rejected": 0.8130871057510376, "logps/chosen": -223.188720703125, "logps/rejected": -259.5501403808594, "loss": 0.2751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.180257558822632, "rewards/margins": 2.317899465560913, "rewards/rejected": -4.498157024383545, "step": 6430 }, { "epoch": 5.152, "grad_norm": 4.842264652252197, "learning_rate": 2.8036339399148783e-06, "logits/chosen": 0.8939194083213806, "logits/rejected": 0.8858678936958313, "logps/chosen": -210.72207641601562, "logps/rejected": -263.359375, "loss": 0.2706, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.6614614725112915, "rewards/margins": 2.150581121444702, "rewards/rejected": -3.812042713165283, "step": 6440 }, { "epoch": 5.16, "grad_norm": 15.502528190612793, "learning_rate": 2.796703129914519e-06, "logits/chosen": 0.9737855792045593, "logits/rejected": 0.8153099417686462, "logps/chosen": -200.01934814453125, "logps/rejected": -228.4278106689453, "loss": 0.2503, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3171756267547607, "rewards/margins": 2.079169988632202, "rewards/rejected": -3.396345615386963, "step": 6450 }, { "epoch": 5.168, "grad_norm": 13.45697021484375, "learning_rate": 2.7897700061635517e-06, "logits/chosen": 0.9658388495445251, "logits/rejected": 0.8410293459892273, "logps/chosen": -197.5312957763672, "logps/rejected": -234.99929809570312, "loss": 0.2299, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5396157503128052, "rewards/margins": 2.2552897930145264, "rewards/rejected": -3.794905424118042, "step": 6460 }, { "epoch": 5.176, "grad_norm": 21.632190704345703, "learning_rate": 2.7828346227278676e-06, "logits/chosen": 0.8654441833496094, "logits/rejected": 0.8274869322776794, "logps/chosen": -220.19541931152344, "logps/rejected": -281.8578796386719, "loss": 0.3081, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2533280849456787, "rewards/margins": 2.436194658279419, "rewards/rejected": -4.689522743225098, "step": 6470 }, { "epoch": 5.184, "grad_norm": 23.60213851928711, "learning_rate": 2.7758970336909795e-06, "logits/chosen": 0.9322023391723633, "logits/rejected": 0.8540251851081848, "logps/chosen": -226.8634796142578, "logps/rejected": -249.7890625, "loss": 0.3004, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6639741659164429, "rewards/margins": 2.1177260875701904, "rewards/rejected": -3.781700611114502, "step": 6480 }, { "epoch": 5.192, "grad_norm": 8.637389183044434, "learning_rate": 2.768957293153602e-06, "logits/chosen": 1.001732587814331, "logits/rejected": 1.0790340900421143, "logps/chosen": -212.6641845703125, "logps/rejected": -248.3900604248047, "loss": 0.2027, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9972444772720337, "rewards/margins": 2.5673165321350098, "rewards/rejected": -4.564560890197754, "step": 6490 }, { "epoch": 5.2, "grad_norm": 17.306949615478516, "learning_rate": 2.7620154552332236e-06, "logits/chosen": 1.0369716882705688, "logits/rejected": 0.8845782279968262, "logps/chosen": -210.2711944580078, "logps/rejected": -248.135009765625, "loss": 0.2221, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.953507423400879, "rewards/margins": 2.378849506378174, "rewards/rejected": -4.332356929779053, "step": 6500 }, { "epoch": 5.208, "grad_norm": 20.987321853637695, "learning_rate": 2.755071574063692e-06, "logits/chosen": 0.9889179468154907, "logits/rejected": 0.9481157660484314, "logps/chosen": -209.94509887695312, "logps/rejected": -241.16763305664062, "loss": 0.192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9956191778182983, "rewards/margins": 2.235707998275757, "rewards/rejected": -4.231327056884766, "step": 6510 }, { "epoch": 5.216, "grad_norm": 22.150367736816406, "learning_rate": 2.7481257037947873e-06, "logits/chosen": 1.0156019926071167, "logits/rejected": 0.8769356608390808, "logps/chosen": -219.45596313476562, "logps/rejected": -244.5421905517578, "loss": 0.2559, "rewards/accuracies": 0.875, "rewards/chosen": -1.335376262664795, "rewards/margins": 2.1717984676361084, "rewards/rejected": -3.5071747303009033, "step": 6520 }, { "epoch": 5.224, "grad_norm": 24.29526138305664, "learning_rate": 2.741177898591801e-06, "logits/chosen": 1.0322328805923462, "logits/rejected": 0.9097422957420349, "logps/chosen": -215.4120635986328, "logps/rejected": -245.8870849609375, "loss": 0.2705, "rewards/accuracies": 0.875, "rewards/chosen": -2.2170920372009277, "rewards/margins": 2.0834217071533203, "rewards/rejected": -4.300513744354248, "step": 6530 }, { "epoch": 5.232, "grad_norm": 11.549744606018066, "learning_rate": 2.7342282126351145e-06, "logits/chosen": 1.0487924814224243, "logits/rejected": 0.9894416928291321, "logps/chosen": -229.5109405517578, "logps/rejected": -263.1656494140625, "loss": 0.2757, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.495918035507202, "rewards/margins": 2.1933634281158447, "rewards/rejected": -4.689281463623047, "step": 6540 }, { "epoch": 5.24, "grad_norm": 11.867500305175781, "learning_rate": 2.727276700119774e-06, "logits/chosen": 1.0762747526168823, "logits/rejected": 0.9830428957939148, "logps/chosen": -216.1927490234375, "logps/rejected": -268.2547302246094, "loss": 0.3535, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -2.7991743087768555, "rewards/margins": 2.384692430496216, "rewards/rejected": -5.183866500854492, "step": 6550 }, { "epoch": 5.248, "grad_norm": 12.038691520690918, "learning_rate": 2.720323415255071e-06, "logits/chosen": 1.0917648077011108, "logits/rejected": 1.0066547393798828, "logps/chosen": -216.46788024902344, "logps/rejected": -242.0109405517578, "loss": 0.2506, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -2.0974338054656982, "rewards/margins": 2.069669723510742, "rewards/rejected": -4.1671037673950195, "step": 6560 }, { "epoch": 5.256, "grad_norm": 16.80321502685547, "learning_rate": 2.713368412264118e-06, "logits/chosen": 0.9983375668525696, "logits/rejected": 0.8874215483665466, "logps/chosen": -205.94091796875, "logps/rejected": -261.5921936035156, "loss": 0.2188, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.691676139831543, "rewards/margins": 2.6781671047210693, "rewards/rejected": -4.369843482971191, "step": 6570 }, { "epoch": 5.264, "grad_norm": 45.896324157714844, "learning_rate": 2.7064117453834245e-06, "logits/chosen": 1.0483051538467407, "logits/rejected": 0.9006147384643555, "logps/chosen": -196.8784637451172, "logps/rejected": -240.83731079101562, "loss": 0.2838, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.3554176092147827, "rewards/margins": 2.4722535610198975, "rewards/rejected": -3.8276710510253906, "step": 6580 }, { "epoch": 5.272, "grad_norm": 52.870967864990234, "learning_rate": 2.699453468862477e-06, "logits/chosen": 0.8586319088935852, "logits/rejected": 0.8957145810127258, "logps/chosen": -202.1999053955078, "logps/rejected": -247.98497009277344, "loss": 0.3508, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.5604242086410522, "rewards/margins": 2.2796757221221924, "rewards/rejected": -3.840100049972534, "step": 6590 }, { "epoch": 5.28, "grad_norm": 15.097871780395508, "learning_rate": 2.6924936369633126e-06, "logits/chosen": 0.9903114438056946, "logits/rejected": 0.8972232937812805, "logps/chosen": -220.2637176513672, "logps/rejected": -246.101318359375, "loss": 0.2577, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.400626540184021, "rewards/margins": 2.2727909088134766, "rewards/rejected": -3.673417806625366, "step": 6600 }, { "epoch": 5.288, "grad_norm": 30.0377140045166, "learning_rate": 2.6855323039601e-06, "logits/chosen": 0.9595162272453308, "logits/rejected": 0.9324502944946289, "logps/chosen": -217.57749938964844, "logps/rejected": -249.253662109375, "loss": 0.2962, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.08235502243042, "rewards/margins": 2.116223096847534, "rewards/rejected": -4.198577880859375, "step": 6610 }, { "epoch": 5.296, "grad_norm": 35.225372314453125, "learning_rate": 2.678569524138711e-06, "logits/chosen": 1.0795904397964478, "logits/rejected": 0.9872816205024719, "logps/chosen": -217.6615753173828, "logps/rejected": -247.40611267089844, "loss": 0.2785, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6414073705673218, "rewards/margins": 2.288874387741089, "rewards/rejected": -3.9302818775177, "step": 6620 }, { "epoch": 5.304, "grad_norm": 19.5015926361084, "learning_rate": 2.671605351796302e-06, "logits/chosen": 1.0191433429718018, "logits/rejected": 1.0426959991455078, "logps/chosen": -215.02542114257812, "logps/rejected": -250.69419860839844, "loss": 0.2692, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4523265361785889, "rewards/margins": 2.202432632446289, "rewards/rejected": -3.654759168624878, "step": 6630 }, { "epoch": 5.312, "grad_norm": 13.902155876159668, "learning_rate": 2.664639841240888e-06, "logits/chosen": 1.0611361265182495, "logits/rejected": 1.0298746824264526, "logps/chosen": -210.43655395507812, "logps/rejected": -242.15591430664062, "loss": 0.2278, "rewards/accuracies": 0.9625000357627869, "rewards/chosen": -1.0366039276123047, "rewards/margins": 2.3139262199401855, "rewards/rejected": -3.3505303859710693, "step": 6640 }, { "epoch": 5.32, "grad_norm": 19.404386520385742, "learning_rate": 2.6576730467909202e-06, "logits/chosen": 0.9389854669570923, "logits/rejected": 0.8830668330192566, "logps/chosen": -211.813720703125, "logps/rejected": -255.224365234375, "loss": 0.2227, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.772042155265808, "rewards/margins": 2.3123362064361572, "rewards/rejected": -4.084378719329834, "step": 6650 }, { "epoch": 5.328, "grad_norm": 19.177539825439453, "learning_rate": 2.6507050227748595e-06, "logits/chosen": 0.9689971804618835, "logits/rejected": 0.8664621710777283, "logps/chosen": -218.7672882080078, "logps/rejected": -242.08872985839844, "loss": 0.2906, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9733574390411377, "rewards/margins": 2.0987534523010254, "rewards/rejected": -4.072110652923584, "step": 6660 }, { "epoch": 5.336, "grad_norm": 19.155794143676758, "learning_rate": 2.6437358235307574e-06, "logits/chosen": 0.9650457501411438, "logits/rejected": 0.9216805696487427, "logps/chosen": -223.25161743164062, "logps/rejected": -267.2557373046875, "loss": 0.2284, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.293266773223877, "rewards/margins": 2.462886095046997, "rewards/rejected": -4.756152629852295, "step": 6670 }, { "epoch": 5.344, "grad_norm": 39.01654815673828, "learning_rate": 2.6367655034058302e-06, "logits/chosen": 0.9830673336982727, "logits/rejected": 0.9171612858772278, "logps/chosen": -213.6517791748047, "logps/rejected": -247.17857360839844, "loss": 0.1726, "rewards/accuracies": 0.9625000357627869, "rewards/chosen": -1.5426089763641357, "rewards/margins": 2.6942059993743896, "rewards/rejected": -4.236814975738525, "step": 6680 }, { "epoch": 5.352, "grad_norm": 18.91624641418457, "learning_rate": 2.629794116756035e-06, "logits/chosen": 1.1095460653305054, "logits/rejected": 1.073819875717163, "logps/chosen": -196.78050231933594, "logps/rejected": -221.8975067138672, "loss": 0.2172, "rewards/accuracies": 0.9375, "rewards/chosen": -1.370259165763855, "rewards/margins": 2.258117198944092, "rewards/rejected": -3.628376007080078, "step": 6690 }, { "epoch": 5.36, "grad_norm": 10.549464225769043, "learning_rate": 2.6228217179456433e-06, "logits/chosen": 0.9206059575080872, "logits/rejected": 0.8385736346244812, "logps/chosen": -229.06982421875, "logps/rejected": -266.5553894042969, "loss": 0.2318, "rewards/accuracies": 0.9375, "rewards/chosen": -2.381563663482666, "rewards/margins": 2.3572521209716797, "rewards/rejected": -4.7388153076171875, "step": 6700 }, { "epoch": 5.368, "grad_norm": 18.678146362304688, "learning_rate": 2.6158483613468227e-06, "logits/chosen": 0.8933264017105103, "logits/rejected": 0.9182813763618469, "logps/chosen": -246.1497039794922, "logps/rejected": -265.3963317871094, "loss": 0.2916, "rewards/accuracies": 0.875, "rewards/chosen": -2.833988666534424, "rewards/margins": 2.2055633068084717, "rewards/rejected": -5.039552211761475, "step": 6710 }, { "epoch": 5.376, "grad_norm": 15.417664527893066, "learning_rate": 2.60887410133921e-06, "logits/chosen": 1.0603265762329102, "logits/rejected": 0.9646215438842773, "logps/chosen": -213.02725219726562, "logps/rejected": -248.04296875, "loss": 0.1764, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2449893951416016, "rewards/margins": 2.7219924926757812, "rewards/rejected": -4.966982364654541, "step": 6720 }, { "epoch": 5.384, "grad_norm": 16.097373962402344, "learning_rate": 2.6018989923094827e-06, "logits/chosen": 1.0397377014160156, "logits/rejected": 1.0830873250961304, "logps/chosen": -216.1459197998047, "logps/rejected": -231.951904296875, "loss": 0.2511, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.4918582439422607, "rewards/margins": 2.454404354095459, "rewards/rejected": -4.946262359619141, "step": 6730 }, { "epoch": 5.392, "grad_norm": 15.579238891601562, "learning_rate": 2.594923088650946e-06, "logits/chosen": 1.000435709953308, "logits/rejected": 1.0281956195831299, "logps/chosen": -225.9973907470703, "logps/rejected": -261.2825012207031, "loss": 0.2612, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.4454987049102783, "rewards/margins": 2.452462911605835, "rewards/rejected": -3.897961378097534, "step": 6740 }, { "epoch": 5.4, "grad_norm": 23.80678367614746, "learning_rate": 2.5879464447630947e-06, "logits/chosen": 0.9257308840751648, "logits/rejected": 0.9335107207298279, "logps/chosen": -205.0893096923828, "logps/rejected": -268.3067932128906, "loss": 0.2616, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -1.4560121297836304, "rewards/margins": 2.8784759044647217, "rewards/rejected": -4.3344879150390625, "step": 6750 }, { "epoch": 5.408, "grad_norm": 14.877538681030273, "learning_rate": 2.5809691150512013e-06, "logits/chosen": 1.0292404890060425, "logits/rejected": 0.8708595633506775, "logps/chosen": -230.5414581298828, "logps/rejected": -267.4679870605469, "loss": 0.2617, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.2800981998443604, "rewards/margins": 2.2236454486846924, "rewards/rejected": -4.503743648529053, "step": 6760 }, { "epoch": 5.416, "grad_norm": 30.09552764892578, "learning_rate": 2.573991153925883e-06, "logits/chosen": 0.9744608998298645, "logits/rejected": 0.9204347729682922, "logps/chosen": -230.64749145507812, "logps/rejected": -238.489990234375, "loss": 0.2917, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.851356863975525, "rewards/margins": 2.2804553508758545, "rewards/rejected": -4.131812572479248, "step": 6770 }, { "epoch": 5.424, "grad_norm": 9.124749183654785, "learning_rate": 2.5670126158026843e-06, "logits/chosen": 0.9641228914260864, "logits/rejected": 0.9122328758239746, "logps/chosen": -209.06993103027344, "logps/rejected": -239.84507751464844, "loss": 0.1824, "rewards/accuracies": 0.9625000357627869, "rewards/chosen": -1.1467351913452148, "rewards/margins": 2.435342788696289, "rewards/rejected": -3.582077741622925, "step": 6780 }, { "epoch": 5.432, "grad_norm": 13.851131439208984, "learning_rate": 2.5600335551016447e-06, "logits/chosen": 1.2253344058990479, "logits/rejected": 1.1556156873703003, "logps/chosen": -200.3877410888672, "logps/rejected": -242.9773712158203, "loss": 0.2018, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4336289167404175, "rewards/margins": 2.616694450378418, "rewards/rejected": -4.050323486328125, "step": 6790 }, { "epoch": 5.44, "grad_norm": 15.476871490478516, "learning_rate": 2.553054026246884e-06, "logits/chosen": 1.0492868423461914, "logits/rejected": 0.955453097820282, "logps/chosen": -229.8250732421875, "logps/rejected": -254.77793884277344, "loss": 0.2157, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6499685049057007, "rewards/margins": 2.502528429031372, "rewards/rejected": -4.152497291564941, "step": 6800 }, { "epoch": 5.448, "grad_norm": 17.754188537597656, "learning_rate": 2.546074083666169e-06, "logits/chosen": 1.1146057844161987, "logits/rejected": 1.0327948331832886, "logps/chosen": -196.68882751464844, "logps/rejected": -219.73692321777344, "loss": 0.2566, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2834064960479736, "rewards/margins": 2.399442434310913, "rewards/rejected": -3.6828484535217285, "step": 6810 }, { "epoch": 5.456, "grad_norm": 14.777560234069824, "learning_rate": 2.539093781790494e-06, "logits/chosen": 1.1234928369522095, "logits/rejected": 0.8710097670555115, "logps/chosen": -229.3218231201172, "logps/rejected": -263.53204345703125, "loss": 0.3007, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -2.53562593460083, "rewards/margins": 2.2426531314849854, "rewards/rejected": -4.778278827667236, "step": 6820 }, { "epoch": 5.464, "grad_norm": 11.484394073486328, "learning_rate": 2.5321131750536548e-06, "logits/chosen": 1.0820449590682983, "logits/rejected": 1.0439467430114746, "logps/chosen": -210.1967315673828, "logps/rejected": -252.2407684326172, "loss": 0.2437, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7417097091674805, "rewards/margins": 2.493793487548828, "rewards/rejected": -4.235503673553467, "step": 6830 }, { "epoch": 5.4719999999999995, "grad_norm": 7.133698463439941, "learning_rate": 2.525132317891827e-06, "logits/chosen": 0.9405183792114258, "logits/rejected": 0.7907823920249939, "logps/chosen": -223.71621704101562, "logps/rejected": -250.6181182861328, "loss": 0.309, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -2.273303747177124, "rewards/margins": 2.0772006511688232, "rewards/rejected": -4.350503444671631, "step": 6840 }, { "epoch": 5.48, "grad_norm": 36.052452087402344, "learning_rate": 2.518151264743135e-06, "logits/chosen": 1.0576660633087158, "logits/rejected": 0.9024376273155212, "logps/chosen": -219.67166137695312, "logps/rejected": -256.7233581542969, "loss": 0.2635, "rewards/accuracies": 0.875, "rewards/chosen": -2.318809986114502, "rewards/margins": 2.248960494995117, "rewards/rejected": -4.567770481109619, "step": 6850 }, { "epoch": 5.4879999999999995, "grad_norm": 22.618186950683594, "learning_rate": 2.5111700700472346e-06, "logits/chosen": 1.0195873975753784, "logits/rejected": 0.735422670841217, "logps/chosen": -220.07212829589844, "logps/rejected": -274.3692932128906, "loss": 0.3001, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -2.25004243850708, "rewards/margins": 2.4187867641448975, "rewards/rejected": -4.668828964233398, "step": 6860 }, { "epoch": 5.496, "grad_norm": 14.920022964477539, "learning_rate": 2.5041887882448845e-06, "logits/chosen": 0.9542160034179688, "logits/rejected": 0.9047889709472656, "logps/chosen": -217.98045349121094, "logps/rejected": -259.5525817871094, "loss": 0.2509, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.3205878734588623, "rewards/margins": 2.485825777053833, "rewards/rejected": -4.806413173675537, "step": 6870 }, { "epoch": 5.504, "grad_norm": 27.746845245361328, "learning_rate": 2.4972074737775215e-06, "logits/chosen": 0.8094510436058044, "logits/rejected": 0.820503830909729, "logps/chosen": -235.88784790039062, "logps/rejected": -269.9020690917969, "loss": 0.2144, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.0258710384368896, "rewards/margins": 2.428048610687256, "rewards/rejected": -5.453919410705566, "step": 6880 }, { "epoch": 5.5120000000000005, "grad_norm": 38.07632064819336, "learning_rate": 2.490226181086838e-06, "logits/chosen": 1.0112783908843994, "logits/rejected": 0.9033260345458984, "logps/chosen": -205.05882263183594, "logps/rejected": -243.261962890625, "loss": 0.2399, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0925614833831787, "rewards/margins": 2.7473719120025635, "rewards/rejected": -4.839933395385742, "step": 6890 }, { "epoch": 5.52, "grad_norm": 35.062713623046875, "learning_rate": 2.4832449646143605e-06, "logits/chosen": 1.0022151470184326, "logits/rejected": 0.9039661288261414, "logps/chosen": -211.7960968017578, "logps/rejected": -265.2066345214844, "loss": 0.3377, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.1607472896575928, "rewards/margins": 2.2616782188415527, "rewards/rejected": -4.422425746917725, "step": 6900 }, { "epoch": 5.5280000000000005, "grad_norm": 21.59208869934082, "learning_rate": 2.4762638788010123e-06, "logits/chosen": 0.8795046210289001, "logits/rejected": 0.8476438522338867, "logps/chosen": -229.28501892089844, "logps/rejected": -292.383544921875, "loss": 0.283, "rewards/accuracies": 0.9000000357627869, "rewards/chosen": -2.4365899562835693, "rewards/margins": 2.430114984512329, "rewards/rejected": -4.86670446395874, "step": 6910 }, { "epoch": 5.536, "grad_norm": 33.235862731933594, "learning_rate": 2.4692829780867066e-06, "logits/chosen": 0.9138635993003845, "logits/rejected": 0.8161695599555969, "logps/chosen": -233.41311645507812, "logps/rejected": -261.68975830078125, "loss": 0.2933, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.5699150562286377, "rewards/margins": 2.5199315547943115, "rewards/rejected": -4.089846611022949, "step": 6920 }, { "epoch": 5.5440000000000005, "grad_norm": 28.37091064453125, "learning_rate": 2.4623023169099074e-06, "logits/chosen": 1.0121078491210938, "logits/rejected": 0.8701547980308533, "logps/chosen": -226.9220733642578, "logps/rejected": -260.5491638183594, "loss": 0.2636, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.4063045978546143, "rewards/margins": 2.338066577911377, "rewards/rejected": -4.744370937347412, "step": 6930 }, { "epoch": 5.552, "grad_norm": 17.56963348388672, "learning_rate": 2.4553219497072144e-06, "logits/chosen": 0.9864139556884766, "logits/rejected": 0.9286383986473083, "logps/chosen": -224.4748992919922, "logps/rejected": -247.63027954101562, "loss": 0.2675, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.1812076568603516, "rewards/margins": 2.474334716796875, "rewards/rejected": -4.655541896820068, "step": 6940 }, { "epoch": 5.5600000000000005, "grad_norm": 37.597293853759766, "learning_rate": 2.4483419309129315e-06, "logits/chosen": 0.8857519030570984, "logits/rejected": 0.9429466128349304, "logps/chosen": -202.9930877685547, "logps/rejected": -244.86172485351562, "loss": 0.2741, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7118009328842163, "rewards/margins": 2.3302266597747803, "rewards/rejected": -4.042027473449707, "step": 6950 }, { "epoch": 5.568, "grad_norm": 22.82415008544922, "learning_rate": 2.441362314958649e-06, "logits/chosen": 1.015781283378601, "logits/rejected": 0.8939239382743835, "logps/chosen": -197.0980987548828, "logps/rejected": -247.9208526611328, "loss": 0.2189, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4986218214035034, "rewards/margins": 2.6701772212982178, "rewards/rejected": -4.168798923492432, "step": 6960 }, { "epoch": 5.576, "grad_norm": 30.6041259765625, "learning_rate": 2.4343831562728135e-06, "logits/chosen": 0.8862255215644836, "logits/rejected": 0.8734444975852966, "logps/chosen": -238.130859375, "logps/rejected": -281.5000915527344, "loss": 0.3029, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.5411267280578613, "rewards/margins": 2.136220693588257, "rewards/rejected": -4.677347660064697, "step": 6970 }, { "epoch": 5.584, "grad_norm": 14.699413299560547, "learning_rate": 2.4274045092803056e-06, "logits/chosen": 0.9806970953941345, "logits/rejected": 0.8635124564170837, "logps/chosen": -230.31900024414062, "logps/rejected": -276.25421142578125, "loss": 0.241, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.257481098175049, "rewards/margins": 2.4393088817596436, "rewards/rejected": -4.696789741516113, "step": 6980 }, { "epoch": 5.592, "grad_norm": 26.546295166015625, "learning_rate": 2.4204264284020182e-06, "logits/chosen": 1.0598602294921875, "logits/rejected": 1.0161948204040527, "logps/chosen": -208.1858367919922, "logps/rejected": -228.87876892089844, "loss": 0.3018, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.463453769683838, "rewards/margins": 2.1223933696746826, "rewards/rejected": -3.5858471393585205, "step": 6990 }, { "epoch": 5.6, "grad_norm": 13.010027885437012, "learning_rate": 2.4134489680544263e-06, "logits/chosen": 1.0219472646713257, "logits/rejected": 0.916050910949707, "logps/chosen": -237.35302734375, "logps/rejected": -259.0421447753906, "loss": 0.1984, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8679497241973877, "rewards/margins": 2.485231399536133, "rewards/rejected": -4.3531813621521, "step": 7000 } ], "logging_steps": 10, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0023474241339392e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }