{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 9917, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010084457330139921, "grad_norm": 132.19058227539062, "learning_rate": 9.990924674800847e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -492.67266845703125, "logps/rejected": -294.3489990234375, "loss": 0.6837, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.009280338883399963, "rewards/margins": 0.019613923504948616, "rewards/rejected": -0.02889426052570343, "step": 10 }, { "epoch": 0.0020168914660279843, "grad_norm": 102.65989685058594, "learning_rate": 9.98084098013512e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -476.215576171875, "logps/rejected": -280.8232421875, "loss": 0.6683, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01887153834104538, "rewards/margins": 0.05232178419828415, "rewards/rejected": -0.07119332253932953, "step": 20 }, { "epoch": 0.0030253371990419764, "grad_norm": 100.29199981689453, "learning_rate": 9.970757285469395e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -487.90478515625, "logps/rejected": -333.49322509765625, "loss": 0.6742, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.013336201198399067, "rewards/margins": 0.04365706443786621, "rewards/rejected": -0.0569932647049427, "step": 30 }, { "epoch": 0.0040337829320559685, "grad_norm": 173.7672882080078, "learning_rate": 9.96067359080367e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -590.6216430664062, "logps/rejected": -276.6372985839844, "loss": 0.648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.04521356523036957, "rewards/margins": 0.10435555875301361, "rewards/rejected": -0.05914200097322464, "step": 40 }, { "epoch": 0.005042228665069961, "grad_norm": 152.8135223388672, "learning_rate": 9.950589896137944e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -502.71099853515625, "logps/rejected": -356.7897033691406, "loss": 0.6523, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.06095316261053085, "rewards/margins": 0.09699732065200806, "rewards/rejected": -0.0360441617667675, "step": 50 }, { "epoch": 0.006050674398083953, "grad_norm": 105.90693664550781, "learning_rate": 9.94050620147222e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -433.0853576660156, "logps/rejected": -336.85894775390625, "loss": 0.6325, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.03647039458155632, "rewards/margins": 0.14110925793647766, "rewards/rejected": -0.10463883727788925, "step": 60 }, { "epoch": 0.007059120131097945, "grad_norm": 68.47858428955078, "learning_rate": 9.930422506806494e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -506.19097900390625, "logps/rejected": -288.6146240234375, "loss": 0.6358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.014058482833206654, "rewards/margins": 0.13066796958446503, "rewards/rejected": -0.11660949140787125, "step": 70 }, { "epoch": 0.008067565864111937, "grad_norm": 167.64889526367188, "learning_rate": 9.920338812140767e-07, "logits/chosen": 1.9296503067016602, "logits/rejected": NaN, "logps/chosen": -613.1732788085938, "logps/rejected": -356.1299743652344, "loss": 0.6787, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.046195171773433685, "rewards/margins": 0.04616032540798187, "rewards/rejected": 3.483779801172204e-05, "step": 80 }, { "epoch": 0.00907601159712593, "grad_norm": 149.88845825195312, "learning_rate": 9.910255117475043e-07, "logits/chosen": 1.878204107284546, "logits/rejected": NaN, "logps/chosen": -468.49993896484375, "logps/rejected": -285.2349548339844, "loss": 0.6397, "rewards/accuracies": 0.625, "rewards/chosen": 0.006062865257263184, "rewards/margins": 0.1256987601518631, "rewards/rejected": -0.11963589489459991, "step": 90 }, { "epoch": 0.010084457330139922, "grad_norm": 68.9906234741211, "learning_rate": 9.900171422809318e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -528.1124267578125, "logps/rejected": -312.1116943359375, "loss": 0.6145, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.08531082421541214, "rewards/margins": 0.19223423302173615, "rewards/rejected": -0.10692340135574341, "step": 100 }, { "epoch": 0.011092903063153914, "grad_norm": 109.79571533203125, "learning_rate": 9.890087728143591e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -480.45184326171875, "logps/rejected": -320.73455810546875, "loss": 0.5615, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16111794114112854, "rewards/margins": 0.3798338770866394, "rewards/rejected": -0.21871598064899445, "step": 110 }, { "epoch": 0.012101348796167906, "grad_norm": 114.03849792480469, "learning_rate": 9.880004033477866e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -572.1328735351562, "logps/rejected": -343.0621337890625, "loss": 0.6152, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.06697340309619904, "rewards/margins": 0.21065068244934082, "rewards/rejected": -0.14367727935314178, "step": 120 }, { "epoch": 0.013109794529181899, "grad_norm": 102.92571258544922, "learning_rate": 9.86992033881214e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -473.3602600097656, "logps/rejected": -319.1706237792969, "loss": 0.6047, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.14200575649738312, "rewards/margins": 0.26635247468948364, "rewards/rejected": -0.12434671074151993, "step": 130 }, { "epoch": 0.01411824026219589, "grad_norm": 131.4315948486328, "learning_rate": 9.859836644146415e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -423.8439025878906, "logps/rejected": -314.05206298828125, "loss": 0.6262, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.0013574243057519197, "rewards/margins": 0.1781669408082962, "rewards/rejected": -0.17952436208724976, "step": 140 }, { "epoch": 0.015126685995209882, "grad_norm": 74.6403579711914, "learning_rate": 9.84975294948069e-07, "logits/chosen": 2.2399797439575195, "logits/rejected": NaN, "logps/chosen": -480.0853576660156, "logps/rejected": -420.03192138671875, "loss": 0.6327, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.027425020933151245, "rewards/margins": 0.1939731389284134, "rewards/rejected": -0.22139815986156464, "step": 150 }, { "epoch": 0.016135131728223874, "grad_norm": 172.31642150878906, "learning_rate": 9.839669254814963e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -553.5308837890625, "logps/rejected": -286.5758972167969, "loss": 0.6204, "rewards/accuracies": 0.625, "rewards/chosen": 0.05566934868693352, "rewards/margins": 0.20663981139659882, "rewards/rejected": -0.1509704887866974, "step": 160 }, { "epoch": 0.017143577461237867, "grad_norm": 169.35107421875, "learning_rate": 9.829585560149238e-07, "logits/chosen": 2.111576557159424, "logits/rejected": NaN, "logps/chosen": -543.0858154296875, "logps/rejected": -303.3373107910156, "loss": 0.6169, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10131724178791046, "rewards/margins": 0.2826957106590271, "rewards/rejected": -0.18137845396995544, "step": 170 }, { "epoch": 0.01815202319425186, "grad_norm": 86.03092956542969, "learning_rate": 9.819501865483514e-07, "logits/chosen": NaN, "logits/rejected": 2.3274455070495605, "logps/chosen": -452.4576721191406, "logps/rejected": -386.15362548828125, "loss": 0.6007, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.05320992320775986, "rewards/margins": 0.2805226743221283, "rewards/rejected": -0.22731277346611023, "step": 180 }, { "epoch": 0.01916046892726585, "grad_norm": 122.47123718261719, "learning_rate": 9.809418170817787e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.9439086914062, "logps/rejected": -450.9951171875, "loss": 0.6561, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.07688179612159729, "rewards/margins": 0.15626820921897888, "rewards/rejected": -0.0793863981962204, "step": 190 }, { "epoch": 0.020168914660279844, "grad_norm": 80.74224853515625, "learning_rate": 9.799334476152062e-07, "logits/chosen": 2.0503249168395996, "logits/rejected": NaN, "logps/chosen": -436.8985900878906, "logps/rejected": -300.7847900390625, "loss": 0.6161, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.05784404277801514, "rewards/margins": 0.24146540462970734, "rewards/rejected": -0.2993094325065613, "step": 200 }, { "epoch": 0.021177360393293834, "grad_norm": 113.6532211303711, "learning_rate": 9.789250781486335e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.5537109375, "logps/rejected": -355.46820068359375, "loss": 0.6012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08494012802839279, "rewards/margins": 0.2851090729236603, "rewards/rejected": -0.2001689225435257, "step": 210 }, { "epoch": 0.022185806126307828, "grad_norm": 104.55280303955078, "learning_rate": 9.77916708682061e-07, "logits/chosen": 1.9334770441055298, "logits/rejected": NaN, "logps/chosen": -530.5650024414062, "logps/rejected": -297.15191650390625, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.01055198349058628, "rewards/margins": 0.3281201422214508, "rewards/rejected": -0.33867210149765015, "step": 220 }, { "epoch": 0.02319425185932182, "grad_norm": 127.9859619140625, "learning_rate": 9.769083392154886e-07, "logits/chosen": 1.9690792560577393, "logits/rejected": NaN, "logps/chosen": -604.9849243164062, "logps/rejected": -254.45431518554688, "loss": 0.5549, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12644194066524506, "rewards/margins": 0.40287214517593384, "rewards/rejected": -0.2764301896095276, "step": 230 }, { "epoch": 0.02420269759233581, "grad_norm": 132.99148559570312, "learning_rate": 9.758999697489159e-07, "logits/chosen": 2.1367974281311035, "logits/rejected": NaN, "logps/chosen": -561.9125366210938, "logps/rejected": -286.87554931640625, "loss": 0.5576, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.0065349191427230835, "rewards/margins": 0.3529384732246399, "rewards/rejected": -0.3594733774662018, "step": 240 }, { "epoch": 0.025211143325349804, "grad_norm": 114.58164978027344, "learning_rate": 9.748916002823434e-07, "logits/chosen": 1.5212979316711426, "logits/rejected": NaN, "logps/chosen": -515.1577758789062, "logps/rejected": -400.8404846191406, "loss": 0.5944, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04878995940089226, "rewards/margins": 0.32272887229919434, "rewards/rejected": -0.2739388942718506, "step": 250 }, { "epoch": 0.026219589058363798, "grad_norm": 161.09117126464844, "learning_rate": 9.73883230815771e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -544.3243408203125, "logps/rejected": -333.8516845703125, "loss": 0.5816, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.032847706228494644, "rewards/margins": 0.3203594386577606, "rewards/rejected": -0.35320717096328735, "step": 260 }, { "epoch": 0.027228034791377788, "grad_norm": 100.0063705444336, "learning_rate": 9.728748613491982e-07, "logits/chosen": 1.8294861316680908, "logits/rejected": NaN, "logps/chosen": -498.52093505859375, "logps/rejected": -331.4982604980469, "loss": 0.5936, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.016296368092298508, "rewards/margins": 0.3353232741355896, "rewards/rejected": -0.3516196310520172, "step": 270 }, { "epoch": 0.02823648052439178, "grad_norm": 204.71115112304688, "learning_rate": 9.718664918826258e-07, "logits/chosen": 2.1012396812438965, "logits/rejected": NaN, "logps/chosen": -657.1436157226562, "logps/rejected": -359.98681640625, "loss": 0.6345, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.024639006704092026, "rewards/margins": 0.24086341261863708, "rewards/rejected": -0.21622440218925476, "step": 280 }, { "epoch": 0.029244926257405775, "grad_norm": 110.68402099609375, "learning_rate": 9.708581224160533e-07, "logits/chosen": 1.856713056564331, "logits/rejected": NaN, "logps/chosen": -573.726806640625, "logps/rejected": -322.92401123046875, "loss": 0.5535, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.12816312909126282, "rewards/margins": 0.4340866208076477, "rewards/rejected": -0.3059235215187073, "step": 290 }, { "epoch": 0.030253371990419765, "grad_norm": 247.96798706054688, "learning_rate": 9.698497529494806e-07, "logits/chosen": 1.886361837387085, "logits/rejected": NaN, "logps/chosen": -640.3163452148438, "logps/rejected": -413.12158203125, "loss": 0.5733, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22312109172344208, "rewards/margins": 0.4198804795742035, "rewards/rejected": -0.1967594176530838, "step": 300 }, { "epoch": 0.031261817723433755, "grad_norm": 105.93134307861328, "learning_rate": 9.688413834829081e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -509.42431640625, "logps/rejected": -315.7112731933594, "loss": 0.5573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.009422451257705688, "rewards/margins": 0.37460872530937195, "rewards/rejected": -0.3840312063694, "step": 310 }, { "epoch": 0.03227026345644775, "grad_norm": 133.40406799316406, "learning_rate": 9.678330140163356e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -595.7225341796875, "logps/rejected": -426.496337890625, "loss": 0.6038, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.02317388914525509, "rewards/margins": 0.27441391348838806, "rewards/rejected": -0.2512400150299072, "step": 320 }, { "epoch": 0.03327870918946174, "grad_norm": 124.77814483642578, "learning_rate": 9.66824644549763e-07, "logits/chosen": 2.0852890014648438, "logits/rejected": NaN, "logps/chosen": -573.8028564453125, "logps/rejected": -294.39630126953125, "loss": 0.6177, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.06134957820177078, "rewards/margins": 0.2674252390861511, "rewards/rejected": -0.3287748396396637, "step": 330 }, { "epoch": 0.034287154922475735, "grad_norm": 100.37052154541016, "learning_rate": 9.658162750831905e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -511.34600830078125, "logps/rejected": -296.46063232421875, "loss": 0.5481, "rewards/accuracies": 0.75, "rewards/chosen": 0.004605159163475037, "rewards/margins": 0.440579891204834, "rewards/rejected": -0.43597474694252014, "step": 340 }, { "epoch": 0.03529560065548973, "grad_norm": 80.08270263671875, "learning_rate": 9.648079056166178e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -534.5794677734375, "logps/rejected": -388.6957702636719, "loss": 0.5721, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.20011213421821594, "rewards/margins": 0.4216812551021576, "rewards/rejected": -0.6217933893203735, "step": 350 }, { "epoch": 0.03630404638850372, "grad_norm": 61.59571075439453, "learning_rate": 9.637995361500453e-07, "logits/chosen": 2.0812835693359375, "logits/rejected": NaN, "logps/chosen": -492.084716796875, "logps/rejected": -280.2168273925781, "loss": 0.5337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.09482525289058685, "rewards/margins": 0.5071539878845215, "rewards/rejected": -0.6019792556762695, "step": 360 }, { "epoch": 0.03731249212151771, "grad_norm": 143.09054565429688, "learning_rate": 9.627911666834728e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -553.0162353515625, "logps/rejected": -368.04254150390625, "loss": 0.5392, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09597174823284149, "rewards/margins": 0.5070396661758423, "rewards/rejected": -0.6030114889144897, "step": 370 }, { "epoch": 0.0383209378545317, "grad_norm": 122.94889068603516, "learning_rate": 9.617827972169002e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -512.2037963867188, "logps/rejected": -432.593994140625, "loss": 0.5747, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.04019744321703911, "rewards/margins": 0.5182380676269531, "rewards/rejected": -0.5584355592727661, "step": 380 }, { "epoch": 0.039329383587545695, "grad_norm": 101.95709991455078, "learning_rate": 9.607744277503277e-07, "logits/chosen": 1.9268792867660522, "logits/rejected": NaN, "logps/chosen": -467.927490234375, "logps/rejected": -349.65911865234375, "loss": 0.6421, "rewards/accuracies": 0.625, "rewards/chosen": -0.2218526303768158, "rewards/margins": 0.22338160872459412, "rewards/rejected": -0.4452342092990875, "step": 390 }, { "epoch": 0.04033782932055969, "grad_norm": 199.4053497314453, "learning_rate": 9.597660582837552e-07, "logits/chosen": 1.822810411453247, "logits/rejected": NaN, "logps/chosen": -512.802734375, "logps/rejected": -266.24249267578125, "loss": 0.5315, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11396177113056183, "rewards/margins": 0.5939675569534302, "rewards/rejected": -0.7079293131828308, "step": 400 }, { "epoch": 0.04134627505357368, "grad_norm": 129.87461853027344, "learning_rate": 9.587576888171825e-07, "logits/chosen": 2.0481486320495605, "logits/rejected": NaN, "logps/chosen": -527.7064208984375, "logps/rejected": -341.63140869140625, "loss": 0.636, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.15256816148757935, "rewards/margins": 0.33961623907089233, "rewards/rejected": -0.49218446016311646, "step": 410 }, { "epoch": 0.04235472078658767, "grad_norm": 92.84192657470703, "learning_rate": 9.5774931935061e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -510.0606384277344, "logps/rejected": -359.5212097167969, "loss": 0.5413, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.09091368317604065, "rewards/margins": 0.4722279906272888, "rewards/rejected": -0.5631416440010071, "step": 420 }, { "epoch": 0.04336316651960166, "grad_norm": 107.12171173095703, "learning_rate": 9.567409498840374e-07, "logits/chosen": 2.0071218013763428, "logits/rejected": NaN, "logps/chosen": -683.85986328125, "logps/rejected": -324.39849853515625, "loss": 0.48, "rewards/accuracies": 0.75, "rewards/chosen": 0.10597691684961319, "rewards/margins": 0.6120631694793701, "rewards/rejected": -0.5060862302780151, "step": 430 }, { "epoch": 0.044371612252615655, "grad_norm": 94.43312072753906, "learning_rate": 9.557325804174649e-07, "logits/chosen": 2.1299831867218018, "logits/rejected": NaN, "logps/chosen": -548.5216064453125, "logps/rejected": -346.8918762207031, "loss": 0.5662, "rewards/accuracies": 0.625, "rewards/chosen": -0.17419520020484924, "rewards/margins": 0.38008391857147217, "rewards/rejected": -0.5542792081832886, "step": 440 }, { "epoch": 0.04538005798562965, "grad_norm": 86.58575439453125, "learning_rate": 9.547242109508924e-07, "logits/chosen": 1.9150701761245728, "logits/rejected": 2.3229928016662598, "logps/chosen": -500.03009033203125, "logps/rejected": -436.33245849609375, "loss": 0.579, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.05216984823346138, "rewards/margins": 0.4504990577697754, "rewards/rejected": -0.5026689171791077, "step": 450 }, { "epoch": 0.04638850371864364, "grad_norm": 128.5560302734375, "learning_rate": 9.537158414843197e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -457.93914794921875, "logps/rejected": -300.36773681640625, "loss": 0.4914, "rewards/accuracies": 0.75, "rewards/chosen": 0.01884140446782112, "rewards/margins": 0.6529094576835632, "rewards/rejected": -0.6340681314468384, "step": 460 }, { "epoch": 0.047396949451657636, "grad_norm": 70.592041015625, "learning_rate": 9.527074720177472e-07, "logits/chosen": 1.8603322505950928, "logits/rejected": NaN, "logps/chosen": -454.64007568359375, "logps/rejected": -346.5075988769531, "loss": 0.5521, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.17917276918888092, "rewards/margins": 0.5277234315872192, "rewards/rejected": -0.706896185874939, "step": 470 }, { "epoch": 0.04840539518467162, "grad_norm": 112.92415618896484, "learning_rate": 9.516991025511747e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -601.5279541015625, "logps/rejected": -425.1250915527344, "loss": 0.6125, "rewards/accuracies": 0.625, "rewards/chosen": -0.2863697111606598, "rewards/margins": 0.33085790276527405, "rewards/rejected": -0.6172276139259338, "step": 480 }, { "epoch": 0.049413840917685616, "grad_norm": 121.9244613647461, "learning_rate": 9.506907330846021e-07, "logits/chosen": 1.8158600330352783, "logits/rejected": NaN, "logps/chosen": -499.67987060546875, "logps/rejected": -316.8470153808594, "loss": 0.5098, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.07600502669811249, "rewards/margins": 0.6378663182258606, "rewards/rejected": -0.7138713598251343, "step": 490 }, { "epoch": 0.05042228665069961, "grad_norm": 75.59954071044922, "learning_rate": 9.496823636180296e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -541.521728515625, "logps/rejected": -298.0113830566406, "loss": 0.5428, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.08543100208044052, "rewards/margins": 0.5161314606666565, "rewards/rejected": -0.6015625, "step": 500 }, { "epoch": 0.0514307323837136, "grad_norm": 95.78594970703125, "learning_rate": 9.486739941514571e-07, "logits/chosen": 1.8227039575576782, "logits/rejected": NaN, "logps/chosen": -544.6151733398438, "logps/rejected": -346.550537109375, "loss": 0.5135, "rewards/accuracies": 0.625, "rewards/chosen": 0.03601216524839401, "rewards/margins": 0.628010630607605, "rewards/rejected": -0.5919984579086304, "step": 510 }, { "epoch": 0.052439178116727596, "grad_norm": 111.62764739990234, "learning_rate": 9.476656246848846e-07, "logits/chosen": 1.8794965744018555, "logits/rejected": NaN, "logps/chosen": -551.0281372070312, "logps/rejected": -293.67437744140625, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": -0.2179102897644043, "rewards/margins": 0.5619044303894043, "rewards/rejected": -0.7798146605491638, "step": 520 }, { "epoch": 0.05344762384974158, "grad_norm": 117.84715270996094, "learning_rate": 9.46657255218312e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -533.288818359375, "logps/rejected": -425.22314453125, "loss": 0.4781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1283532828092575, "rewards/margins": 0.7077903747558594, "rewards/rejected": -0.5794371366500854, "step": 530 }, { "epoch": 0.054456069582755576, "grad_norm": 109.84874725341797, "learning_rate": 9.456488857517394e-07, "logits/chosen": 1.9084606170654297, "logits/rejected": NaN, "logps/chosen": -554.3775634765625, "logps/rejected": -410.01214599609375, "loss": 0.5268, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.31660059094429016, "rewards/margins": 0.5456563830375671, "rewards/rejected": -0.8622570037841797, "step": 540 }, { "epoch": 0.05546451531576957, "grad_norm": 72.8508071899414, "learning_rate": 9.446405162851668e-07, "logits/chosen": NaN, "logits/rejected": 1.977013349533081, "logps/chosen": -468.7640686035156, "logps/rejected": -352.95733642578125, "loss": 0.4614, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1695244312286377, "rewards/margins": 0.6477757692337036, "rewards/rejected": -0.8173002004623413, "step": 550 }, { "epoch": 0.05647296104878356, "grad_norm": 130.54861450195312, "learning_rate": 9.436321468185942e-07, "logits/chosen": 1.9513938426971436, "logits/rejected": NaN, "logps/chosen": -579.4774780273438, "logps/rejected": -294.833251953125, "loss": 0.6001, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3388453423976898, "rewards/margins": 0.38485080003738403, "rewards/rejected": -0.7236960530281067, "step": 560 }, { "epoch": 0.057481406781797556, "grad_norm": 15.156588554382324, "learning_rate": 9.426237773520216e-07, "logits/chosen": 1.7548303604125977, "logits/rejected": NaN, "logps/chosen": -538.7776489257812, "logps/rejected": -325.391845703125, "loss": 0.5287, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4004303514957428, "rewards/margins": 0.6444536447525024, "rewards/rejected": -1.0448840856552124, "step": 570 }, { "epoch": 0.05848985251481155, "grad_norm": 230.54933166503906, "learning_rate": 9.416154078854492e-07, "logits/chosen": 1.9458354711532593, "logits/rejected": NaN, "logps/chosen": -525.8310546875, "logps/rejected": -305.7309265136719, "loss": 0.6273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3219083845615387, "rewards/margins": 0.5313286781311035, "rewards/rejected": -0.8532371520996094, "step": 580 }, { "epoch": 0.059498298247825536, "grad_norm": 93.90612030029297, "learning_rate": 9.406070384188767e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -607.6595458984375, "logps/rejected": -306.5436096191406, "loss": 0.475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.012901430949568748, "rewards/margins": 0.7367895245552063, "rewards/rejected": -0.723888099193573, "step": 590 }, { "epoch": 0.06050674398083953, "grad_norm": 185.37440490722656, "learning_rate": 9.395986689523041e-07, "logits/chosen": 1.9418013095855713, "logits/rejected": NaN, "logps/chosen": -539.4380493164062, "logps/rejected": -357.07818603515625, "loss": 0.5171, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03013516589999199, "rewards/margins": 0.6042314767837524, "rewards/rejected": -0.6343667507171631, "step": 600 }, { "epoch": 0.06151518971385352, "grad_norm": 114.2945556640625, "learning_rate": 9.385902994857315e-07, "logits/chosen": 1.937760353088379, "logits/rejected": NaN, "logps/chosen": -587.9893188476562, "logps/rejected": -386.4167175292969, "loss": 0.5368, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2823416292667389, "rewards/margins": 0.5707222819328308, "rewards/rejected": -0.8530638813972473, "step": 610 }, { "epoch": 0.06252363544686751, "grad_norm": 137.36239624023438, "learning_rate": 9.37581930019159e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -602.6934814453125, "logps/rejected": -402.4562072753906, "loss": 0.5356, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.09537116438150406, "rewards/margins": 0.5797526240348816, "rewards/rejected": -0.6751238107681274, "step": 620 }, { "epoch": 0.0635320811798815, "grad_norm": 65.52979278564453, "learning_rate": 9.365735605525864e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -653.9659423828125, "logps/rejected": -335.0870056152344, "loss": 0.5855, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.061857931315898895, "rewards/margins": 0.5392388701438904, "rewards/rejected": -0.6010968685150146, "step": 630 }, { "epoch": 0.0645405269128955, "grad_norm": 142.6638641357422, "learning_rate": 9.355651910860138e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.3888549804688, "logps/rejected": -439.3702697753906, "loss": 0.4919, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02998219057917595, "rewards/margins": 0.762653112411499, "rewards/rejected": -0.7326709628105164, "step": 640 }, { "epoch": 0.06554897264590949, "grad_norm": 75.45792388916016, "learning_rate": 9.345568216194413e-07, "logits/chosen": 1.626983642578125, "logits/rejected": NaN, "logps/chosen": -450.16448974609375, "logps/rejected": -271.1984558105469, "loss": 0.5697, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.43172687292099, "rewards/margins": 0.47020435333251953, "rewards/rejected": -0.9019311666488647, "step": 650 }, { "epoch": 0.06655741837892348, "grad_norm": 74.01763916015625, "learning_rate": 9.335484521528688e-07, "logits/chosen": 1.7974681854248047, "logits/rejected": NaN, "logps/chosen": -550.3064575195312, "logps/rejected": -312.94415283203125, "loss": 0.509, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.18871021270751953, "rewards/margins": 0.6226880550384521, "rewards/rejected": -0.8113983273506165, "step": 660 }, { "epoch": 0.06756586411193748, "grad_norm": 111.54241180419922, "learning_rate": 9.325400826862963e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -471.6922912597656, "logps/rejected": -358.4078063964844, "loss": 0.4926, "rewards/accuracies": 0.625, "rewards/chosen": -0.48809224367141724, "rewards/margins": 0.7166021466255188, "rewards/rejected": -1.2046945095062256, "step": 670 }, { "epoch": 0.06857430984495147, "grad_norm": 92.76861572265625, "learning_rate": 9.315317132197237e-07, "logits/chosen": 1.8572044372558594, "logits/rejected": NaN, "logps/chosen": -407.95001220703125, "logps/rejected": -263.310546875, "loss": 0.5391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.49873942136764526, "rewards/margins": 0.6231524348258972, "rewards/rejected": -1.1218918561935425, "step": 680 }, { "epoch": 0.06958275557796546, "grad_norm": 76.82247924804688, "learning_rate": 9.305233437531511e-07, "logits/chosen": 2.073349714279175, "logits/rejected": NaN, "logps/chosen": -511.4974060058594, "logps/rejected": -275.00433349609375, "loss": 0.4284, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.07165037840604782, "rewards/margins": 0.9511950612068176, "rewards/rejected": -1.0228455066680908, "step": 690 }, { "epoch": 0.07059120131097946, "grad_norm": 103.17076873779297, "learning_rate": 9.295149742865785e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.35479736328125, "logps/rejected": -349.23431396484375, "loss": 0.6078, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.40006646513938904, "rewards/margins": 0.4396069645881653, "rewards/rejected": -0.8396733999252319, "step": 700 }, { "epoch": 0.07159964704399345, "grad_norm": 124.38003540039062, "learning_rate": 9.285066048200059e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -511.7784729003906, "logps/rejected": -328.0311279296875, "loss": 0.4173, "rewards/accuracies": 0.75, "rewards/chosen": -0.19846411049365997, "rewards/margins": 0.9141234159469604, "rewards/rejected": -1.1125876903533936, "step": 710 }, { "epoch": 0.07260809277700744, "grad_norm": 39.739803314208984, "learning_rate": 9.274982353534335e-07, "logits/chosen": 1.814206838607788, "logits/rejected": NaN, "logps/chosen": -582.1797485351562, "logps/rejected": -365.3920593261719, "loss": 0.4223, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.03215334936976433, "rewards/margins": 1.0288679599761963, "rewards/rejected": -1.0610214471817017, "step": 720 }, { "epoch": 0.07361653851002142, "grad_norm": 133.85928344726562, "learning_rate": 9.26489865886861e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -506.61163330078125, "logps/rejected": -510.47381591796875, "loss": 0.4716, "rewards/accuracies": 0.75, "rewards/chosen": -0.13832217454910278, "rewards/margins": 0.8145145177841187, "rewards/rejected": -0.952836811542511, "step": 730 }, { "epoch": 0.07462498424303542, "grad_norm": 45.454437255859375, "learning_rate": 9.254814964202884e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -492.2759704589844, "logps/rejected": -446.49566650390625, "loss": 0.6706, "rewards/accuracies": 0.5, "rewards/chosen": -0.5443629622459412, "rewards/margins": 0.22774279117584229, "rewards/rejected": -0.7721058130264282, "step": 740 }, { "epoch": 0.07563342997604941, "grad_norm": 151.42294311523438, "learning_rate": 9.244731269537158e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -529.4232177734375, "logps/rejected": -320.9268493652344, "loss": 0.5031, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5412870049476624, "rewards/margins": 0.7154534459114075, "rewards/rejected": -1.2567403316497803, "step": 750 }, { "epoch": 0.0766418757090634, "grad_norm": 170.70602416992188, "learning_rate": 9.234647574871432e-07, "logits/chosen": 2.0390732288360596, "logits/rejected": NaN, "logps/chosen": -637.401611328125, "logps/rejected": -574.60107421875, "loss": 0.5175, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.46415281295776367, "rewards/margins": 0.7902419567108154, "rewards/rejected": -1.2543946504592896, "step": 760 }, { "epoch": 0.0776503214420774, "grad_norm": 84.51779174804688, "learning_rate": 9.224563880205707e-07, "logits/chosen": 1.956151008605957, "logits/rejected": NaN, "logps/chosen": -451.34637451171875, "logps/rejected": -339.843505859375, "loss": 0.6168, "rewards/accuracies": 0.625, "rewards/chosen": -0.24336782097816467, "rewards/margins": 0.5934616923332214, "rewards/rejected": -0.8368295431137085, "step": 770 }, { "epoch": 0.07865876717509139, "grad_norm": 28.000028610229492, "learning_rate": 9.214480185539981e-07, "logits/chosen": 1.9785239696502686, "logits/rejected": NaN, "logps/chosen": -647.0656127929688, "logps/rejected": -295.07635498046875, "loss": 0.4545, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3486732840538025, "rewards/margins": 0.7985097169876099, "rewards/rejected": -1.1471829414367676, "step": 780 }, { "epoch": 0.07966721290810538, "grad_norm": 61.14370346069336, "learning_rate": 9.204396490874255e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -561.9942016601562, "logps/rejected": -334.60491943359375, "loss": 0.6372, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3937831521034241, "rewards/margins": 0.5262726545333862, "rewards/rejected": -0.9200557470321655, "step": 790 }, { "epoch": 0.08067565864111938, "grad_norm": 43.0927734375, "learning_rate": 9.19431279620853e-07, "logits/chosen": 2.011370897293091, "logits/rejected": NaN, "logps/chosen": -475.2210998535156, "logps/rejected": -363.206298828125, "loss": 0.4714, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.270903080701828, "rewards/margins": 0.8540326356887817, "rewards/rejected": -1.1249356269836426, "step": 800 }, { "epoch": 0.08168410437413337, "grad_norm": 123.90274047851562, "learning_rate": 9.184229101542805e-07, "logits/chosen": 1.643784761428833, "logits/rejected": NaN, "logps/chosen": -521.4137573242188, "logps/rejected": -242.83999633789062, "loss": 0.5439, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.3780437707901001, "rewards/margins": 0.7346284985542297, "rewards/rejected": -1.112672209739685, "step": 810 }, { "epoch": 0.08269255010714736, "grad_norm": 138.54908752441406, "learning_rate": 9.17414540687708e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -522.8388671875, "logps/rejected": -484.576904296875, "loss": 0.4817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2338569164276123, "rewards/margins": 1.0389982461929321, "rewards/rejected": -1.2728550434112549, "step": 820 }, { "epoch": 0.08370099584016136, "grad_norm": 94.35789489746094, "learning_rate": 9.164061712211354e-07, "logits/chosen": NaN, "logits/rejected": 1.8600553274154663, "logps/chosen": -482.998291015625, "logps/rejected": -391.72991943359375, "loss": 0.5513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5363544225692749, "rewards/margins": 0.7590147256851196, "rewards/rejected": -1.295369267463684, "step": 830 }, { "epoch": 0.08470944157317534, "grad_norm": 115.73876953125, "learning_rate": 9.153978017545628e-07, "logits/chosen": 1.7251341342926025, "logits/rejected": NaN, "logps/chosen": -429.52984619140625, "logps/rejected": -402.33709716796875, "loss": 0.4676, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5198580622673035, "rewards/margins": 0.8966590762138367, "rewards/rejected": -1.4165171384811401, "step": 840 }, { "epoch": 0.08571788730618933, "grad_norm": 30.919540405273438, "learning_rate": 9.143894322879902e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -458.90704345703125, "logps/rejected": -371.2810974121094, "loss": 0.4407, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6335151791572571, "rewards/margins": 1.14771568775177, "rewards/rejected": -1.7812306880950928, "step": 850 }, { "epoch": 0.08672633303920332, "grad_norm": 159.8536834716797, "learning_rate": 9.133810628214176e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -475.75994873046875, "logps/rejected": -447.79241943359375, "loss": 0.6144, "rewards/accuracies": 0.625, "rewards/chosen": -0.6293665766716003, "rewards/margins": 0.5129130482673645, "rewards/rejected": -1.1422797441482544, "step": 860 }, { "epoch": 0.08773477877221732, "grad_norm": 52.004268646240234, "learning_rate": 9.123726933548452e-07, "logits/chosen": 1.8083961009979248, "logits/rejected": NaN, "logps/chosen": -529.2391357421875, "logps/rejected": -341.1845397949219, "loss": 0.5126, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.337271511554718, "rewards/margins": 0.8955130577087402, "rewards/rejected": -1.232784628868103, "step": 870 }, { "epoch": 0.08874322450523131, "grad_norm": 165.34744262695312, "learning_rate": 9.113643238882727e-07, "logits/chosen": 1.728273630142212, "logits/rejected": NaN, "logps/chosen": -484.275634765625, "logps/rejected": -329.22320556640625, "loss": 0.5649, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49623480439186096, "rewards/margins": 0.746003270149231, "rewards/rejected": -1.2422380447387695, "step": 880 }, { "epoch": 0.0897516702382453, "grad_norm": 111.0256576538086, "learning_rate": 9.103559544217001e-07, "logits/chosen": 1.9173816442489624, "logits/rejected": NaN, "logps/chosen": -567.4414672851562, "logps/rejected": -391.5813293457031, "loss": 0.4646, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8079174757003784, "rewards/margins": 0.9471007585525513, "rewards/rejected": -1.7550182342529297, "step": 890 }, { "epoch": 0.0907601159712593, "grad_norm": 107.92668914794922, "learning_rate": 9.093475849551275e-07, "logits/chosen": 1.980146050453186, "logits/rejected": NaN, "logps/chosen": -609.1776733398438, "logps/rejected": -368.528076171875, "loss": 0.4857, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6815590858459473, "rewards/margins": 0.9259883761405945, "rewards/rejected": -1.607547402381897, "step": 900 }, { "epoch": 0.09176856170427329, "grad_norm": 126.16165924072266, "learning_rate": 9.08339215488555e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -598.533447265625, "logps/rejected": -395.2157287597656, "loss": 0.5842, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.6668006777763367, "rewards/margins": 0.7047586441040039, "rewards/rejected": -1.3715592622756958, "step": 910 }, { "epoch": 0.09277700743728728, "grad_norm": 181.2750701904297, "learning_rate": 9.073308460219824e-07, "logits/chosen": NaN, "logits/rejected": 2.111541748046875, "logps/chosen": -477.2706604003906, "logps/rejected": -363.63116455078125, "loss": 0.4675, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8285900354385376, "rewards/margins": 0.7651680707931519, "rewards/rejected": -1.5937581062316895, "step": 920 }, { "epoch": 0.09378545317030128, "grad_norm": 60.291683197021484, "learning_rate": 9.063224765554098e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -464.531982421875, "logps/rejected": -297.23333740234375, "loss": 0.4947, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5618988871574402, "rewards/margins": 0.7371665835380554, "rewards/rejected": -1.2990654706954956, "step": 930 }, { "epoch": 0.09479389890331527, "grad_norm": 129.25820922851562, "learning_rate": 9.053141070888373e-07, "logits/chosen": 1.9993900060653687, "logits/rejected": NaN, "logps/chosen": -572.3640747070312, "logps/rejected": -319.18890380859375, "loss": 0.399, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6743332147598267, "rewards/margins": 1.1952693462371826, "rewards/rejected": -1.8696025609970093, "step": 940 }, { "epoch": 0.09580234463632925, "grad_norm": 72.09867858886719, "learning_rate": 9.043057376222648e-07, "logits/chosen": 1.6107006072998047, "logits/rejected": NaN, "logps/chosen": -362.5164489746094, "logps/rejected": -250.4098663330078, "loss": 0.5545, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.793789267539978, "rewards/margins": 0.9667034149169922, "rewards/rejected": -1.7604926824569702, "step": 950 }, { "epoch": 0.09681079036934324, "grad_norm": 30.185775756835938, "learning_rate": 9.032973681556923e-07, "logits/chosen": 1.8983585834503174, "logits/rejected": NaN, "logps/chosen": -608.4962158203125, "logps/rejected": -347.6793212890625, "loss": 0.5348, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6380371451377869, "rewards/margins": 0.7635072469711304, "rewards/rejected": -1.4015443325042725, "step": 960 }, { "epoch": 0.09781923610235724, "grad_norm": 92.49270629882812, "learning_rate": 9.022889986891197e-07, "logits/chosen": 1.7562427520751953, "logits/rejected": NaN, "logps/chosen": -648.3843994140625, "logps/rejected": -345.5763854980469, "loss": 0.3815, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.44010406732559204, "rewards/margins": 1.0602848529815674, "rewards/rejected": -1.5003888607025146, "step": 970 }, { "epoch": 0.09882768183537123, "grad_norm": 58.44127655029297, "learning_rate": 9.012806292225471e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -589.8999633789062, "logps/rejected": -482.61981201171875, "loss": 0.4161, "rewards/accuracies": 0.75, "rewards/chosen": -0.5031472444534302, "rewards/margins": 1.1362789869308472, "rewards/rejected": -1.6394259929656982, "step": 980 }, { "epoch": 0.09983612756838522, "grad_norm": 266.0754089355469, "learning_rate": 9.002722597559745e-07, "logits/chosen": 1.6011817455291748, "logits/rejected": NaN, "logps/chosen": -562.0955200195312, "logps/rejected": -292.0634765625, "loss": 0.5104, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6527531743049622, "rewards/margins": 1.0262954235076904, "rewards/rejected": -1.6790485382080078, "step": 990 }, { "epoch": 0.10084457330139922, "grad_norm": 46.380821228027344, "learning_rate": 8.992638902894019e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -510.47900390625, "logps/rejected": -528.35400390625, "loss": 0.5088, "rewards/accuracies": 0.625, "rewards/chosen": -0.5720651745796204, "rewards/margins": 0.8732172846794128, "rewards/rejected": -1.4452825784683228, "step": 1000 }, { "epoch": 0.10185301903441321, "grad_norm": 92.10799407958984, "learning_rate": 8.982555208228295e-07, "logits/chosen": 1.5874038934707642, "logits/rejected": NaN, "logps/chosen": -634.5480346679688, "logps/rejected": -309.37371826171875, "loss": 0.4471, "rewards/accuracies": 0.75, "rewards/chosen": -0.4858071208000183, "rewards/margins": 1.248624563217163, "rewards/rejected": -1.7344316244125366, "step": 1010 }, { "epoch": 0.1028614647674272, "grad_norm": 104.59929656982422, "learning_rate": 8.972471513562569e-07, "logits/chosen": 1.963379144668579, "logits/rejected": NaN, "logps/chosen": -657.004150390625, "logps/rejected": -399.32086181640625, "loss": 0.3325, "rewards/accuracies": 0.875, "rewards/chosen": -0.48403415083885193, "rewards/margins": 1.5635567903518677, "rewards/rejected": -2.047590732574463, "step": 1020 }, { "epoch": 0.1038699105004412, "grad_norm": 70.91700744628906, "learning_rate": 8.962387818896844e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -596.5647583007812, "logps/rejected": -394.855712890625, "loss": 0.4225, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3794094920158386, "rewards/margins": 1.4248696565628052, "rewards/rejected": -1.804279088973999, "step": 1030 }, { "epoch": 0.10487835623345519, "grad_norm": 134.66143798828125, "learning_rate": 8.952304124231118e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -588.7990112304688, "logps/rejected": -486.5838928222656, "loss": 0.4226, "rewards/accuracies": 0.75, "rewards/chosen": -0.07996378093957901, "rewards/margins": 1.1551998853683472, "rewards/rejected": -1.2351638078689575, "step": 1040 }, { "epoch": 0.10588680196646919, "grad_norm": 176.32968139648438, "learning_rate": 8.942220429565392e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -610.7411499023438, "logps/rejected": -305.39764404296875, "loss": 0.475, "rewards/accuracies": 0.75, "rewards/chosen": -0.4148687422275543, "rewards/margins": 0.9141465425491333, "rewards/rejected": -1.3290152549743652, "step": 1050 }, { "epoch": 0.10689524769948316, "grad_norm": 103.89250946044922, "learning_rate": 8.932136734899667e-07, "logits/chosen": 1.9781631231307983, "logits/rejected": NaN, "logps/chosen": -659.0015258789062, "logps/rejected": -408.5461730957031, "loss": 0.4523, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.43457841873168945, "rewards/margins": 1.1715776920318604, "rewards/rejected": -1.6061561107635498, "step": 1060 }, { "epoch": 0.10790369343249716, "grad_norm": 143.62181091308594, "learning_rate": 8.922053040233941e-07, "logits/chosen": 1.7835540771484375, "logits/rejected": NaN, "logps/chosen": -438.0276794433594, "logps/rejected": -281.61944580078125, "loss": 0.3744, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5907410383224487, "rewards/margins": 1.2513178586959839, "rewards/rejected": -1.842058777809143, "step": 1070 }, { "epoch": 0.10891213916551115, "grad_norm": 81.28884887695312, "learning_rate": 8.911969345568215e-07, "logits/chosen": 1.8497769832611084, "logits/rejected": NaN, "logps/chosen": -432.05780029296875, "logps/rejected": -335.74822998046875, "loss": 0.5264, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7096863985061646, "rewards/margins": 0.7406740188598633, "rewards/rejected": -1.4503604173660278, "step": 1080 }, { "epoch": 0.10992058489852514, "grad_norm": 102.22379302978516, "learning_rate": 8.90188565090249e-07, "logits/chosen": 1.5802478790283203, "logits/rejected": NaN, "logps/chosen": -594.8567504882812, "logps/rejected": -340.15277099609375, "loss": 0.482, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.40911203622817993, "rewards/margins": 0.945254921913147, "rewards/rejected": -1.3543668985366821, "step": 1090 }, { "epoch": 0.11092903063153914, "grad_norm": 81.94027709960938, "learning_rate": 8.891801956236765e-07, "logits/chosen": 1.5334967374801636, "logits/rejected": NaN, "logps/chosen": -559.8138427734375, "logps/rejected": -361.30731201171875, "loss": 0.5212, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4757334589958191, "rewards/margins": 0.9353474378585815, "rewards/rejected": -1.4110808372497559, "step": 1100 }, { "epoch": 0.11193747636455313, "grad_norm": 81.71729278564453, "learning_rate": 8.88171826157104e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -495.265625, "logps/rejected": -387.0204772949219, "loss": 0.4757, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3578835725784302, "rewards/margins": 0.7826317548751831, "rewards/rejected": -1.1405153274536133, "step": 1110 }, { "epoch": 0.11294592209756713, "grad_norm": 25.821718215942383, "learning_rate": 8.871634566905314e-07, "logits/chosen": 1.8982083797454834, "logits/rejected": NaN, "logps/chosen": -673.8179931640625, "logps/rejected": -411.1075134277344, "loss": 0.4358, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.24797765910625458, "rewards/margins": 1.108145833015442, "rewards/rejected": -1.356123447418213, "step": 1120 }, { "epoch": 0.11395436783058112, "grad_norm": 37.83675003051758, "learning_rate": 8.861550872239588e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.3895874023438, "logps/rejected": -465.0948181152344, "loss": 0.5058, "rewards/accuracies": 0.625, "rewards/chosen": -0.3055989146232605, "rewards/margins": 1.0799120664596558, "rewards/rejected": -1.385510802268982, "step": 1130 }, { "epoch": 0.11496281356359511, "grad_norm": 201.12744140625, "learning_rate": 8.851467177573862e-07, "logits/chosen": 1.9406763315200806, "logits/rejected": NaN, "logps/chosen": -568.0208740234375, "logps/rejected": -354.0198974609375, "loss": 0.5228, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.27715232968330383, "rewards/margins": 0.9150485992431641, "rewards/rejected": -1.1922008991241455, "step": 1140 }, { "epoch": 0.1159712592966091, "grad_norm": 112.36207580566406, "learning_rate": 8.841383482908136e-07, "logits/chosen": 1.8901097774505615, "logits/rejected": NaN, "logps/chosen": -667.754638671875, "logps/rejected": -308.4432373046875, "loss": 0.3687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2211560308933258, "rewards/margins": 1.5038596391677856, "rewards/rejected": -1.725015640258789, "step": 1150 }, { "epoch": 0.1169797050296231, "grad_norm": 99.22782135009766, "learning_rate": 8.831299788242412e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -620.3355712890625, "logps/rejected": -373.51318359375, "loss": 0.4794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.35899361968040466, "rewards/margins": 1.1670465469360352, "rewards/rejected": -1.5260401964187622, "step": 1160 }, { "epoch": 0.11798815076263709, "grad_norm": 129.2876434326172, "learning_rate": 8.821216093576687e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -528.0130615234375, "logps/rejected": -437.317138671875, "loss": 0.4608, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3683696687221527, "rewards/margins": 1.0572372674942017, "rewards/rejected": -1.4256069660186768, "step": 1170 }, { "epoch": 0.11899659649565107, "grad_norm": 55.95785140991211, "learning_rate": 8.811132398910961e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.54901123046875, "logps/rejected": -353.1736755371094, "loss": 0.5603, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5939914584159851, "rewards/margins": 0.8456441164016724, "rewards/rejected": -1.4396355152130127, "step": 1180 }, { "epoch": 0.12000504222866507, "grad_norm": 109.4458236694336, "learning_rate": 8.801048704245235e-07, "logits/chosen": 1.8625978231430054, "logits/rejected": NaN, "logps/chosen": -542.1990356445312, "logps/rejected": -316.5778503417969, "loss": 0.5351, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5819467306137085, "rewards/margins": 0.8697717785835266, "rewards/rejected": -1.4517185688018799, "step": 1190 }, { "epoch": 0.12101348796167906, "grad_norm": 88.90333557128906, "learning_rate": 8.790965009579509e-07, "logits/chosen": 1.6433948278427124, "logits/rejected": NaN, "logps/chosen": -489.9921875, "logps/rejected": -364.8006896972656, "loss": 0.6317, "rewards/accuracies": 0.625, "rewards/chosen": -0.8479019403457642, "rewards/margins": 0.7288907766342163, "rewards/rejected": -1.576792597770691, "step": 1200 }, { "epoch": 0.12202193369469305, "grad_norm": 165.8902130126953, "learning_rate": 8.780881314913784e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -537.5215454101562, "logps/rejected": -560.543212890625, "loss": 0.5513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.45771923661231995, "rewards/margins": 0.8316957354545593, "rewards/rejected": -1.2894150018692017, "step": 1210 }, { "epoch": 0.12303037942770705, "grad_norm": 74.10693359375, "learning_rate": 8.770797620248058e-07, "logits/chosen": 1.8629356622695923, "logits/rejected": NaN, "logps/chosen": -714.4608154296875, "logps/rejected": -274.86175537109375, "loss": 0.3443, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0025085925590246916, "rewards/margins": 1.2633421421051025, "rewards/rejected": -1.260833501815796, "step": 1220 }, { "epoch": 0.12403882516072104, "grad_norm": 106.7501449584961, "learning_rate": 8.760713925582333e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -533.4304809570312, "logps/rejected": -387.86871337890625, "loss": 0.478, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5326951146125793, "rewards/margins": 0.9261938333511353, "rewards/rejected": -1.4588887691497803, "step": 1230 }, { "epoch": 0.12504727089373502, "grad_norm": 201.52769470214844, "learning_rate": 8.750630230916607e-07, "logits/chosen": 1.8488346338272095, "logits/rejected": NaN, "logps/chosen": -531.90234375, "logps/rejected": -375.6727600097656, "loss": 0.4506, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3801704943180084, "rewards/margins": 1.1626498699188232, "rewards/rejected": -1.5428204536437988, "step": 1240 }, { "epoch": 0.12605571662674903, "grad_norm": 42.00877380371094, "learning_rate": 8.740546536250883e-07, "logits/chosen": 1.5766220092773438, "logits/rejected": NaN, "logps/chosen": -545.892578125, "logps/rejected": -320.42132568359375, "loss": 0.4468, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.33942911028862, "rewards/margins": 1.1831496953964233, "rewards/rejected": -1.5225788354873657, "step": 1250 }, { "epoch": 0.127064162359763, "grad_norm": 81.27189636230469, "learning_rate": 8.730462841585157e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -431.46429443359375, "logps/rejected": -441.205810546875, "loss": 0.7078, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2760913372039795, "rewards/margins": 0.48608022928237915, "rewards/rejected": -1.7621715068817139, "step": 1260 }, { "epoch": 0.128072608092777, "grad_norm": 66.05879974365234, "learning_rate": 8.720379146919431e-07, "logits/chosen": 1.9858624935150146, "logits/rejected": NaN, "logps/chosen": -579.40771484375, "logps/rejected": -351.76153564453125, "loss": 0.4012, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.37347105145454407, "rewards/margins": 1.6193336248397827, "rewards/rejected": -1.992804765701294, "step": 1270 }, { "epoch": 0.129081053825791, "grad_norm": 78.32707977294922, "learning_rate": 8.710295452253705e-07, "logits/chosen": 1.943500280380249, "logits/rejected": NaN, "logps/chosen": -480.93145751953125, "logps/rejected": -260.7458801269531, "loss": 0.5159, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.9555455446243286, "rewards/margins": 0.8572637438774109, "rewards/rejected": -1.8128093481063843, "step": 1280 }, { "epoch": 0.130089499558805, "grad_norm": 39.30900192260742, "learning_rate": 8.700211757587979e-07, "logits/chosen": 1.7545284032821655, "logits/rejected": NaN, "logps/chosen": -613.25, "logps/rejected": -245.91983032226562, "loss": 0.4482, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3659313917160034, "rewards/margins": 1.2435805797576904, "rewards/rejected": -1.6095119714736938, "step": 1290 }, { "epoch": 0.13109794529181898, "grad_norm": 76.54405212402344, "learning_rate": 8.690128062922253e-07, "logits/chosen": 1.7318273782730103, "logits/rejected": NaN, "logps/chosen": -584.7431030273438, "logps/rejected": -367.18505859375, "loss": 0.4692, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6070036292076111, "rewards/margins": 1.0485517978668213, "rewards/rejected": -1.6555554866790771, "step": 1300 }, { "epoch": 0.132106391024833, "grad_norm": 199.39630126953125, "learning_rate": 8.680044368256529e-07, "logits/chosen": 1.6785860061645508, "logits/rejected": NaN, "logps/chosen": -546.9725341796875, "logps/rejected": -298.91375732421875, "loss": 0.4905, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6675860285758972, "rewards/margins": 0.9407965540885925, "rewards/rejected": -1.6083825826644897, "step": 1310 }, { "epoch": 0.13311483675784697, "grad_norm": 89.01335144042969, "learning_rate": 8.669960673590804e-07, "logits/chosen": 1.8249495029449463, "logits/rejected": NaN, "logps/chosen": -640.6610107421875, "logps/rejected": -290.7347412109375, "loss": 0.4387, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5270609855651855, "rewards/margins": 1.1261849403381348, "rewards/rejected": -1.6532456874847412, "step": 1320 }, { "epoch": 0.13412328249086097, "grad_norm": 58.411170959472656, "learning_rate": 8.659876978925078e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -486.3694763183594, "logps/rejected": -363.1666564941406, "loss": 0.3547, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.36276665329933167, "rewards/margins": 1.2171679735183716, "rewards/rejected": -1.5799347162246704, "step": 1330 }, { "epoch": 0.13513172822387495, "grad_norm": 151.51341247558594, "learning_rate": 8.649793284259352e-07, "logits/chosen": 1.8102407455444336, "logits/rejected": NaN, "logps/chosen": -548.1889038085938, "logps/rejected": -378.2693176269531, "loss": 0.5385, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5679073333740234, "rewards/margins": 0.8544720411300659, "rewards/rejected": -1.4223792552947998, "step": 1340 }, { "epoch": 0.13614017395688893, "grad_norm": 249.55361938476562, "learning_rate": 8.639709589593627e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -482.70721435546875, "logps/rejected": -530.0214233398438, "loss": 0.506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.12833204865455627, "rewards/margins": 0.9130407571792603, "rewards/rejected": -1.0413727760314941, "step": 1350 }, { "epoch": 0.13714861968990294, "grad_norm": 114.95024108886719, "learning_rate": 8.629625894927901e-07, "logits/chosen": 1.7555482387542725, "logits/rejected": 1.80849289894104, "logps/chosen": -511.44830322265625, "logps/rejected": -348.4994812011719, "loss": 0.4477, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5222009420394897, "rewards/margins": 1.1996185779571533, "rewards/rejected": -1.721819519996643, "step": 1360 }, { "epoch": 0.13815706542291692, "grad_norm": 59.44681167602539, "learning_rate": 8.619542200262175e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -530.890625, "logps/rejected": -346.5772705078125, "loss": 0.4611, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4434468150138855, "rewards/margins": 0.8285759687423706, "rewards/rejected": -1.2720229625701904, "step": 1370 }, { "epoch": 0.13916551115593093, "grad_norm": 131.08067321777344, "learning_rate": 8.60945850559645e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -656.5137329101562, "logps/rejected": -348.9764709472656, "loss": 0.5123, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6014164686203003, "rewards/margins": 1.288649082183838, "rewards/rejected": -1.8900655508041382, "step": 1380 }, { "epoch": 0.1401739568889449, "grad_norm": 86.28978729248047, "learning_rate": 8.599374810930725e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -569.0958862304688, "logps/rejected": -353.709228515625, "loss": 0.3273, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5361255407333374, "rewards/margins": 1.714626669883728, "rewards/rejected": -2.2507522106170654, "step": 1390 }, { "epoch": 0.1411824026219589, "grad_norm": 13.89314079284668, "learning_rate": 8.589291116265e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -573.155517578125, "logps/rejected": -289.60595703125, "loss": 0.4303, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.2833048701286316, "rewards/margins": 1.4503955841064453, "rewards/rejected": -1.7337003946304321, "step": 1400 }, { "epoch": 0.1421908483549729, "grad_norm": 88.95187377929688, "learning_rate": 8.579207421599274e-07, "logits/chosen": 1.9648587703704834, "logits/rejected": NaN, "logps/chosen": -626.3477783203125, "logps/rejected": -441.12237548828125, "loss": 0.4148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.34361034631729126, "rewards/margins": 1.165665864944458, "rewards/rejected": -1.5092761516571045, "step": 1410 }, { "epoch": 0.1431992940879869, "grad_norm": 30.296091079711914, "learning_rate": 8.569123726933548e-07, "logits/chosen": 1.9573132991790771, "logits/rejected": NaN, "logps/chosen": -511.6756896972656, "logps/rejected": -311.41571044921875, "loss": 0.2947, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.3877282738685608, "rewards/margins": 1.9452402591705322, "rewards/rejected": -2.3329684734344482, "step": 1420 }, { "epoch": 0.14420773982100088, "grad_norm": 45.109092712402344, "learning_rate": 8.559040032267822e-07, "logits/chosen": 1.712241530418396, "logits/rejected": NaN, "logps/chosen": -585.326171875, "logps/rejected": -292.2222900390625, "loss": 0.3052, "rewards/accuracies": 0.875, "rewards/chosen": -0.40458256006240845, "rewards/margins": 1.5798141956329346, "rewards/rejected": -1.9843966960906982, "step": 1430 }, { "epoch": 0.1452161855540149, "grad_norm": 88.45269012451172, "learning_rate": 8.548956337602096e-07, "logits/chosen": 1.8128883838653564, "logits/rejected": NaN, "logps/chosen": -583.7403564453125, "logps/rejected": -293.77728271484375, "loss": 0.4221, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5213754773139954, "rewards/margins": 1.4944360256195068, "rewards/rejected": -2.0158114433288574, "step": 1440 }, { "epoch": 0.14622463128702887, "grad_norm": 100.88516998291016, "learning_rate": 8.538872642936372e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -587.214599609375, "logps/rejected": -395.2303771972656, "loss": 0.5098, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.71574866771698, "rewards/margins": 1.0546101331710815, "rewards/rejected": -1.7703590393066406, "step": 1450 }, { "epoch": 0.14723307702004285, "grad_norm": 94.43812561035156, "learning_rate": 8.528788948270646e-07, "logits/chosen": 1.8621253967285156, "logits/rejected": NaN, "logps/chosen": -529.9680786132812, "logps/rejected": -297.25189208984375, "loss": 0.5038, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.934593677520752, "rewards/margins": 1.126517653465271, "rewards/rejected": -2.0611112117767334, "step": 1460 }, { "epoch": 0.14824152275305685, "grad_norm": 35.163055419921875, "learning_rate": 8.518705253604921e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -470.20465087890625, "logps/rejected": -372.54327392578125, "loss": 0.3861, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.36359265446662903, "rewards/margins": 1.7590663433074951, "rewards/rejected": -2.122659206390381, "step": 1470 }, { "epoch": 0.14924996848607083, "grad_norm": 87.27070617675781, "learning_rate": 8.508621558939195e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -471.7169494628906, "logps/rejected": -460.80633544921875, "loss": 0.5701, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.046809196472168, "rewards/margins": 1.1679044961929321, "rewards/rejected": -2.2147135734558105, "step": 1480 }, { "epoch": 0.15025841421908484, "grad_norm": 71.5147705078125, "learning_rate": 8.498537864273469e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -519.0262451171875, "logps/rejected": -392.48150634765625, "loss": 0.41, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0342988967895508, "rewards/margins": 1.152693748474121, "rewards/rejected": -2.186992883682251, "step": 1490 }, { "epoch": 0.15126685995209882, "grad_norm": 105.94913482666016, "learning_rate": 8.488454169607744e-07, "logits/chosen": 1.7435325384140015, "logits/rejected": NaN, "logps/chosen": -420.8480529785156, "logps/rejected": -224.700927734375, "loss": 0.4674, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1692605018615723, "rewards/margins": 1.2622137069702148, "rewards/rejected": -2.431474447250366, "step": 1500 }, { "epoch": 0.15227530568511283, "grad_norm": 87.79023742675781, "learning_rate": 8.478370474942018e-07, "logits/chosen": 1.4190092086791992, "logits/rejected": NaN, "logps/chosen": -552.2002563476562, "logps/rejected": -288.13201904296875, "loss": 0.4598, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9159367680549622, "rewards/margins": 1.236507773399353, "rewards/rejected": -2.152444362640381, "step": 1510 }, { "epoch": 0.1532837514181268, "grad_norm": 50.55667495727539, "learning_rate": 8.468286780276293e-07, "logits/chosen": 1.6109117269515991, "logits/rejected": NaN, "logps/chosen": -585.0997314453125, "logps/rejected": -427.2538146972656, "loss": 0.4325, "rewards/accuracies": 0.75, "rewards/chosen": -0.7913145422935486, "rewards/margins": 1.5010710954666138, "rewards/rejected": -2.2923855781555176, "step": 1520 }, { "epoch": 0.15429219715114081, "grad_norm": 63.762184143066406, "learning_rate": 8.458203085610567e-07, "logits/chosen": 1.904990553855896, "logits/rejected": NaN, "logps/chosen": -499.215576171875, "logps/rejected": -391.56329345703125, "loss": 0.4448, "rewards/accuracies": 0.75, "rewards/chosen": -1.0365979671478271, "rewards/margins": 1.1849958896636963, "rewards/rejected": -2.2215938568115234, "step": 1530 }, { "epoch": 0.1553006428841548, "grad_norm": 65.4466552734375, "learning_rate": 8.448119390944842e-07, "logits/chosen": 1.9056084156036377, "logits/rejected": NaN, "logps/chosen": -565.1134033203125, "logps/rejected": -314.1097717285156, "loss": 0.3883, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8398957252502441, "rewards/margins": 1.3923864364624023, "rewards/rejected": -2.2322821617126465, "step": 1540 }, { "epoch": 0.1563090886171688, "grad_norm": 50.940208435058594, "learning_rate": 8.438035696279117e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -475.05767822265625, "logps/rejected": -396.0668029785156, "loss": 0.5513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.193230390548706, "rewards/margins": 1.1830987930297852, "rewards/rejected": -2.376329183578491, "step": 1550 }, { "epoch": 0.15731753435018278, "grad_norm": 28.385560989379883, "learning_rate": 8.427952001613391e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -492.05010986328125, "logps/rejected": -446.9898376464844, "loss": 0.4974, "rewards/accuracies": 0.625, "rewards/chosen": -1.1237256526947021, "rewards/margins": 1.145969033241272, "rewards/rejected": -2.2696948051452637, "step": 1560 }, { "epoch": 0.15832598008319676, "grad_norm": 80.76138305664062, "learning_rate": 8.417868306947665e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -471.1390075683594, "logps/rejected": -494.5360412597656, "loss": 0.4446, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2216306924819946, "rewards/margins": 1.0701006650924683, "rewards/rejected": -2.291731357574463, "step": 1570 }, { "epoch": 0.15933442581621077, "grad_norm": 74.08111572265625, "learning_rate": 8.407784612281939e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -487.59490966796875, "logps/rejected": -418.0879821777344, "loss": 0.5122, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2439204454421997, "rewards/margins": 1.2361605167388916, "rewards/rejected": -2.480081081390381, "step": 1580 }, { "epoch": 0.16034287154922475, "grad_norm": 37.55466079711914, "learning_rate": 8.397700917616213e-07, "logits/chosen": 1.6143312454223633, "logits/rejected": NaN, "logps/chosen": -551.8643188476562, "logps/rejected": -357.4889221191406, "loss": 0.3811, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0015610456466675, "rewards/margins": 1.657225251197815, "rewards/rejected": -2.6587862968444824, "step": 1590 }, { "epoch": 0.16135131728223875, "grad_norm": 156.14662170410156, "learning_rate": 8.387617222950489e-07, "logits/chosen": 1.4694721698760986, "logits/rejected": NaN, "logps/chosen": -645.520263671875, "logps/rejected": -335.8185729980469, "loss": 0.4537, "rewards/accuracies": 0.75, "rewards/chosen": -1.1308567523956299, "rewards/margins": 1.328317642211914, "rewards/rejected": -2.459174394607544, "step": 1600 }, { "epoch": 0.16235976301525273, "grad_norm": 154.37950134277344, "learning_rate": 8.377533528284764e-07, "logits/chosen": 1.789122223854065, "logits/rejected": NaN, "logps/chosen": -588.742431640625, "logps/rejected": -290.56512451171875, "loss": 0.4833, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8018652200698853, "rewards/margins": 1.8585789203643799, "rewards/rejected": -2.6604437828063965, "step": 1610 }, { "epoch": 0.16336820874826674, "grad_norm": 91.900146484375, "learning_rate": 8.367449833619038e-07, "logits/chosen": 1.7136714458465576, "logits/rejected": NaN, "logps/chosen": -574.3074951171875, "logps/rejected": -328.9071044921875, "loss": 0.4359, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3669966459274292, "rewards/margins": 1.4050681591033936, "rewards/rejected": -2.7720649242401123, "step": 1620 }, { "epoch": 0.16437665448128072, "grad_norm": 109.7617416381836, "learning_rate": 8.357366138953312e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -599.9327392578125, "logps/rejected": -416.9254455566406, "loss": 0.4662, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1706746816635132, "rewards/margins": 1.649876356124878, "rewards/rejected": -2.8205509185791016, "step": 1630 }, { "epoch": 0.16538510021429473, "grad_norm": 55.03628921508789, "learning_rate": 8.347282444287586e-07, "logits/chosen": 1.788271188735962, "logits/rejected": NaN, "logps/chosen": -597.9560546875, "logps/rejected": -380.1611328125, "loss": 0.5172, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8079506754875183, "rewards/margins": 1.2244170904159546, "rewards/rejected": -2.032367467880249, "step": 1640 }, { "epoch": 0.1663935459473087, "grad_norm": 74.35514068603516, "learning_rate": 8.337198749621861e-07, "logits/chosen": 1.7706184387207031, "logits/rejected": NaN, "logps/chosen": -625.16357421875, "logps/rejected": -428.3868103027344, "loss": 0.5226, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8178674578666687, "rewards/margins": 1.2670639753341675, "rewards/rejected": -2.0849313735961914, "step": 1650 }, { "epoch": 0.16740199168032271, "grad_norm": 67.06239318847656, "learning_rate": 8.327115054956135e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -575.0252685546875, "logps/rejected": -364.90643310546875, "loss": 0.3781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6374253034591675, "rewards/margins": 1.8019918203353882, "rewards/rejected": -2.4394171237945557, "step": 1660 }, { "epoch": 0.1684104374133367, "grad_norm": 144.6909942626953, "learning_rate": 8.31703136029041e-07, "logits/chosen": 1.7298057079315186, "logits/rejected": 1.75667405128479, "logps/chosen": -547.5020141601562, "logps/rejected": -412.74420166015625, "loss": 0.5474, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0984045267105103, "rewards/margins": 1.25821852684021, "rewards/rejected": -2.3566231727600098, "step": 1670 }, { "epoch": 0.16941888314635067, "grad_norm": 57.246742248535156, "learning_rate": 8.306947665624684e-07, "logits/chosen": 1.7590707540512085, "logits/rejected": NaN, "logps/chosen": -555.2813720703125, "logps/rejected": -405.0220947265625, "loss": 0.3105, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.61640465259552, "rewards/margins": 1.6220016479492188, "rewards/rejected": -2.238406181335449, "step": 1680 }, { "epoch": 0.17042732887936468, "grad_norm": 105.54542541503906, "learning_rate": 8.29686397095896e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -549.3219604492188, "logps/rejected": -340.60797119140625, "loss": 0.4586, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1657884120941162, "rewards/margins": 1.6510579586029053, "rewards/rejected": -2.8168463706970215, "step": 1690 }, { "epoch": 0.17143577461237866, "grad_norm": 125.93199920654297, "learning_rate": 8.286780276293234e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -590.2093505859375, "logps/rejected": -370.90045166015625, "loss": 0.3613, "rewards/accuracies": 0.75, "rewards/chosen": -1.1933696269989014, "rewards/margins": 1.6587345600128174, "rewards/rejected": -2.8521041870117188, "step": 1700 }, { "epoch": 0.17244422034539267, "grad_norm": 124.79438781738281, "learning_rate": 8.276696581627508e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -509.90338134765625, "logps/rejected": -310.9976501464844, "loss": 0.4815, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0932681560516357, "rewards/margins": 1.2486772537231445, "rewards/rejected": -2.3419454097747803, "step": 1710 }, { "epoch": 0.17345266607840665, "grad_norm": 50.837554931640625, "learning_rate": 8.266612886961782e-07, "logits/chosen": 1.7238906621932983, "logits/rejected": 1.8655446767807007, "logps/chosen": -465.48504638671875, "logps/rejected": -388.8497619628906, "loss": 0.5216, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2274278402328491, "rewards/margins": 1.307990550994873, "rewards/rejected": -2.535418748855591, "step": 1720 }, { "epoch": 0.17446111181142065, "grad_norm": 123.06155395507812, "learning_rate": 8.256529192296056e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.5633544921875, "logps/rejected": -397.83544921875, "loss": 0.4688, "rewards/accuracies": 0.75, "rewards/chosen": -0.959812343120575, "rewards/margins": 2.004218578338623, "rewards/rejected": -2.964031219482422, "step": 1730 }, { "epoch": 0.17546955754443463, "grad_norm": 189.8430633544922, "learning_rate": 8.246445497630332e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -467.323974609375, "logps/rejected": -430.00286865234375, "loss": 0.7068, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.295811414718628, "rewards/margins": 0.6055411100387573, "rewards/rejected": -1.9013526439666748, "step": 1740 }, { "epoch": 0.17647800327744864, "grad_norm": 27.961915969848633, "learning_rate": 8.236361802964606e-07, "logits/chosen": 1.626708745956421, "logits/rejected": NaN, "logps/chosen": -527.3577880859375, "logps/rejected": -371.4434509277344, "loss": 0.4112, "rewards/accuracies": 0.75, "rewards/chosen": -0.5709960460662842, "rewards/margins": 1.5636094808578491, "rewards/rejected": -2.134605646133423, "step": 1750 }, { "epoch": 0.17748644901046262, "grad_norm": 98.64136505126953, "learning_rate": 8.226278108298881e-07, "logits/chosen": 1.6097615957260132, "logits/rejected": NaN, "logps/chosen": -484.410888671875, "logps/rejected": -297.02581787109375, "loss": 0.3471, "rewards/accuracies": 0.75, "rewards/chosen": -0.902869701385498, "rewards/margins": 1.5133922100067139, "rewards/rejected": -2.416262149810791, "step": 1760 }, { "epoch": 0.17849489474347663, "grad_norm": 111.43708038330078, "learning_rate": 8.216194413633155e-07, "logits/chosen": 1.708630919456482, "logits/rejected": NaN, "logps/chosen": -507.36260986328125, "logps/rejected": -353.63970947265625, "loss": 0.3914, "rewards/accuracies": 0.75, "rewards/chosen": -0.5541878938674927, "rewards/margins": 1.920843482017517, "rewards/rejected": -2.4750313758850098, "step": 1770 }, { "epoch": 0.1795033404764906, "grad_norm": 54.48271942138672, "learning_rate": 8.206110718967429e-07, "logits/chosen": 1.4232394695281982, "logits/rejected": NaN, "logps/chosen": -494.6046447753906, "logps/rejected": -335.6606140136719, "loss": 0.3915, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8979905843734741, "rewards/margins": 1.6160156726837158, "rewards/rejected": -2.5140061378479004, "step": 1780 }, { "epoch": 0.1805117862095046, "grad_norm": 209.8251495361328, "learning_rate": 8.196027024301704e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -567.2044677734375, "logps/rejected": -427.18231201171875, "loss": 0.5475, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9597899317741394, "rewards/margins": 1.1182506084442139, "rewards/rejected": -2.078040599822998, "step": 1790 }, { "epoch": 0.1815202319425186, "grad_norm": 69.91799926757812, "learning_rate": 8.185943329635978e-07, "logits/chosen": NaN, "logits/rejected": 1.565045714378357, "logps/chosen": -397.81121826171875, "logps/rejected": -401.9210205078125, "loss": 0.5171, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0350182056427002, "rewards/margins": 1.2388756275177002, "rewards/rejected": -2.2738938331604004, "step": 1800 }, { "epoch": 0.18252867767553257, "grad_norm": 186.0284423828125, "learning_rate": 8.175859634970252e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -438.23370361328125, "logps/rejected": -373.0047912597656, "loss": 0.466, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.869978129863739, "rewards/margins": 1.4184216260910034, "rewards/rejected": -2.2883996963500977, "step": 1810 }, { "epoch": 0.18353712340854658, "grad_norm": 38.56039047241211, "learning_rate": 8.165775940304527e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.6175537109375, "logps/rejected": -465.5536193847656, "loss": 0.3976, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7710731029510498, "rewards/margins": 1.9189125299453735, "rewards/rejected": -2.689985752105713, "step": 1820 }, { "epoch": 0.18454556914156056, "grad_norm": 252.23475646972656, "learning_rate": 8.155692245638802e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -480.16259765625, "logps/rejected": -378.3122253417969, "loss": 0.4133, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9501357078552246, "rewards/margins": 1.7546335458755493, "rewards/rejected": -2.7047691345214844, "step": 1830 }, { "epoch": 0.18555401487457457, "grad_norm": 60.908267974853516, "learning_rate": 8.145608550973077e-07, "logits/chosen": 1.5061416625976562, "logits/rejected": NaN, "logps/chosen": -778.4315185546875, "logps/rejected": -374.4270935058594, "loss": 0.3702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.096657395362854, "rewards/margins": 1.6222938299179077, "rewards/rejected": -1.7189514636993408, "step": 1840 }, { "epoch": 0.18656246060758855, "grad_norm": 218.84530639648438, "learning_rate": 8.135524856307351e-07, "logits/chosen": 1.8033186197280884, "logits/rejected": NaN, "logps/chosen": -576.3825073242188, "logps/rejected": -403.41717529296875, "loss": 0.5645, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8355556726455688, "rewards/margins": 1.462955355644226, "rewards/rejected": -2.298511028289795, "step": 1850 }, { "epoch": 0.18757090634060256, "grad_norm": 89.60678100585938, "learning_rate": 8.125441161641625e-07, "logits/chosen": 1.5946104526519775, "logits/rejected": NaN, "logps/chosen": -583.9129638671875, "logps/rejected": -377.573486328125, "loss": 0.383, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.728221595287323, "rewards/margins": 1.717071294784546, "rewards/rejected": -2.4452929496765137, "step": 1860 }, { "epoch": 0.18857935207361654, "grad_norm": 136.88418579101562, "learning_rate": 8.115357466975899e-07, "logits/chosen": 1.6940301656723022, "logits/rejected": NaN, "logps/chosen": -634.5401000976562, "logps/rejected": -286.6044921875, "loss": 0.6739, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8777866363525391, "rewards/margins": 0.6369501352310181, "rewards/rejected": -1.5147368907928467, "step": 1870 }, { "epoch": 0.18958779780663054, "grad_norm": 159.0320281982422, "learning_rate": 8.105273772310173e-07, "logits/chosen": 1.5649113655090332, "logits/rejected": NaN, "logps/chosen": -515.5249633789062, "logps/rejected": -269.4950866699219, "loss": 0.3977, "rewards/accuracies": 0.75, "rewards/chosen": -0.7100602388381958, "rewards/margins": 1.4324700832366943, "rewards/rejected": -2.1425302028656006, "step": 1880 }, { "epoch": 0.19059624353964452, "grad_norm": 63.055755615234375, "learning_rate": 8.095190077644449e-07, "logits/chosen": 1.7817327976226807, "logits/rejected": NaN, "logps/chosen": -630.0707397460938, "logps/rejected": -350.133056640625, "loss": 0.4292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7094494700431824, "rewards/margins": 1.2989227771759033, "rewards/rejected": -2.0083725452423096, "step": 1890 }, { "epoch": 0.1916046892726585, "grad_norm": 54.897499084472656, "learning_rate": 8.085106382978723e-07, "logits/chosen": 1.7635011672973633, "logits/rejected": NaN, "logps/chosen": -524.6170654296875, "logps/rejected": -420.756591796875, "loss": 0.585, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9524825215339661, "rewards/margins": 1.565629243850708, "rewards/rejected": -2.5181117057800293, "step": 1900 }, { "epoch": 0.1926131350056725, "grad_norm": 71.32386779785156, "learning_rate": 8.075022688312998e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -564.5474853515625, "logps/rejected": -274.607177734375, "loss": 0.4104, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.549919843673706, "rewards/margins": 1.4801199436187744, "rewards/rejected": -2.0300395488739014, "step": 1910 }, { "epoch": 0.1936215807386865, "grad_norm": 175.72093200683594, "learning_rate": 8.064938993647272e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.4932861328125, "logps/rejected": -322.52484130859375, "loss": 0.4179, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8264533877372742, "rewards/margins": 1.5101540088653564, "rewards/rejected": -2.3366074562072754, "step": 1920 }, { "epoch": 0.1946300264717005, "grad_norm": 25.58595848083496, "learning_rate": 8.054855298981546e-07, "logits/chosen": 1.828822135925293, "logits/rejected": NaN, "logps/chosen": -562.4783325195312, "logps/rejected": -333.0569763183594, "loss": 0.468, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8724918365478516, "rewards/margins": 1.4793860912322998, "rewards/rejected": -2.3518776893615723, "step": 1930 }, { "epoch": 0.19563847220471448, "grad_norm": 218.72621154785156, "learning_rate": 8.044771604315821e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -490.88031005859375, "logps/rejected": -342.4007568359375, "loss": 0.5238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.051348090171814, "rewards/margins": 1.2238709926605225, "rewards/rejected": -2.275219202041626, "step": 1940 }, { "epoch": 0.19664691793772848, "grad_norm": 92.49273681640625, "learning_rate": 8.034687909650095e-07, "logits/chosen": 2.0275068283081055, "logits/rejected": NaN, "logps/chosen": -587.2249145507812, "logps/rejected": -296.5369873046875, "loss": 0.5486, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4399775266647339, "rewards/margins": 0.858544647693634, "rewards/rejected": -2.2985222339630127, "step": 1950 }, { "epoch": 0.19765536367074246, "grad_norm": 239.87744140625, "learning_rate": 8.02460421498437e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -430.6083068847656, "logps/rejected": -410.880615234375, "loss": 0.5887, "rewards/accuracies": 0.625, "rewards/chosen": -0.868424117565155, "rewards/margins": 1.4788777828216553, "rewards/rejected": -2.347301959991455, "step": 1960 }, { "epoch": 0.19866380940375647, "grad_norm": 182.8596954345703, "learning_rate": 8.014520520318644e-07, "logits/chosen": 1.8574390411376953, "logits/rejected": NaN, "logps/chosen": -625.5231323242188, "logps/rejected": -315.65411376953125, "loss": 0.6331, "rewards/accuracies": 0.625, "rewards/chosen": -0.6613037586212158, "rewards/margins": 1.3761616945266724, "rewards/rejected": -2.0374653339385986, "step": 1970 }, { "epoch": 0.19967225513677045, "grad_norm": 41.07072067260742, "learning_rate": 8.00443682565292e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -524.3724365234375, "logps/rejected": -358.7198181152344, "loss": 0.5173, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.25017470121383667, "rewards/margins": 1.193847417831421, "rewards/rejected": -1.444022297859192, "step": 1980 }, { "epoch": 0.20068070086978446, "grad_norm": 137.80401611328125, "learning_rate": 7.994353130987194e-07, "logits/chosen": 1.7679952383041382, "logits/rejected": NaN, "logps/chosen": -592.4173583984375, "logps/rejected": -436.49029541015625, "loss": 0.4512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4717985689640045, "rewards/margins": 1.0863823890686035, "rewards/rejected": -1.5581810474395752, "step": 1990 }, { "epoch": 0.20168914660279844, "grad_norm": 72.2203598022461, "learning_rate": 7.984269436321468e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -485.794189453125, "logps/rejected": -369.32574462890625, "loss": 0.624, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6184436678886414, "rewards/margins": 1.139943242073059, "rewards/rejected": -1.7583868503570557, "step": 2000 }, { "epoch": 0.20269759233581242, "grad_norm": 5.841154098510742, "learning_rate": 7.974185741655742e-07, "logits/chosen": 1.6365827322006226, "logits/rejected": NaN, "logps/chosen": -510.53643798828125, "logps/rejected": -379.68048095703125, "loss": 0.492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5257982015609741, "rewards/margins": 1.4405415058135986, "rewards/rejected": -1.9663397073745728, "step": 2010 }, { "epoch": 0.20370603806882642, "grad_norm": 165.34373474121094, "learning_rate": 7.964102046990016e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.5963134765625, "logps/rejected": -422.0406188964844, "loss": 0.4689, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6381744146347046, "rewards/margins": 1.2962640523910522, "rewards/rejected": -1.9344384670257568, "step": 2020 }, { "epoch": 0.2047144838018404, "grad_norm": 319.8710021972656, "learning_rate": 7.95401835232429e-07, "logits/chosen": 1.898681640625, "logits/rejected": NaN, "logps/chosen": -473.8670959472656, "logps/rejected": -314.64068603515625, "loss": 0.7522, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.9282987713813782, "rewards/margins": 0.6207016706466675, "rewards/rejected": -1.5490005016326904, "step": 2030 }, { "epoch": 0.2057229295348544, "grad_norm": 162.75157165527344, "learning_rate": 7.943934657658566e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -491.5118713378906, "logps/rejected": -327.0946350097656, "loss": 0.4332, "rewards/accuracies": 0.625, "rewards/chosen": -0.8294345140457153, "rewards/margins": 1.3381013870239258, "rewards/rejected": -2.1675362586975098, "step": 2040 }, { "epoch": 0.2067313752678684, "grad_norm": 60.25227737426758, "learning_rate": 7.933850962992841e-07, "logits/chosen": 1.5691028833389282, "logits/rejected": NaN, "logps/chosen": -528.0642700195312, "logps/rejected": -370.58135986328125, "loss": 0.3893, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6963014602661133, "rewards/margins": 1.2929874658584595, "rewards/rejected": -1.9892886877059937, "step": 2050 }, { "epoch": 0.2077398210008824, "grad_norm": 99.35675811767578, "learning_rate": 7.923767268327115e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -533.9144287109375, "logps/rejected": -421.8441467285156, "loss": 0.464, "rewards/accuracies": 0.75, "rewards/chosen": -0.4543306827545166, "rewards/margins": 1.381632924079895, "rewards/rejected": -1.8359636068344116, "step": 2060 }, { "epoch": 0.20874826673389638, "grad_norm": 73.4193115234375, "learning_rate": 7.913683573661389e-07, "logits/chosen": 1.6391799449920654, "logits/rejected": NaN, "logps/chosen": -585.6121215820312, "logps/rejected": -306.4964904785156, "loss": 0.3206, "rewards/accuracies": 0.75, "rewards/chosen": -0.5080381631851196, "rewards/margins": 1.8413426876068115, "rewards/rejected": -2.3493807315826416, "step": 2070 }, { "epoch": 0.20975671246691038, "grad_norm": 141.93502807617188, "learning_rate": 7.903599878995664e-07, "logits/chosen": 1.6648600101470947, "logits/rejected": NaN, "logps/chosen": -549.5322265625, "logps/rejected": -330.93072509765625, "loss": 0.4027, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5007593631744385, "rewards/margins": 1.6533199548721313, "rewards/rejected": -2.1540791988372803, "step": 2080 }, { "epoch": 0.21076515819992436, "grad_norm": 179.8011016845703, "learning_rate": 7.893516184329938e-07, "logits/chosen": NaN, "logits/rejected": 1.9364051818847656, "logps/chosen": -547.8359375, "logps/rejected": -422.76806640625, "loss": 0.3867, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7950091361999512, "rewards/margins": 1.7989399433135986, "rewards/rejected": -2.59394907951355, "step": 2090 }, { "epoch": 0.21177360393293837, "grad_norm": 100.9329605102539, "learning_rate": 7.883432489664212e-07, "logits/chosen": 1.6080834865570068, "logits/rejected": NaN, "logps/chosen": -614.3760986328125, "logps/rejected": -357.6084899902344, "loss": 0.3884, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.40214720368385315, "rewards/margins": 1.9379644393920898, "rewards/rejected": -2.34011173248291, "step": 2100 }, { "epoch": 0.21278204966595235, "grad_norm": 152.07398986816406, "learning_rate": 7.873348794998487e-07, "logits/chosen": 1.7137635946273804, "logits/rejected": NaN, "logps/chosen": -645.8475341796875, "logps/rejected": -258.60736083984375, "loss": 0.4211, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7974323034286499, "rewards/margins": 1.308822512626648, "rewards/rejected": -2.106255054473877, "step": 2110 }, { "epoch": 0.21379049539896633, "grad_norm": 119.90113067626953, "learning_rate": 7.863265100332761e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -556.4190063476562, "logps/rejected": -380.1941833496094, "loss": 0.508, "rewards/accuracies": 0.75, "rewards/chosen": -0.5271318554878235, "rewards/margins": 1.4741802215576172, "rewards/rejected": -2.001311779022217, "step": 2120 }, { "epoch": 0.21479894113198034, "grad_norm": 41.1197395324707, "learning_rate": 7.853181405667037e-07, "logits/chosen": 1.6385018825531006, "logits/rejected": NaN, "logps/chosen": -506.974609375, "logps/rejected": -330.17364501953125, "loss": 0.4733, "rewards/accuracies": 0.75, "rewards/chosen": -0.8722357749938965, "rewards/margins": 1.4834556579589844, "rewards/rejected": -2.355691432952881, "step": 2130 }, { "epoch": 0.21580738686499432, "grad_norm": 65.2364501953125, "learning_rate": 7.843097711001311e-07, "logits/chosen": 1.5398333072662354, "logits/rejected": NaN, "logps/chosen": -457.68231201171875, "logps/rejected": -345.53106689453125, "loss": 0.6156, "rewards/accuracies": 0.625, "rewards/chosen": -2.1511008739471436, "rewards/margins": 0.7870520353317261, "rewards/rejected": -2.938153028488159, "step": 2140 }, { "epoch": 0.21681583259800832, "grad_norm": 11.16569709777832, "learning_rate": 7.833014016335585e-07, "logits/chosen": NaN, "logits/rejected": 1.970071792602539, "logps/chosen": -494.96405029296875, "logps/rejected": -440.373779296875, "loss": 0.3905, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0274643898010254, "rewards/margins": 1.5672826766967773, "rewards/rejected": -2.594747304916382, "step": 2150 }, { "epoch": 0.2178242783310223, "grad_norm": 69.79710388183594, "learning_rate": 7.822930321669859e-07, "logits/chosen": 1.5311042070388794, "logits/rejected": NaN, "logps/chosen": -582.8330688476562, "logps/rejected": -364.5525817871094, "loss": 0.428, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9418557286262512, "rewards/margins": 1.6594867706298828, "rewards/rejected": -2.6013426780700684, "step": 2160 }, { "epoch": 0.2188327240640363, "grad_norm": 21.943584442138672, "learning_rate": 7.812846627004133e-07, "logits/chosen": 1.7194087505340576, "logits/rejected": NaN, "logps/chosen": -590.8585205078125, "logps/rejected": -415.4112243652344, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": -1.4141565561294556, "rewards/margins": 1.0369315147399902, "rewards/rejected": -2.4510879516601562, "step": 2170 }, { "epoch": 0.2198411697970503, "grad_norm": 100.2394790649414, "learning_rate": 7.802762932338409e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -528.9229736328125, "logps/rejected": -364.94915771484375, "loss": 0.5698, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4496088027954102, "rewards/margins": 1.0846641063690186, "rewards/rejected": -2.5342729091644287, "step": 2180 }, { "epoch": 0.2208496155300643, "grad_norm": 22.402328491210938, "learning_rate": 7.792679237672683e-07, "logits/chosen": 1.615094542503357, "logits/rejected": NaN, "logps/chosen": -558.7181396484375, "logps/rejected": -248.57162475585938, "loss": 0.4021, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.253159761428833, "rewards/margins": 1.4209684133529663, "rewards/rejected": -2.674128293991089, "step": 2190 }, { "epoch": 0.22185806126307828, "grad_norm": 268.4200439453125, "learning_rate": 7.782595543006958e-07, "logits/chosen": 1.5938994884490967, "logits/rejected": NaN, "logps/chosen": -512.0958251953125, "logps/rejected": -325.0887756347656, "loss": 0.4671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4256281852722168, "rewards/margins": 1.4403674602508545, "rewards/rejected": -2.8659956455230713, "step": 2200 }, { "epoch": 0.22286650699609228, "grad_norm": 51.57876205444336, "learning_rate": 7.772511848341232e-07, "logits/chosen": 1.607224464416504, "logits/rejected": NaN, "logps/chosen": -691.4561767578125, "logps/rejected": -385.08233642578125, "loss": 0.3778, "rewards/accuracies": 0.75, "rewards/chosen": -0.7714093327522278, "rewards/margins": 1.8286396265029907, "rewards/rejected": -2.6000492572784424, "step": 2210 }, { "epoch": 0.22387495272910626, "grad_norm": 132.4216766357422, "learning_rate": 7.762428153675506e-07, "logits/chosen": 1.835770845413208, "logits/rejected": NaN, "logps/chosen": -548.4856567382812, "logps/rejected": -395.3909606933594, "loss": 0.5372, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7267765998840332, "rewards/margins": 1.421576976776123, "rewards/rejected": -2.1483535766601562, "step": 2220 }, { "epoch": 0.22488339846212027, "grad_norm": 203.02244567871094, "learning_rate": 7.752344459009781e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -577.2278442382812, "logps/rejected": -471.95538330078125, "loss": 0.4966, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9058005213737488, "rewards/margins": 1.3107765913009644, "rewards/rejected": -2.2165770530700684, "step": 2230 }, { "epoch": 0.22589184419513425, "grad_norm": 157.4184112548828, "learning_rate": 7.742260764344055e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -510.08978271484375, "logps/rejected": -228.8129425048828, "loss": 0.3912, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3755647540092468, "rewards/margins": 1.6218907833099365, "rewards/rejected": -1.9974555969238281, "step": 2240 }, { "epoch": 0.22690028992814823, "grad_norm": 155.34776306152344, "learning_rate": 7.73217706967833e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -628.7304077148438, "logps/rejected": -325.3753356933594, "loss": 0.5463, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4714861512184143, "rewards/margins": 1.2627382278442383, "rewards/rejected": -1.734224557876587, "step": 2250 }, { "epoch": 0.22790873566116224, "grad_norm": 115.18247985839844, "learning_rate": 7.722093375012604e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -496.79119873046875, "logps/rejected": -264.83123779296875, "loss": 0.4095, "rewards/accuracies": 0.75, "rewards/chosen": -0.8858256340026855, "rewards/margins": 1.1401169300079346, "rewards/rejected": -2.025942325592041, "step": 2260 }, { "epoch": 0.22891718139417622, "grad_norm": 150.7806396484375, "learning_rate": 7.712009680346879e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.3043212890625, "logps/rejected": -361.48388671875, "loss": 0.4807, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4845438599586487, "rewards/margins": 1.5212546586990356, "rewards/rejected": -2.005798578262329, "step": 2270 }, { "epoch": 0.22992562712719022, "grad_norm": 142.82101440429688, "learning_rate": 7.701925985681154e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -449.11590576171875, "logps/rejected": -443.8045959472656, "loss": 0.4546, "rewards/accuracies": 0.75, "rewards/chosen": -1.0356857776641846, "rewards/margins": 1.6219100952148438, "rewards/rejected": -2.6575958728790283, "step": 2280 }, { "epoch": 0.2309340728602042, "grad_norm": 193.78204345703125, "learning_rate": 7.691842291015428e-07, "logits/chosen": 1.6071662902832031, "logits/rejected": NaN, "logps/chosen": -614.4429931640625, "logps/rejected": -328.77301025390625, "loss": 0.5486, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6141219735145569, "rewards/margins": 1.1560838222503662, "rewards/rejected": -1.7702058553695679, "step": 2290 }, { "epoch": 0.2319425185932182, "grad_norm": 57.76058578491211, "learning_rate": 7.681758596349702e-07, "logits/chosen": 1.8610271215438843, "logits/rejected": NaN, "logps/chosen": -514.3006591796875, "logps/rejected": -391.39984130859375, "loss": 0.5466, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9285093545913696, "rewards/margins": 1.804166555404663, "rewards/rejected": -2.7326760292053223, "step": 2300 }, { "epoch": 0.2329509643262322, "grad_norm": 141.94435119628906, "learning_rate": 7.671674901683976e-07, "logits/chosen": 1.6550953388214111, "logits/rejected": 1.7953588962554932, "logps/chosen": -463.9625549316406, "logps/rejected": -328.06781005859375, "loss": 0.3912, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2522350549697876, "rewards/margins": 1.600600004196167, "rewards/rejected": -2.852835178375244, "step": 2310 }, { "epoch": 0.2339594100592462, "grad_norm": 119.73535919189453, "learning_rate": 7.66159120701825e-07, "logits/chosen": 1.3166067600250244, "logits/rejected": NaN, "logps/chosen": -454.41961669921875, "logps/rejected": -293.42803955078125, "loss": 0.2801, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9876929521560669, "rewards/margins": 2.037445068359375, "rewards/rejected": -3.0251381397247314, "step": 2320 }, { "epoch": 0.23496785579226018, "grad_norm": 144.8364715576172, "learning_rate": 7.651507512352526e-07, "logits/chosen": 1.7978036403656006, "logits/rejected": NaN, "logps/chosen": -545.7093505859375, "logps/rejected": -338.61968994140625, "loss": 0.6847, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.2666791677474976, "rewards/margins": 0.9218288660049438, "rewards/rejected": -2.1885077953338623, "step": 2330 }, { "epoch": 0.23597630152527418, "grad_norm": 145.67098999023438, "learning_rate": 7.6414238176868e-07, "logits/chosen": 1.45780348777771, "logits/rejected": NaN, "logps/chosen": -582.42578125, "logps/rejected": -364.7970886230469, "loss": 0.4243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6121991872787476, "rewards/margins": 1.8533077239990234, "rewards/rejected": -2.4655070304870605, "step": 2340 }, { "epoch": 0.23698474725828816, "grad_norm": 3.9400382041931152, "learning_rate": 7.631340123021075e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -579.4414672851562, "logps/rejected": -423.7308044433594, "loss": 0.295, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7453855872154236, "rewards/margins": 2.047069787979126, "rewards/rejected": -2.7924551963806152, "step": 2350 }, { "epoch": 0.23799319299130214, "grad_norm": 104.11801147460938, "learning_rate": 7.621256428355349e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -597.3833618164062, "logps/rejected": -546.0389404296875, "loss": 0.461, "rewards/accuracies": 0.75, "rewards/chosen": -0.7621411085128784, "rewards/margins": 1.5675022602081299, "rewards/rejected": -2.3296432495117188, "step": 2360 }, { "epoch": 0.23900163872431615, "grad_norm": 54.56405258178711, "learning_rate": 7.611172733689623e-07, "logits/chosen": 1.5941044092178345, "logits/rejected": NaN, "logps/chosen": -639.1541748046875, "logps/rejected": -365.701904296875, "loss": 0.441, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3692710399627686, "rewards/margins": 1.8323863744735718, "rewards/rejected": -3.20165753364563, "step": 2370 }, { "epoch": 0.24001008445733013, "grad_norm": 95.0191879272461, "learning_rate": 7.601089039023898e-07, "logits/chosen": 1.7837574481964111, "logits/rejected": NaN, "logps/chosen": -533.266357421875, "logps/rejected": -286.6042785644531, "loss": 0.523, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2971585988998413, "rewards/margins": 1.8601329326629639, "rewards/rejected": -3.1572916507720947, "step": 2380 }, { "epoch": 0.24101853019034414, "grad_norm": 259.6350402832031, "learning_rate": 7.591005344358172e-07, "logits/chosen": 1.4583925008773804, "logits/rejected": NaN, "logps/chosen": -453.532470703125, "logps/rejected": -326.9818420410156, "loss": 0.4254, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2248423099517822, "rewards/margins": 1.9169079065322876, "rewards/rejected": -3.1417503356933594, "step": 2390 }, { "epoch": 0.24202697592335812, "grad_norm": 125.44010162353516, "learning_rate": 7.580921649692447e-07, "logits/chosen": 1.3911540508270264, "logits/rejected": NaN, "logps/chosen": -552.0001220703125, "logps/rejected": -367.97003173828125, "loss": 0.6951, "rewards/accuracies": 0.625, "rewards/chosen": -1.268123984336853, "rewards/margins": 1.6227996349334717, "rewards/rejected": -2.8909237384796143, "step": 2400 }, { "epoch": 0.24303542165637212, "grad_norm": 176.10487365722656, "learning_rate": 7.570837955026721e-07, "logits/chosen": 1.5478878021240234, "logits/rejected": NaN, "logps/chosen": -484.7079162597656, "logps/rejected": -324.21392822265625, "loss": 0.4108, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2678320407867432, "rewards/margins": 1.6671159267425537, "rewards/rejected": -2.934947967529297, "step": 2410 }, { "epoch": 0.2440438673893861, "grad_norm": 34.205787658691406, "learning_rate": 7.560754260360997e-07, "logits/chosen": 1.780164122581482, "logits/rejected": NaN, "logps/chosen": -485.09228515625, "logps/rejected": -378.19622802734375, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": -1.2483608722686768, "rewards/margins": 1.8256866931915283, "rewards/rejected": -3.074047565460205, "step": 2420 }, { "epoch": 0.2450523131224001, "grad_norm": 72.36730194091797, "learning_rate": 7.550670565695271e-07, "logits/chosen": 1.6213512420654297, "logits/rejected": NaN, "logps/chosen": -570.633544921875, "logps/rejected": -339.080078125, "loss": 0.9345, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5420787334442139, "rewards/margins": 0.7372657656669617, "rewards/rejected": -2.2793445587158203, "step": 2430 }, { "epoch": 0.2460607588554141, "grad_norm": 9.715612411499023, "learning_rate": 7.540586871029545e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -508.021240234375, "logps/rejected": -371.1828918457031, "loss": 0.3644, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8981480598449707, "rewards/margins": 2.2146592140197754, "rewards/rejected": -3.112807273864746, "step": 2440 }, { "epoch": 0.2470692045884281, "grad_norm": 170.18267822265625, "learning_rate": 7.530503176363819e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.1450805664062, "logps/rejected": -326.12896728515625, "loss": 0.7056, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4102280139923096, "rewards/margins": 1.167496919631958, "rewards/rejected": -2.5777249336242676, "step": 2450 }, { "epoch": 0.24807765032144208, "grad_norm": 410.0872802734375, "learning_rate": 7.520419481698093e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -552.6588134765625, "logps/rejected": -388.3376159667969, "loss": 0.5244, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9543924331665039, "rewards/margins": 1.2503159046173096, "rewards/rejected": -2.2047083377838135, "step": 2460 }, { "epoch": 0.24908609605445606, "grad_norm": 27.46773338317871, "learning_rate": 7.510335787032369e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -482.1988220214844, "logps/rejected": -484.34954833984375, "loss": 0.6443, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7282560467720032, "rewards/margins": 1.219954490661621, "rewards/rejected": -1.94821035861969, "step": 2470 }, { "epoch": 0.25009454178747004, "grad_norm": 164.6046600341797, "learning_rate": 7.500252092366643e-07, "logits/chosen": NaN, "logits/rejected": 1.9425990581512451, "logps/chosen": -452.48675537109375, "logps/rejected": -342.753173828125, "loss": 0.3961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0358684062957764, "rewards/margins": 1.83217453956604, "rewards/rejected": -2.8680427074432373, "step": 2480 }, { "epoch": 0.25110298752048404, "grad_norm": 170.34048461914062, "learning_rate": 7.490168397700918e-07, "logits/chosen": 1.8961429595947266, "logits/rejected": NaN, "logps/chosen": -613.548095703125, "logps/rejected": -429.647705078125, "loss": 0.3279, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5258141160011292, "rewards/margins": 2.4435999393463135, "rewards/rejected": -2.9694137573242188, "step": 2490 }, { "epoch": 0.25211143325349805, "grad_norm": 110.2259292602539, "learning_rate": 7.480084703035192e-07, "logits/chosen": 1.6553165912628174, "logits/rejected": NaN, "logps/chosen": -458.67889404296875, "logps/rejected": -389.0628356933594, "loss": 0.4214, "rewards/accuracies": 0.75, "rewards/chosen": -0.6387160420417786, "rewards/margins": 1.819504976272583, "rewards/rejected": -2.458220958709717, "step": 2500 }, { "epoch": 0.25311987898651206, "grad_norm": 191.50579833984375, "learning_rate": 7.470001008369466e-07, "logits/chosen": 1.6784164905548096, "logits/rejected": NaN, "logps/chosen": -618.94775390625, "logps/rejected": -370.7044677734375, "loss": 0.4613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0917798280715942, "rewards/margins": 1.6074861288070679, "rewards/rejected": -2.699265956878662, "step": 2510 }, { "epoch": 0.254128324719526, "grad_norm": 112.59011840820312, "learning_rate": 7.459917313703741e-07, "logits/chosen": 1.812159776687622, "logits/rejected": 1.86269211769104, "logps/chosen": -398.9075927734375, "logps/rejected": -365.17730712890625, "loss": 0.5341, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0873963832855225, "rewards/margins": 1.434707760810852, "rewards/rejected": -2.522104263305664, "step": 2520 }, { "epoch": 0.25513677045254, "grad_norm": 77.90760040283203, "learning_rate": 7.449833619038015e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -564.9093017578125, "logps/rejected": -426.596923828125, "loss": 0.5627, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1494181156158447, "rewards/margins": 1.671457052230835, "rewards/rejected": -2.8208751678466797, "step": 2530 }, { "epoch": 0.256145216185554, "grad_norm": 14.752852439880371, "learning_rate": 7.439749924372289e-07, "logits/chosen": 1.7935501337051392, "logits/rejected": NaN, "logps/chosen": -535.8850708007812, "logps/rejected": -345.2295227050781, "loss": 0.5979, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1352589130401611, "rewards/margins": 1.645999550819397, "rewards/rejected": -2.7812583446502686, "step": 2540 }, { "epoch": 0.25715366191856803, "grad_norm": 262.66485595703125, "learning_rate": 7.429666229706564e-07, "logits/chosen": 1.489556074142456, "logits/rejected": NaN, "logps/chosen": -499.6598205566406, "logps/rejected": -317.48406982421875, "loss": 0.4756, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8088070750236511, "rewards/margins": 1.837807059288025, "rewards/rejected": -2.6466140747070312, "step": 2550 }, { "epoch": 0.258162107651582, "grad_norm": 132.93092346191406, "learning_rate": 7.419582535040838e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -448.02587890625, "logps/rejected": -399.6867980957031, "loss": 0.6271, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.112184762954712, "rewards/margins": 1.5161631107330322, "rewards/rejected": -2.628347873687744, "step": 2560 }, { "epoch": 0.259170553384596, "grad_norm": 30.839763641357422, "learning_rate": 7.409498840375114e-07, "logits/chosen": 1.9038612842559814, "logits/rejected": NaN, "logps/chosen": -565.9909057617188, "logps/rejected": -266.75555419921875, "loss": 0.4566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2564610242843628, "rewards/margins": 1.710425615310669, "rewards/rejected": -2.9668869972229004, "step": 2570 }, { "epoch": 0.26017899911761, "grad_norm": 193.99737548828125, "learning_rate": 7.399415145709388e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -521.8609619140625, "logps/rejected": -466.43450927734375, "loss": 0.5291, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5588845014572144, "rewards/margins": 1.523636817932129, "rewards/rejected": -2.0825209617614746, "step": 2580 }, { "epoch": 0.26118744485062395, "grad_norm": 84.6798324584961, "learning_rate": 7.389331451043662e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -528.6369018554688, "logps/rejected": -318.6743469238281, "loss": 0.3806, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27497538924217224, "rewards/margins": 1.9807727336883545, "rewards/rejected": -2.2557480335235596, "step": 2590 }, { "epoch": 0.26219589058363796, "grad_norm": 64.8193359375, "learning_rate": 7.379247756377936e-07, "logits/chosen": 1.7432880401611328, "logits/rejected": NaN, "logps/chosen": -547.4859619140625, "logps/rejected": -383.86480712890625, "loss": 0.6778, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6999083161354065, "rewards/margins": 1.165101170539856, "rewards/rejected": -1.8650095462799072, "step": 2600 }, { "epoch": 0.26320433631665197, "grad_norm": 219.97610473632812, "learning_rate": 7.36916406171221e-07, "logits/chosen": 1.649110198020935, "logits/rejected": NaN, "logps/chosen": -504.590087890625, "logps/rejected": -266.2635192871094, "loss": 0.4835, "rewards/accuracies": 0.625, "rewards/chosen": -1.0549371242523193, "rewards/margins": 1.5251573324203491, "rewards/rejected": -2.580094337463379, "step": 2610 }, { "epoch": 0.264212782049666, "grad_norm": 166.34194946289062, "learning_rate": 7.359080367046486e-07, "logits/chosen": 1.6603381633758545, "logits/rejected": NaN, "logps/chosen": -515.037109375, "logps/rejected": -424.4806213378906, "loss": 0.5691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7246321439743042, "rewards/margins": 1.4995510578155518, "rewards/rejected": -2.2241833209991455, "step": 2620 }, { "epoch": 0.2652212277826799, "grad_norm": 151.38735961914062, "learning_rate": 7.34899667238076e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -527.3253173828125, "logps/rejected": -406.7724304199219, "loss": 0.3621, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.2661469280719757, "rewards/margins": 2.2737197875976562, "rewards/rejected": -2.5398666858673096, "step": 2630 }, { "epoch": 0.26622967351569393, "grad_norm": 95.57625579833984, "learning_rate": 7.338912977715035e-07, "logits/chosen": 1.7687381505966187, "logits/rejected": NaN, "logps/chosen": -553.6170654296875, "logps/rejected": -341.1683654785156, "loss": 0.3678, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6317867040634155, "rewards/margins": 1.9889780282974243, "rewards/rejected": -2.6207644939422607, "step": 2640 }, { "epoch": 0.26723811924870794, "grad_norm": 49.43730926513672, "learning_rate": 7.328829283049309e-07, "logits/chosen": 1.9935611486434937, "logits/rejected": NaN, "logps/chosen": -595.8130493164062, "logps/rejected": -418.86297607421875, "loss": 0.5284, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9030882716178894, "rewards/margins": 1.2624473571777344, "rewards/rejected": -2.1655354499816895, "step": 2650 }, { "epoch": 0.26824656498172195, "grad_norm": 78.09941864013672, "learning_rate": 7.318745588383583e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -559.8095092773438, "logps/rejected": -346.9538879394531, "loss": 0.4911, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7961171269416809, "rewards/margins": 1.6261396408081055, "rewards/rejected": -2.4222567081451416, "step": 2660 }, { "epoch": 0.2692550107147359, "grad_norm": 71.80756378173828, "learning_rate": 7.308661893717858e-07, "logits/chosen": 1.8312416076660156, "logits/rejected": NaN, "logps/chosen": -519.1195068359375, "logps/rejected": -326.7002868652344, "loss": 0.3789, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6874632239341736, "rewards/margins": 1.797258734703064, "rewards/rejected": -2.4847218990325928, "step": 2670 }, { "epoch": 0.2702634564477499, "grad_norm": 7.3449320793151855, "learning_rate": 7.298578199052132e-07, "logits/chosen": 1.428767442703247, "logits/rejected": NaN, "logps/chosen": -498.0230407714844, "logps/rejected": -424.8733825683594, "loss": 0.4881, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9475738406181335, "rewards/margins": 1.9582140445709229, "rewards/rejected": -2.905787944793701, "step": 2680 }, { "epoch": 0.2712719021807639, "grad_norm": 196.5313262939453, "learning_rate": 7.288494504386407e-07, "logits/chosen": 1.5515304803848267, "logits/rejected": NaN, "logps/chosen": -542.78271484375, "logps/rejected": -364.09967041015625, "loss": 0.6012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4099458456039429, "rewards/margins": 1.5388596057891846, "rewards/rejected": -2.948805570602417, "step": 2690 }, { "epoch": 0.27228034791377786, "grad_norm": 111.59062194824219, "learning_rate": 7.278410809720681e-07, "logits/chosen": 1.919128179550171, "logits/rejected": NaN, "logps/chosen": -576.684814453125, "logps/rejected": -367.8165588378906, "loss": 0.2965, "rewards/accuracies": 0.75, "rewards/chosen": -1.3175632953643799, "rewards/margins": 2.1886279582977295, "rewards/rejected": -3.5061912536621094, "step": 2700 }, { "epoch": 0.27328879364679187, "grad_norm": 19.128108978271484, "learning_rate": 7.268327115054956e-07, "logits/chosen": 1.7583763599395752, "logits/rejected": NaN, "logps/chosen": -405.2438049316406, "logps/rejected": -371.8138122558594, "loss": 0.6748, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1540334224700928, "rewards/margins": 1.45846426486969, "rewards/rejected": -3.6124978065490723, "step": 2710 }, { "epoch": 0.2742972393798059, "grad_norm": 104.56190490722656, "learning_rate": 7.258243420389231e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -457.45013427734375, "logps/rejected": -394.625732421875, "loss": 0.6604, "rewards/accuracies": 0.625, "rewards/chosen": -1.412913203239441, "rewards/margins": 0.9596582651138306, "rewards/rejected": -2.3725714683532715, "step": 2720 }, { "epoch": 0.2753056851128199, "grad_norm": 107.92532348632812, "learning_rate": 7.248159725723505e-07, "logits/chosen": 1.7524607181549072, "logits/rejected": NaN, "logps/chosen": -558.4683837890625, "logps/rejected": -308.86968994140625, "loss": 0.5601, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.3433908224105835, "rewards/margins": 1.3257077932357788, "rewards/rejected": -2.669098377227783, "step": 2730 }, { "epoch": 0.27631413084583384, "grad_norm": 220.8485870361328, "learning_rate": 7.238076031057779e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -424.4120178222656, "logps/rejected": -337.14129638671875, "loss": 0.6756, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4606966972351074, "rewards/margins": 1.3420073986053467, "rewards/rejected": -2.802704095840454, "step": 2740 }, { "epoch": 0.27732257657884785, "grad_norm": 290.27105712890625, "learning_rate": 7.227992336392053e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -649.4845581054688, "logps/rejected": -354.0214538574219, "loss": 0.5818, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1733417510986328, "rewards/margins": 1.4240167140960693, "rewards/rejected": -2.5973587036132812, "step": 2750 }, { "epoch": 0.27833102231186185, "grad_norm": 50.211387634277344, "learning_rate": 7.217908641726327e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -586.4290161132812, "logps/rejected": -382.0456848144531, "loss": 0.5136, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3010119199752808, "rewards/margins": 1.818412184715271, "rewards/rejected": -3.119424343109131, "step": 2760 }, { "epoch": 0.27933946804487586, "grad_norm": 48.15397262573242, "learning_rate": 7.207824947060603e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -596.1814575195312, "logps/rejected": -330.3123474121094, "loss": 0.3694, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6379673480987549, "rewards/margins": 1.9321517944335938, "rewards/rejected": -2.5701191425323486, "step": 2770 }, { "epoch": 0.2803479137778898, "grad_norm": 167.91026306152344, "learning_rate": 7.197741252394877e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -596.9024658203125, "logps/rejected": -353.5751953125, "loss": 0.3321, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5070847868919373, "rewards/margins": 2.30220890045166, "rewards/rejected": -2.8092939853668213, "step": 2780 }, { "epoch": 0.2813563595109038, "grad_norm": 101.16056060791016, "learning_rate": 7.187657557729152e-07, "logits/chosen": 1.8074672222137451, "logits/rejected": NaN, "logps/chosen": -397.4500732421875, "logps/rejected": -291.8507385253906, "loss": 0.4836, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.652381181716919, "rewards/margins": 1.1885030269622803, "rewards/rejected": -2.84088397026062, "step": 2790 }, { "epoch": 0.2823648052439178, "grad_norm": 147.54644775390625, "learning_rate": 7.177573863063426e-07, "logits/chosen": 1.4282464981079102, "logits/rejected": NaN, "logps/chosen": -593.8984985351562, "logps/rejected": -467.9214782714844, "loss": 0.437, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1432538032531738, "rewards/margins": 1.560249924659729, "rewards/rejected": -2.703503370285034, "step": 2800 }, { "epoch": 0.2833732509769318, "grad_norm": 18.158245086669922, "learning_rate": 7.1674901683977e-07, "logits/chosen": 1.9593874216079712, "logits/rejected": NaN, "logps/chosen": -649.2216186523438, "logps/rejected": -350.6255187988281, "loss": 0.3399, "rewards/accuracies": 0.75, "rewards/chosen": -0.9087544679641724, "rewards/margins": 1.981388807296753, "rewards/rejected": -2.890143394470215, "step": 2810 }, { "epoch": 0.2843816967099458, "grad_norm": 150.5507354736328, "learning_rate": 7.157406473731975e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -480.30731201171875, "logps/rejected": -354.1000671386719, "loss": 0.5959, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.23287034034729, "rewards/margins": 0.9777077436447144, "rewards/rejected": -2.210578203201294, "step": 2820 }, { "epoch": 0.2853901424429598, "grad_norm": 117.60199737548828, "learning_rate": 7.147322779066249e-07, "logits/chosen": 1.5351979732513428, "logits/rejected": NaN, "logps/chosen": -545.8428955078125, "logps/rejected": -318.22369384765625, "loss": 0.345, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0353703498840332, "rewards/margins": 2.0432958602905273, "rewards/rejected": -3.0786659717559814, "step": 2830 }, { "epoch": 0.2863985881759738, "grad_norm": 231.4066925048828, "learning_rate": 7.137239084400524e-07, "logits/chosen": 1.5760313272476196, "logits/rejected": NaN, "logps/chosen": -537.1239013671875, "logps/rejected": -283.151611328125, "loss": 0.4032, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.046946406364441, "rewards/margins": 1.8658668994903564, "rewards/rejected": -2.912813425064087, "step": 2840 }, { "epoch": 0.28740703390898775, "grad_norm": 0.8046279549598694, "learning_rate": 7.127155389734798e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -604.4855346679688, "logps/rejected": -372.2664489746094, "loss": 0.3452, "rewards/accuracies": 0.75, "rewards/chosen": -0.8428137898445129, "rewards/margins": 2.361956834793091, "rewards/rejected": -3.204770565032959, "step": 2850 }, { "epoch": 0.28841547964200176, "grad_norm": 157.5847930908203, "learning_rate": 7.117071695069074e-07, "logits/chosen": 1.8133395910263062, "logits/rejected": NaN, "logps/chosen": -604.6226806640625, "logps/rejected": -451.9273986816406, "loss": 0.544, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9455092549324036, "rewards/margins": 1.0561918020248413, "rewards/rejected": -2.0017008781433105, "step": 2860 }, { "epoch": 0.28942392537501577, "grad_norm": 160.799072265625, "learning_rate": 7.106988000403348e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -462.1366271972656, "logps/rejected": -423.457763671875, "loss": 0.3282, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6760119795799255, "rewards/margins": 2.140988349914551, "rewards/rejected": -2.817000150680542, "step": 2870 }, { "epoch": 0.2904323711080298, "grad_norm": 30.33340072631836, "learning_rate": 7.096904305737622e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -637.7091674804688, "logps/rejected": -336.8854675292969, "loss": 0.4333, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18470549583435059, "rewards/margins": 1.725708246231079, "rewards/rejected": -1.9104137420654297, "step": 2880 }, { "epoch": 0.2914408168410437, "grad_norm": 37.54497146606445, "learning_rate": 7.086820611071896e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -597.8836669921875, "logps/rejected": -491.54766845703125, "loss": 0.5262, "rewards/accuracies": 0.75, "rewards/chosen": -0.5527259707450867, "rewards/margins": 1.8195838928222656, "rewards/rejected": -2.372310161590576, "step": 2890 }, { "epoch": 0.29244926257405773, "grad_norm": 88.00849914550781, "learning_rate": 7.07673691640617e-07, "logits/chosen": 1.6458985805511475, "logits/rejected": NaN, "logps/chosen": -631.8323974609375, "logps/rejected": -316.8956604003906, "loss": 0.3083, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.3832867741584778, "rewards/margins": 2.41545033454895, "rewards/rejected": -2.798737049102783, "step": 2900 }, { "epoch": 0.29345770830707174, "grad_norm": 1.1130412817001343, "learning_rate": 7.066653221740446e-07, "logits/chosen": 1.8335262537002563, "logits/rejected": NaN, "logps/chosen": -570.2293090820312, "logps/rejected": -297.86773681640625, "loss": 0.5372, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7527568340301514, "rewards/margins": 2.062717914581299, "rewards/rejected": -2.81547474861145, "step": 2910 }, { "epoch": 0.2944661540400857, "grad_norm": 91.3023681640625, "learning_rate": 7.05656952707472e-07, "logits/chosen": 1.4999090433120728, "logits/rejected": NaN, "logps/chosen": -520.4133911132812, "logps/rejected": -302.7580871582031, "loss": 0.4624, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7168707847595215, "rewards/margins": 2.3405098915100098, "rewards/rejected": -3.057380199432373, "step": 2920 }, { "epoch": 0.2954745997730997, "grad_norm": 333.427734375, "learning_rate": 7.046485832408995e-07, "logits/chosen": 1.699607491493225, "logits/rejected": NaN, "logps/chosen": -534.2154541015625, "logps/rejected": -324.97113037109375, "loss": 0.5243, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7252119779586792, "rewards/margins": 2.0566823482513428, "rewards/rejected": -2.7818942070007324, "step": 2930 }, { "epoch": 0.2964830455061137, "grad_norm": 16.186016082763672, "learning_rate": 7.036402137743269e-07, "logits/chosen": 1.7305200099945068, "logits/rejected": NaN, "logps/chosen": -541.2529296875, "logps/rejected": -433.34844970703125, "loss": 0.4676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6817449927330017, "rewards/margins": 2.394242525100708, "rewards/rejected": -3.0759875774383545, "step": 2940 }, { "epoch": 0.2974914912391277, "grad_norm": 221.4260711669922, "learning_rate": 7.026318443077543e-07, "logits/chosen": 1.6112010478973389, "logits/rejected": NaN, "logps/chosen": -431.74261474609375, "logps/rejected": -345.25738525390625, "loss": 0.638, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6387237310409546, "rewards/margins": 1.6807358264923096, "rewards/rejected": -3.3194594383239746, "step": 2950 }, { "epoch": 0.29849993697214167, "grad_norm": 100.08946228027344, "learning_rate": 7.016234748411818e-07, "logits/chosen": 1.8291572332382202, "logits/rejected": NaN, "logps/chosen": -614.3261108398438, "logps/rejected": -325.1398620605469, "loss": 0.4748, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3891388177871704, "rewards/margins": 1.9991086721420288, "rewards/rejected": -3.38824725151062, "step": 2960 }, { "epoch": 0.2995083827051557, "grad_norm": 206.70718383789062, "learning_rate": 7.006151053746092e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -471.18597412109375, "logps/rejected": -362.40020751953125, "loss": 0.6637, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5323325395584106, "rewards/margins": 1.8564808368682861, "rewards/rejected": -3.388813018798828, "step": 2970 }, { "epoch": 0.3005168284381697, "grad_norm": 102.30314636230469, "learning_rate": 6.996067359080367e-07, "logits/chosen": 1.5273233652114868, "logits/rejected": NaN, "logps/chosen": -502.9305725097656, "logps/rejected": -380.766357421875, "loss": 0.3993, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9100942611694336, "rewards/margins": 2.078005790710449, "rewards/rejected": -2.9880998134613037, "step": 2980 }, { "epoch": 0.3015252741711837, "grad_norm": 96.65987396240234, "learning_rate": 6.985983664414641e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -575.8670043945312, "logps/rejected": -364.82958984375, "loss": 0.5548, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8050055503845215, "rewards/margins": 1.8426088094711304, "rewards/rejected": -2.6476142406463623, "step": 2990 }, { "epoch": 0.30253371990419764, "grad_norm": 138.4419403076172, "learning_rate": 6.975899969748915e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -567.2286376953125, "logps/rejected": -329.41497802734375, "loss": 0.4635, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5740769505500793, "rewards/margins": 1.6786396503448486, "rewards/rejected": -2.252716541290283, "step": 3000 }, { "epoch": 0.30354216563721165, "grad_norm": 11.622712135314941, "learning_rate": 6.965816275083191e-07, "logits/chosen": 1.456970453262329, "logits/rejected": NaN, "logps/chosen": -470.63671875, "logps/rejected": -337.16162109375, "loss": 0.213, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8300203084945679, "rewards/margins": 3.5500056743621826, "rewards/rejected": -4.380026340484619, "step": 3010 }, { "epoch": 0.30455061137022565, "grad_norm": 103.4345474243164, "learning_rate": 6.955732580417465e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -637.4652099609375, "logps/rejected": -331.386962890625, "loss": 0.4933, "rewards/accuracies": 0.75, "rewards/chosen": -0.543739914894104, "rewards/margins": 1.6545531749725342, "rewards/rejected": -2.1982932090759277, "step": 3020 }, { "epoch": 0.3055590571032396, "grad_norm": 87.68858337402344, "learning_rate": 6.945648885751739e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -571.2727661132812, "logps/rejected": -387.9153137207031, "loss": 0.6312, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3515675067901611, "rewards/margins": 1.5233261585235596, "rewards/rejected": -2.8748936653137207, "step": 3030 }, { "epoch": 0.3065675028362536, "grad_norm": 143.0201416015625, "learning_rate": 6.935565191086013e-07, "logits/chosen": 1.6674740314483643, "logits/rejected": NaN, "logps/chosen": -589.87646484375, "logps/rejected": -389.56890869140625, "loss": 0.5319, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.204554557800293, "rewards/margins": 1.8004175424575806, "rewards/rejected": -3.004971981048584, "step": 3040 }, { "epoch": 0.3075759485692676, "grad_norm": 46.02983856201172, "learning_rate": 6.925481496420287e-07, "logits/chosen": 1.4701087474822998, "logits/rejected": NaN, "logps/chosen": -626.5850219726562, "logps/rejected": -449.83441162109375, "loss": 0.3477, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9685260653495789, "rewards/margins": 2.0168399810791016, "rewards/rejected": -2.985366106033325, "step": 3050 }, { "epoch": 0.30858439430228163, "grad_norm": 13.106485366821289, "learning_rate": 6.915397801754563e-07, "logits/chosen": 1.6949326992034912, "logits/rejected": NaN, "logps/chosen": -609.6736450195312, "logps/rejected": -310.835205078125, "loss": 0.3395, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6886916756629944, "rewards/margins": 2.092254638671875, "rewards/rejected": -2.7809462547302246, "step": 3060 }, { "epoch": 0.3095928400352956, "grad_norm": 39.94696044921875, "learning_rate": 6.905314107088837e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -512.5126953125, "logps/rejected": -498.76690673828125, "loss": 0.5175, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6929945349693298, "rewards/margins": 1.9392430782318115, "rewards/rejected": -2.6322379112243652, "step": 3070 }, { "epoch": 0.3106012857683096, "grad_norm": 11.955205917358398, "learning_rate": 6.895230412423112e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -488.4690856933594, "logps/rejected": -423.74029541015625, "loss": 0.4618, "rewards/accuracies": 0.75, "rewards/chosen": -0.5291376709938049, "rewards/margins": 2.6229262351989746, "rewards/rejected": -3.1520638465881348, "step": 3080 }, { "epoch": 0.3116097315013236, "grad_norm": 53.01707458496094, "learning_rate": 6.885146717757386e-07, "logits/chosen": 1.6666314601898193, "logits/rejected": NaN, "logps/chosen": -554.1953735351562, "logps/rejected": -301.50775146484375, "loss": 0.5078, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0475319623947144, "rewards/margins": 1.749617338180542, "rewards/rejected": -2.797149419784546, "step": 3090 }, { "epoch": 0.3126181772343376, "grad_norm": 20.084775924682617, "learning_rate": 6.87506302309166e-07, "logits/chosen": 2.0380749702453613, "logits/rejected": NaN, "logps/chosen": -677.9608764648438, "logps/rejected": -403.63751220703125, "loss": 0.2602, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.5022567510604858, "rewards/margins": 2.87391996383667, "rewards/rejected": -3.376176357269287, "step": 3100 }, { "epoch": 0.31362662296735155, "grad_norm": 15.88304615020752, "learning_rate": 6.864979328425935e-07, "logits/chosen": 1.6318700313568115, "logits/rejected": NaN, "logps/chosen": -533.0526123046875, "logps/rejected": -382.232666015625, "loss": 0.5735, "rewards/accuracies": 0.75, "rewards/chosen": -1.0380899906158447, "rewards/margins": 1.820133924484253, "rewards/rejected": -2.8582239151000977, "step": 3110 }, { "epoch": 0.31463506870036556, "grad_norm": 259.76202392578125, "learning_rate": 6.854895633760209e-07, "logits/chosen": 1.4746416807174683, "logits/rejected": NaN, "logps/chosen": -535.2973022460938, "logps/rejected": -304.1683654785156, "loss": 0.4859, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1660970449447632, "rewards/margins": 1.7927000522613525, "rewards/rejected": -2.958796977996826, "step": 3120 }, { "epoch": 0.31564351443337957, "grad_norm": 62.52632141113281, "learning_rate": 6.844811939094484e-07, "logits/chosen": 1.739116907119751, "logits/rejected": NaN, "logps/chosen": -543.9408569335938, "logps/rejected": -264.98675537109375, "loss": 0.3583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7202632427215576, "rewards/margins": 1.6879167556762695, "rewards/rejected": -3.408179759979248, "step": 3130 }, { "epoch": 0.3166519601663935, "grad_norm": 153.1205596923828, "learning_rate": 6.834728244428758e-07, "logits/chosen": 1.429940938949585, "logits/rejected": NaN, "logps/chosen": -556.2670288085938, "logps/rejected": -413.5735778808594, "loss": 0.3939, "rewards/accuracies": 0.75, "rewards/chosen": -1.1553301811218262, "rewards/margins": 1.9559180736541748, "rewards/rejected": -3.111248016357422, "step": 3140 }, { "epoch": 0.3176604058994075, "grad_norm": 115.14366912841797, "learning_rate": 6.824644549763034e-07, "logits/chosen": 1.811414122581482, "logits/rejected": NaN, "logps/chosen": -545.9974975585938, "logps/rejected": -313.4206237792969, "loss": 0.7008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0637511014938354, "rewards/margins": 1.41013503074646, "rewards/rejected": -2.473886251449585, "step": 3150 }, { "epoch": 0.31866885163242153, "grad_norm": 209.55320739746094, "learning_rate": 6.814560855097308e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -535.7219848632812, "logps/rejected": -427.1177673339844, "loss": 0.6282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2075788974761963, "rewards/margins": 1.044459581375122, "rewards/rejected": -2.2520384788513184, "step": 3160 }, { "epoch": 0.31967729736543554, "grad_norm": 48.72821807861328, "learning_rate": 6.804477160431582e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -527.2448120117188, "logps/rejected": -373.5837097167969, "loss": 0.6625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7110077738761902, "rewards/margins": 1.4290794134140015, "rewards/rejected": -2.140087127685547, "step": 3170 }, { "epoch": 0.3206857430984495, "grad_norm": 23.718095779418945, "learning_rate": 6.794393465765856e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -583.585205078125, "logps/rejected": -344.48468017578125, "loss": 0.6109, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.2377924621105194, "rewards/margins": 1.5846208333969116, "rewards/rejected": -1.8224132061004639, "step": 3180 }, { "epoch": 0.3216941888314635, "grad_norm": 98.00347900390625, "learning_rate": 6.78430977110013e-07, "logits/chosen": 1.9692833423614502, "logits/rejected": NaN, "logps/chosen": -615.8604125976562, "logps/rejected": -282.8575744628906, "loss": 0.3489, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.30436617136001587, "rewards/margins": 1.8011289834976196, "rewards/rejected": -2.105494976043701, "step": 3190 }, { "epoch": 0.3227026345644775, "grad_norm": 119.05905151367188, "learning_rate": 6.774226076434406e-07, "logits/chosen": 1.9608232975006104, "logits/rejected": NaN, "logps/chosen": -516.8528442382812, "logps/rejected": -337.8375549316406, "loss": 0.387, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2089797556400299, "rewards/margins": 1.772960901260376, "rewards/rejected": -1.9819406270980835, "step": 3200 }, { "epoch": 0.3237110802974915, "grad_norm": 92.98951721191406, "learning_rate": 6.76414238176868e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -610.6563720703125, "logps/rejected": -373.1419372558594, "loss": 0.5976, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6966030597686768, "rewards/margins": 1.4508625268936157, "rewards/rejected": -2.147465467453003, "step": 3210 }, { "epoch": 0.32471952603050547, "grad_norm": 211.26808166503906, "learning_rate": 6.754058687102954e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -714.2242431640625, "logps/rejected": -446.2029724121094, "loss": 0.6896, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3603549599647522, "rewards/margins": 1.5012739896774292, "rewards/rejected": -1.8616288900375366, "step": 3220 }, { "epoch": 0.3257279717635195, "grad_norm": 179.9309844970703, "learning_rate": 6.743974992437229e-07, "logits/chosen": 1.992175817489624, "logits/rejected": NaN, "logps/chosen": -571.25, "logps/rejected": -355.92974853515625, "loss": 0.4915, "rewards/accuracies": 0.75, "rewards/chosen": -0.7340831160545349, "rewards/margins": 1.4801275730133057, "rewards/rejected": -2.2142107486724854, "step": 3230 }, { "epoch": 0.3267364174965335, "grad_norm": 56.3846321105957, "learning_rate": 6.733891297771503e-07, "logits/chosen": 1.7289577722549438, "logits/rejected": NaN, "logps/chosen": -533.2789306640625, "logps/rejected": -306.2665100097656, "loss": 0.3966, "rewards/accuracies": 0.75, "rewards/chosen": -0.6874846816062927, "rewards/margins": 1.9216346740722656, "rewards/rejected": -2.609119415283203, "step": 3240 }, { "epoch": 0.32774486322954743, "grad_norm": 78.0403823852539, "learning_rate": 6.723807603105778e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -468.2471618652344, "logps/rejected": -487.8875427246094, "loss": 0.46, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9059586524963379, "rewards/margins": 1.8746938705444336, "rewards/rejected": -2.7806525230407715, "step": 3250 }, { "epoch": 0.32875330896256144, "grad_norm": 132.9267120361328, "learning_rate": 6.713723908440052e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -457.82977294921875, "logps/rejected": -435.8319396972656, "loss": 0.675, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.287078619003296, "rewards/margins": 1.3155312538146973, "rewards/rejected": -2.602609634399414, "step": 3260 }, { "epoch": 0.32976175469557545, "grad_norm": 58.47158432006836, "learning_rate": 6.703640213774326e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -503.92584228515625, "logps/rejected": -316.51202392578125, "loss": 0.3399, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0532965660095215, "rewards/margins": 1.9416682720184326, "rewards/rejected": -2.994965076446533, "step": 3270 }, { "epoch": 0.33077020042858946, "grad_norm": 113.16185760498047, "learning_rate": 6.693556519108601e-07, "logits/chosen": 1.8823665380477905, "logits/rejected": NaN, "logps/chosen": -547.771240234375, "logps/rejected": -359.2042236328125, "loss": 0.4623, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.024465799331665, "rewards/margins": 1.5478906631469727, "rewards/rejected": -2.5723562240600586, "step": 3280 }, { "epoch": 0.3317786461616034, "grad_norm": 128.8007354736328, "learning_rate": 6.683472824442875e-07, "logits/chosen": 1.7757781744003296, "logits/rejected": NaN, "logps/chosen": -541.6405639648438, "logps/rejected": -302.08294677734375, "loss": 0.4952, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2592871189117432, "rewards/margins": 1.649530053138733, "rewards/rejected": -2.9088170528411865, "step": 3290 }, { "epoch": 0.3327870918946174, "grad_norm": 95.00900268554688, "learning_rate": 6.673389129777151e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -560.3937377929688, "logps/rejected": -272.780029296875, "loss": 0.5264, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8466355800628662, "rewards/margins": 1.6330163478851318, "rewards/rejected": -3.4796524047851562, "step": 3300 }, { "epoch": 0.3337955376276314, "grad_norm": 22.115671157836914, "learning_rate": 6.663305435111425e-07, "logits/chosen": NaN, "logits/rejected": 1.6932318210601807, "logps/chosen": -426.9944763183594, "logps/rejected": -445.82574462890625, "loss": 0.6171, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6484653949737549, "rewards/margins": 1.9021375179290771, "rewards/rejected": -3.550602674484253, "step": 3310 }, { "epoch": 0.33480398336064543, "grad_norm": 25.41225814819336, "learning_rate": 6.653221740445699e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -506.10955810546875, "logps/rejected": -394.4707946777344, "loss": 0.4851, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7970380783081055, "rewards/margins": 1.8561632633209229, "rewards/rejected": -3.653200626373291, "step": 3320 }, { "epoch": 0.3358124290936594, "grad_norm": 54.79415512084961, "learning_rate": 6.643138045779973e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -564.1859130859375, "logps/rejected": -426.16876220703125, "loss": 0.699, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.915785551071167, "rewards/margins": 1.4818310737609863, "rewards/rejected": -3.3976166248321533, "step": 3330 }, { "epoch": 0.3368208748266734, "grad_norm": 104.04317474365234, "learning_rate": 6.633054351114247e-07, "logits/chosen": 1.6655126810073853, "logits/rejected": NaN, "logps/chosen": -535.38232421875, "logps/rejected": -390.3100280761719, "loss": 0.4248, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2973716259002686, "rewards/margins": 1.6717876195907593, "rewards/rejected": -2.9691593647003174, "step": 3340 }, { "epoch": 0.3378293205596874, "grad_norm": 184.26821899414062, "learning_rate": 6.622970656448523e-07, "logits/chosen": 1.7646974325180054, "logits/rejected": NaN, "logps/chosen": -586.7598266601562, "logps/rejected": -317.30035400390625, "loss": 0.3944, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.584472417831421, "rewards/margins": 1.8952229022979736, "rewards/rejected": -3.4796950817108154, "step": 3350 }, { "epoch": 0.33883776629270135, "grad_norm": 256.36468505859375, "learning_rate": 6.612886961782797e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -609.0642700195312, "logps/rejected": -505.37176513671875, "loss": 0.7191, "rewards/accuracies": 0.625, "rewards/chosen": -0.8094870448112488, "rewards/margins": 1.3442363739013672, "rewards/rejected": -2.1537234783172607, "step": 3360 }, { "epoch": 0.33984621202571536, "grad_norm": 66.59915161132812, "learning_rate": 6.602803267117072e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -549.6300048828125, "logps/rejected": -453.1890563964844, "loss": 0.3389, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6638983488082886, "rewards/margins": 1.9837675094604492, "rewards/rejected": -2.6476657390594482, "step": 3370 }, { "epoch": 0.34085465775872936, "grad_norm": 211.04039001464844, "learning_rate": 6.592719572451346e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -443.5753479003906, "logps/rejected": -397.462890625, "loss": 0.5295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8299154043197632, "rewards/margins": 1.6043050289154053, "rewards/rejected": -2.434220552444458, "step": 3380 }, { "epoch": 0.34186310349174337, "grad_norm": 86.42325592041016, "learning_rate": 6.58263587778562e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -512.0333251953125, "logps/rejected": -345.85791015625, "loss": 0.4246, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2326792478561401, "rewards/margins": 1.766666054725647, "rewards/rejected": -2.999345064163208, "step": 3390 }, { "epoch": 0.3428715492247573, "grad_norm": 156.46241760253906, "learning_rate": 6.572552183119895e-07, "logits/chosen": 1.7353382110595703, "logits/rejected": NaN, "logps/chosen": -599.4701538085938, "logps/rejected": -350.35345458984375, "loss": 0.383, "rewards/accuracies": 0.75, "rewards/chosen": -0.8733657598495483, "rewards/margins": 1.8349072933197021, "rewards/rejected": -2.708272933959961, "step": 3400 }, { "epoch": 0.34387999495777133, "grad_norm": 68.168701171875, "learning_rate": 6.562468488454169e-07, "logits/chosen": 1.4037375450134277, "logits/rejected": NaN, "logps/chosen": -614.1039428710938, "logps/rejected": -310.23724365234375, "loss": 0.3558, "rewards/accuracies": 0.75, "rewards/chosen": -0.7078416347503662, "rewards/margins": 1.899627923965454, "rewards/rejected": -2.6074695587158203, "step": 3410 }, { "epoch": 0.34488844069078534, "grad_norm": 31.773550033569336, "learning_rate": 6.552384793788444e-07, "logits/chosen": 1.6205997467041016, "logits/rejected": NaN, "logps/chosen": -590.6087646484375, "logps/rejected": -335.9784240722656, "loss": 0.5045, "rewards/accuracies": 0.75, "rewards/chosen": -1.1371558904647827, "rewards/margins": 1.6307098865509033, "rewards/rejected": -2.7678658962249756, "step": 3420 }, { "epoch": 0.34589688642379934, "grad_norm": 157.49000549316406, "learning_rate": 6.542301099122718e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -623.8073120117188, "logps/rejected": -306.65093994140625, "loss": 0.6304, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.0144927501678467, "rewards/margins": 1.3332573175430298, "rewards/rejected": -2.347750186920166, "step": 3430 }, { "epoch": 0.3469053321568133, "grad_norm": 47.948753356933594, "learning_rate": 6.532217404456992e-07, "logits/chosen": 1.7017829418182373, "logits/rejected": NaN, "logps/chosen": -538.6787109375, "logps/rejected": -364.5447998046875, "loss": 0.5687, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1407740116119385, "rewards/margins": 1.61943781375885, "rewards/rejected": -2.76021146774292, "step": 3440 }, { "epoch": 0.3479137778898273, "grad_norm": 54.674896240234375, "learning_rate": 6.522133709791268e-07, "logits/chosen": 1.8517110347747803, "logits/rejected": NaN, "logps/chosen": -521.1676635742188, "logps/rejected": -397.44183349609375, "loss": 0.6627, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1548333168029785, "rewards/margins": 1.3723925352096558, "rewards/rejected": -2.527226209640503, "step": 3450 }, { "epoch": 0.3489222236228413, "grad_norm": 233.0802764892578, "learning_rate": 6.512050015125542e-07, "logits/chosen": 1.5123026371002197, "logits/rejected": NaN, "logps/chosen": -580.5225830078125, "logps/rejected": -283.5918273925781, "loss": 0.321, "rewards/accuracies": 0.875, "rewards/chosen": -0.7566783428192139, "rewards/margins": 2.020134925842285, "rewards/rejected": -2.776813268661499, "step": 3460 }, { "epoch": 0.34993066935585526, "grad_norm": 184.62254333496094, "learning_rate": 6.501966320459816e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -518.4100341796875, "logps/rejected": -519.8482666015625, "loss": 0.5147, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4963587820529938, "rewards/margins": 1.613224744796753, "rewards/rejected": -2.109583854675293, "step": 3470 }, { "epoch": 0.35093911508886927, "grad_norm": 116.9724349975586, "learning_rate": 6.49188262579409e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -519.6322631835938, "logps/rejected": -342.76129150390625, "loss": 0.2801, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.48519110679626465, "rewards/margins": 2.198723793029785, "rewards/rejected": -2.68391489982605, "step": 3480 }, { "epoch": 0.3519475608218833, "grad_norm": 245.77484130859375, "learning_rate": 6.481798931128365e-07, "logits/chosen": 1.788572907447815, "logits/rejected": NaN, "logps/chosen": -633.8101806640625, "logps/rejected": -293.0058898925781, "loss": 0.4649, "rewards/accuracies": 0.75, "rewards/chosen": -0.862210750579834, "rewards/margins": 1.8306881189346313, "rewards/rejected": -2.6928985118865967, "step": 3490 }, { "epoch": 0.3529560065548973, "grad_norm": 0.18536727130413055, "learning_rate": 6.47171523646264e-07, "logits/chosen": 1.7085316181182861, "logits/rejected": NaN, "logps/chosen": -537.4378662109375, "logps/rejected": -349.874267578125, "loss": 0.3479, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1920970678329468, "rewards/margins": 1.8605502843856812, "rewards/rejected": -3.052647352218628, "step": 3500 }, { "epoch": 0.35396445228791124, "grad_norm": 59.838539123535156, "learning_rate": 6.461631541796914e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -563.9473876953125, "logps/rejected": -345.33502197265625, "loss": 0.3161, "rewards/accuracies": 0.875, "rewards/chosen": -0.34694600105285645, "rewards/margins": 2.3024580478668213, "rewards/rejected": -2.6494038105010986, "step": 3510 }, { "epoch": 0.35497289802092524, "grad_norm": 156.8006591796875, "learning_rate": 6.451547847131189e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -506.9423828125, "logps/rejected": -402.640380859375, "loss": 0.5582, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9207806587219238, "rewards/margins": 1.412514328956604, "rewards/rejected": -2.3332948684692383, "step": 3520 }, { "epoch": 0.35598134375393925, "grad_norm": 66.73566436767578, "learning_rate": 6.441464152465463e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -589.6758422851562, "logps/rejected": -390.7086486816406, "loss": 0.3494, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9867887496948242, "rewards/margins": 1.7956784963607788, "rewards/rejected": -2.7824673652648926, "step": 3530 }, { "epoch": 0.35698978948695326, "grad_norm": 44.40412902832031, "learning_rate": 6.431380457799737e-07, "logits/chosen": 1.561842679977417, "logits/rejected": NaN, "logps/chosen": -593.35302734375, "logps/rejected": -273.7843933105469, "loss": 0.4139, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0215078592300415, "rewards/margins": 1.814978837966919, "rewards/rejected": -2.83648681640625, "step": 3540 }, { "epoch": 0.3579982352199672, "grad_norm": 173.443115234375, "learning_rate": 6.421296763134012e-07, "logits/chosen": 1.7606483697891235, "logits/rejected": NaN, "logps/chosen": -552.3187255859375, "logps/rejected": -304.2648620605469, "loss": 0.4985, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9469742774963379, "rewards/margins": 2.329571485519409, "rewards/rejected": -3.276545763015747, "step": 3550 }, { "epoch": 0.3590066809529812, "grad_norm": 210.75685119628906, "learning_rate": 6.411213068468286e-07, "logits/chosen": 1.6186679601669312, "logits/rejected": NaN, "logps/chosen": -557.2901611328125, "logps/rejected": -408.95574951171875, "loss": 0.3899, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0525527000427246, "rewards/margins": 2.5162429809570312, "rewards/rejected": -3.568796157836914, "step": 3560 }, { "epoch": 0.3600151266859952, "grad_norm": 6.039386749267578, "learning_rate": 6.401129373802561e-07, "logits/chosen": 1.9413810968399048, "logits/rejected": NaN, "logps/chosen": -621.9586791992188, "logps/rejected": -440.716796875, "loss": 0.4374, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.304912805557251, "rewards/margins": 2.2410645484924316, "rewards/rejected": -3.545977830886841, "step": 3570 }, { "epoch": 0.3610235724190092, "grad_norm": 173.27493286132812, "learning_rate": 6.391045679136835e-07, "logits/chosen": 1.3442518711090088, "logits/rejected": NaN, "logps/chosen": -608.280517578125, "logps/rejected": -430.4335021972656, "loss": 0.5575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.083694577217102, "rewards/margins": 1.3799611330032349, "rewards/rejected": -2.463655471801758, "step": 3580 }, { "epoch": 0.3620320181520232, "grad_norm": 136.84335327148438, "learning_rate": 6.380961984471111e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -552.6600341796875, "logps/rejected": -304.96337890625, "loss": 0.4682, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0055354833602905, "rewards/margins": 2.1677277088165283, "rewards/rejected": -3.1732630729675293, "step": 3590 }, { "epoch": 0.3630404638850372, "grad_norm": 87.62886810302734, "learning_rate": 6.370878289805385e-07, "logits/chosen": 1.7259114980697632, "logits/rejected": NaN, "logps/chosen": -542.234619140625, "logps/rejected": -381.1069641113281, "loss": 0.3897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0034946203231812, "rewards/margins": 2.1566720008850098, "rewards/rejected": -3.1601665019989014, "step": 3600 }, { "epoch": 0.3640489096180512, "grad_norm": 29.02617835998535, "learning_rate": 6.360794595139659e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -498.0909729003906, "logps/rejected": -507.94403076171875, "loss": 0.4597, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2267563343048096, "rewards/margins": 2.190605640411377, "rewards/rejected": -3.4173622131347656, "step": 3610 }, { "epoch": 0.36505735535106515, "grad_norm": 105.07489776611328, "learning_rate": 6.350710900473933e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -459.66705322265625, "logps/rejected": -348.5559387207031, "loss": 0.5095, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9223142862319946, "rewards/margins": 1.9945614337921143, "rewards/rejected": -2.9168753623962402, "step": 3620 }, { "epoch": 0.36606580108407916, "grad_norm": 54.909236907958984, "learning_rate": 6.340627205808207e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -617.5711669921875, "logps/rejected": -351.4244689941406, "loss": 0.3117, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.43292102217674255, "rewards/margins": 2.174187660217285, "rewards/rejected": -2.6071085929870605, "step": 3630 }, { "epoch": 0.36707424681709316, "grad_norm": 169.08396911621094, "learning_rate": 6.330543511142483e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -636.6119995117188, "logps/rejected": -322.4748840332031, "loss": 0.4671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5589243769645691, "rewards/margins": 1.8250911235809326, "rewards/rejected": -2.3840155601501465, "step": 3640 }, { "epoch": 0.36808269255010717, "grad_norm": 88.18544006347656, "learning_rate": 6.320459816476757e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.9173889160156, "logps/rejected": -441.3299255371094, "loss": 0.4006, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3754888772964478, "rewards/margins": 2.193976879119873, "rewards/rejected": -3.5694661140441895, "step": 3650 }, { "epoch": 0.3690911382831211, "grad_norm": 184.9099884033203, "learning_rate": 6.310376121811031e-07, "logits/chosen": 1.4736741781234741, "logits/rejected": NaN, "logps/chosen": -492.8067321777344, "logps/rejected": -300.5682373046875, "loss": 0.4648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7085061073303223, "rewards/margins": 1.820259690284729, "rewards/rejected": -2.528765916824341, "step": 3660 }, { "epoch": 0.37009958401613513, "grad_norm": 184.79299926757812, "learning_rate": 6.300292427145306e-07, "logits/chosen": 1.4617186784744263, "logits/rejected": NaN, "logps/chosen": -539.4271240234375, "logps/rejected": -248.5065155029297, "loss": 0.4607, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7515517473220825, "rewards/margins": 1.3062169551849365, "rewards/rejected": -2.0577690601348877, "step": 3670 }, { "epoch": 0.37110802974914914, "grad_norm": 190.2381134033203, "learning_rate": 6.29020873247958e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -520.4519653320312, "logps/rejected": -510.92877197265625, "loss": 0.4793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9442607164382935, "rewards/margins": 1.9152694940567017, "rewards/rejected": -2.859530210494995, "step": 3680 }, { "epoch": 0.3721164754821631, "grad_norm": 95.02617645263672, "learning_rate": 6.280125037813855e-07, "logits/chosen": 1.6672626733779907, "logits/rejected": NaN, "logps/chosen": -596.3797607421875, "logps/rejected": -254.05947875976562, "loss": 0.541, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0093411207199097, "rewards/margins": 1.3963435888290405, "rewards/rejected": -2.40568470954895, "step": 3690 }, { "epoch": 0.3731249212151771, "grad_norm": 80.91893005371094, "learning_rate": 6.270041343148129e-07, "logits/chosen": 1.4562652111053467, "logits/rejected": NaN, "logps/chosen": -605.4266357421875, "logps/rejected": -311.78936767578125, "loss": 0.4472, "rewards/accuracies": 0.75, "rewards/chosen": -0.5828850865364075, "rewards/margins": 2.359252691268921, "rewards/rejected": -2.9421377182006836, "step": 3700 }, { "epoch": 0.3741333669481911, "grad_norm": 11.313995361328125, "learning_rate": 6.259957648482404e-07, "logits/chosen": NaN, "logits/rejected": 1.9917638301849365, "logps/chosen": -444.868408203125, "logps/rejected": -456.8526306152344, "loss": 0.5388, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6727864742279053, "rewards/margins": 1.730553388595581, "rewards/rejected": -3.4033401012420654, "step": 3710 }, { "epoch": 0.3751418126812051, "grad_norm": 141.97230529785156, "learning_rate": 6.249873953816678e-07, "logits/chosen": 1.6879287958145142, "logits/rejected": NaN, "logps/chosen": -512.0956420898438, "logps/rejected": -322.77117919921875, "loss": 0.2441, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7001233100891113, "rewards/margins": 2.778992176055908, "rewards/rejected": -3.4791152477264404, "step": 3720 }, { "epoch": 0.37615025841421906, "grad_norm": 70.40026092529297, "learning_rate": 6.239790259150952e-07, "logits/chosen": 1.647822618484497, "logits/rejected": NaN, "logps/chosen": -650.4662475585938, "logps/rejected": -442.63397216796875, "loss": 0.3189, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5178560018539429, "rewards/margins": 2.2361598014831543, "rewards/rejected": -2.7540159225463867, "step": 3730 }, { "epoch": 0.37715870414723307, "grad_norm": 252.23057556152344, "learning_rate": 6.229706564485228e-07, "logits/chosen": 1.70364248752594, "logits/rejected": NaN, "logps/chosen": -638.6624145507812, "logps/rejected": -453.1658630371094, "loss": 0.4157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8009678721427917, "rewards/margins": 1.7270381450653076, "rewards/rejected": -2.5280065536499023, "step": 3740 }, { "epoch": 0.3781671498802471, "grad_norm": 26.923688888549805, "learning_rate": 6.219622869819502e-07, "logits/chosen": 1.7544126510620117, "logits/rejected": NaN, "logps/chosen": -568.3533935546875, "logps/rejected": -328.2738342285156, "loss": 0.3445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5588886737823486, "rewards/margins": 2.17769193649292, "rewards/rejected": -2.7365806102752686, "step": 3750 }, { "epoch": 0.3791755956132611, "grad_norm": 207.55506896972656, "learning_rate": 6.209539175153776e-07, "logits/chosen": 1.6519191265106201, "logits/rejected": 1.641214370727539, "logps/chosen": -433.1195373535156, "logps/rejected": -383.18719482421875, "loss": 0.5736, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3768250942230225, "rewards/margins": 1.9759976863861084, "rewards/rejected": -3.352822780609131, "step": 3760 }, { "epoch": 0.38018404134627504, "grad_norm": 47.745182037353516, "learning_rate": 6.19945548048805e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -497.775390625, "logps/rejected": -310.59637451171875, "loss": 0.6278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7799538373947144, "rewards/margins": 1.4685113430023193, "rewards/rejected": -2.248465061187744, "step": 3770 }, { "epoch": 0.38119248707928904, "grad_norm": 75.36809539794922, "learning_rate": 6.189371785822324e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -479.631103515625, "logps/rejected": -383.5855712890625, "loss": 0.5239, "rewards/accuracies": 0.625, "rewards/chosen": -1.6786247491836548, "rewards/margins": 1.7305988073349, "rewards/rejected": -3.4092235565185547, "step": 3780 }, { "epoch": 0.38220093281230305, "grad_norm": 11.71839714050293, "learning_rate": 6.1792880911566e-07, "logits/chosen": 1.7146724462509155, "logits/rejected": NaN, "logps/chosen": -590.0692138671875, "logps/rejected": -397.2268981933594, "loss": 0.5144, "rewards/accuracies": 0.625, "rewards/chosen": -1.3737505674362183, "rewards/margins": 1.4234817028045654, "rewards/rejected": -2.7972323894500732, "step": 3790 }, { "epoch": 0.383209378545317, "grad_norm": 11.696369171142578, "learning_rate": 6.169204396490874e-07, "logits/chosen": 1.4996706247329712, "logits/rejected": NaN, "logps/chosen": -588.7529296875, "logps/rejected": -386.5650939941406, "loss": 0.3802, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.4920174479484558, "rewards/margins": 2.3272805213928223, "rewards/rejected": -2.819298028945923, "step": 3800 }, { "epoch": 0.384217824278331, "grad_norm": 152.67344665527344, "learning_rate": 6.159120701825149e-07, "logits/chosen": 1.540018916130066, "logits/rejected": NaN, "logps/chosen": -475.35455322265625, "logps/rejected": -332.8369140625, "loss": 0.335, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.33022141456604, "rewards/margins": 2.2018988132476807, "rewards/rejected": -3.5321204662323, "step": 3810 }, { "epoch": 0.385226270011345, "grad_norm": 40.480194091796875, "learning_rate": 6.149037007159423e-07, "logits/chosen": 1.652557373046875, "logits/rejected": NaN, "logps/chosen": -508.38519287109375, "logps/rejected": -291.1585388183594, "loss": 0.369, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4485963582992554, "rewards/margins": 2.249690532684326, "rewards/rejected": -3.698286533355713, "step": 3820 }, { "epoch": 0.386234715744359, "grad_norm": 74.98438262939453, "learning_rate": 6.138953312493697e-07, "logits/chosen": 1.767273187637329, "logits/rejected": NaN, "logps/chosen": -512.1429443359375, "logps/rejected": -317.9253234863281, "loss": 0.4756, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.5613130331039429, "rewards/margins": 1.5987508296966553, "rewards/rejected": -3.160064220428467, "step": 3830 }, { "epoch": 0.387243161477373, "grad_norm": 61.28828430175781, "learning_rate": 6.128869617827972e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -560.9553833007812, "logps/rejected": -422.44854736328125, "loss": 0.4276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9525057077407837, "rewards/margins": 2.34698486328125, "rewards/rejected": -3.299490451812744, "step": 3840 }, { "epoch": 0.388251607210387, "grad_norm": 22.515825271606445, "learning_rate": 6.118785923162246e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -583.5960693359375, "logps/rejected": -390.02496337890625, "loss": 0.3605, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.573672890663147, "rewards/margins": 1.520553708076477, "rewards/rejected": -2.094226598739624, "step": 3850 }, { "epoch": 0.389260052943401, "grad_norm": 108.26931762695312, "learning_rate": 6.108702228496521e-07, "logits/chosen": 1.6153924465179443, "logits/rejected": NaN, "logps/chosen": -589.0557861328125, "logps/rejected": -340.9940490722656, "loss": 0.349, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.3369407057762146, "rewards/margins": 2.3051018714904785, "rewards/rejected": -2.642042875289917, "step": 3860 }, { "epoch": 0.390268498676415, "grad_norm": 203.40350341796875, "learning_rate": 6.098618533830795e-07, "logits/chosen": 1.534265398979187, "logits/rejected": NaN, "logps/chosen": -578.19921875, "logps/rejected": -338.0447692871094, "loss": 0.4962, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.2643982470035553, "rewards/margins": 2.1318533420562744, "rewards/rejected": -2.396251678466797, "step": 3870 }, { "epoch": 0.39127694440942895, "grad_norm": 115.27473449707031, "learning_rate": 6.088534839165069e-07, "logits/chosen": 1.4981003999710083, "logits/rejected": NaN, "logps/chosen": -549.837158203125, "logps/rejected": -232.5082244873047, "loss": 0.4236, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6413615345954895, "rewards/margins": 1.9097785949707031, "rewards/rejected": -2.551140069961548, "step": 3880 }, { "epoch": 0.39228539014244296, "grad_norm": 43.50807571411133, "learning_rate": 6.078451144499345e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -483.69268798828125, "logps/rejected": -413.7159118652344, "loss": 0.4144, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2843953371047974, "rewards/margins": 2.2373530864715576, "rewards/rejected": -3.5217483043670654, "step": 3890 }, { "epoch": 0.39329383587545697, "grad_norm": 5.693430423736572, "learning_rate": 6.068367449833619e-07, "logits/chosen": 1.5520070791244507, "logits/rejected": NaN, "logps/chosen": -510.68914794921875, "logps/rejected": -352.09234619140625, "loss": 0.5292, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7404828071594238, "rewards/margins": 1.4856044054031372, "rewards/rejected": -2.2260870933532715, "step": 3900 }, { "epoch": 0.3943022816084709, "grad_norm": 8.056243896484375, "learning_rate": 6.058283755167893e-07, "logits/chosen": 1.620603322982788, "logits/rejected": NaN, "logps/chosen": -576.4569091796875, "logps/rejected": -443.3910217285156, "loss": 0.5399, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7398021221160889, "rewards/margins": 1.9854276180267334, "rewards/rejected": -3.7252297401428223, "step": 3910 }, { "epoch": 0.3953107273414849, "grad_norm": 210.5326385498047, "learning_rate": 6.048200060502167e-07, "logits/chosen": 1.6805047988891602, "logits/rejected": NaN, "logps/chosen": -574.6917724609375, "logps/rejected": -284.5795593261719, "loss": 0.7825, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.058550477027893, "rewards/margins": 1.1762487888336182, "rewards/rejected": -2.2347991466522217, "step": 3920 }, { "epoch": 0.39631917307449893, "grad_norm": 57.80302047729492, "learning_rate": 6.038116365836443e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -652.9698486328125, "logps/rejected": -471.9789123535156, "loss": 0.5631, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6150510907173157, "rewards/margins": 1.3613585233688354, "rewards/rejected": -1.9764095544815063, "step": 3930 }, { "epoch": 0.39732761880751294, "grad_norm": 202.5684356689453, "learning_rate": 6.028032671170717e-07, "logits/chosen": 1.7421996593475342, "logits/rejected": NaN, "logps/chosen": -555.5596923828125, "logps/rejected": -396.2425842285156, "loss": 0.3199, "rewards/accuracies": 0.75, "rewards/chosen": -0.6756970286369324, "rewards/margins": 1.9577713012695312, "rewards/rejected": -2.6334681510925293, "step": 3940 }, { "epoch": 0.3983360645405269, "grad_norm": 84.43403625488281, "learning_rate": 6.017948976504991e-07, "logits/chosen": 1.6816151142120361, "logits/rejected": NaN, "logps/chosen": -492.9303283691406, "logps/rejected": -375.31732177734375, "loss": 0.5445, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3933931589126587, "rewards/margins": 1.9401251077651978, "rewards/rejected": -3.3335182666778564, "step": 3950 }, { "epoch": 0.3993445102735409, "grad_norm": 243.8534698486328, "learning_rate": 6.007865281839266e-07, "logits/chosen": 1.3755054473876953, "logits/rejected": NaN, "logps/chosen": -501.4220275878906, "logps/rejected": -406.6123962402344, "loss": 0.4273, "rewards/accuracies": 0.75, "rewards/chosen": -1.8837515115737915, "rewards/margins": 2.1410152912139893, "rewards/rejected": -4.02476692199707, "step": 3960 }, { "epoch": 0.4003529560065549, "grad_norm": 64.44049072265625, "learning_rate": 5.99778158717354e-07, "logits/chosen": 1.6389961242675781, "logits/rejected": NaN, "logps/chosen": -490.0501403808594, "logps/rejected": -304.538818359375, "loss": 0.412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5860422849655151, "rewards/margins": 1.8993914127349854, "rewards/rejected": -3.485433578491211, "step": 3970 }, { "epoch": 0.4013614017395689, "grad_norm": 132.62554931640625, "learning_rate": 5.987697892507815e-07, "logits/chosen": 1.489654779434204, "logits/rejected": NaN, "logps/chosen": -526.4230346679688, "logps/rejected": -350.1199035644531, "loss": 0.4536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9600766897201538, "rewards/margins": 2.2826759815216064, "rewards/rejected": -3.2427525520324707, "step": 3980 }, { "epoch": 0.40236984747258286, "grad_norm": 35.800777435302734, "learning_rate": 5.977614197842089e-07, "logits/chosen": 1.4253251552581787, "logits/rejected": NaN, "logps/chosen": -561.9373779296875, "logps/rejected": -482.2264099121094, "loss": 0.4697, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0585181713104248, "rewards/margins": 1.8819860219955444, "rewards/rejected": -2.9405040740966797, "step": 3990 }, { "epoch": 0.40337829320559687, "grad_norm": 69.98756408691406, "learning_rate": 5.967530503176363e-07, "logits/chosen": 1.6769574880599976, "logits/rejected": NaN, "logps/chosen": -671.810791015625, "logps/rejected": -292.33062744140625, "loss": 0.3705, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5923532247543335, "rewards/margins": 2.526444911956787, "rewards/rejected": -3.118798017501831, "step": 4000 }, { "epoch": 0.4043867389386109, "grad_norm": 122.55999755859375, "learning_rate": 5.957446808510638e-07, "logits/chosen": 1.6422111988067627, "logits/rejected": NaN, "logps/chosen": -504.37237548828125, "logps/rejected": -419.7996520996094, "loss": 0.4858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7482072114944458, "rewards/margins": 1.8248579502105713, "rewards/rejected": -2.5730652809143066, "step": 4010 }, { "epoch": 0.40539518467162483, "grad_norm": 52.42949676513672, "learning_rate": 5.947363113844912e-07, "logits/chosen": 1.2471247911453247, "logits/rejected": NaN, "logps/chosen": -539.8175048828125, "logps/rejected": -330.17974853515625, "loss": 0.4477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9326076507568359, "rewards/margins": 2.520432233810425, "rewards/rejected": -3.4530398845672607, "step": 4020 }, { "epoch": 0.40640363040463884, "grad_norm": 225.33372497558594, "learning_rate": 5.937279419179188e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -448.8936462402344, "logps/rejected": -362.58270263671875, "loss": 0.5389, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6148602962493896, "rewards/margins": 1.6259329319000244, "rewards/rejected": -3.240793228149414, "step": 4030 }, { "epoch": 0.40741207613765285, "grad_norm": 114.03205108642578, "learning_rate": 5.927195724513462e-07, "logits/chosen": 1.597710371017456, "logits/rejected": NaN, "logps/chosen": -716.5460205078125, "logps/rejected": -375.8825378417969, "loss": 0.4478, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4965025186538696, "rewards/margins": 1.8384227752685547, "rewards/rejected": -3.3349251747131348, "step": 4040 }, { "epoch": 0.40842052187066685, "grad_norm": 189.4359588623047, "learning_rate": 5.917112029847736e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.6817626953125, "logps/rejected": -386.5060119628906, "loss": 0.716, "rewards/accuracies": 0.625, "rewards/chosen": -1.8564609289169312, "rewards/margins": 1.3360648155212402, "rewards/rejected": -3.192525625228882, "step": 4050 }, { "epoch": 0.4094289676036808, "grad_norm": 117.99310302734375, "learning_rate": 5.90702833518201e-07, "logits/chosen": 1.429871678352356, "logits/rejected": NaN, "logps/chosen": -466.87872314453125, "logps/rejected": -428.1151428222656, "loss": 0.392, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8973134756088257, "rewards/margins": 2.0176098346710205, "rewards/rejected": -3.9149234294891357, "step": 4060 }, { "epoch": 0.4104374133366948, "grad_norm": 46.79047775268555, "learning_rate": 5.896944640516284e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -556.6370849609375, "logps/rejected": -361.1519775390625, "loss": 0.4208, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1517561674118042, "rewards/margins": 1.6862589120864868, "rewards/rejected": -2.838015079498291, "step": 4070 }, { "epoch": 0.4114458590697088, "grad_norm": 139.36883544921875, "learning_rate": 5.88686094585056e-07, "logits/chosen": 1.3954551219940186, "logits/rejected": NaN, "logps/chosen": -558.5635986328125, "logps/rejected": -390.52313232421875, "loss": 0.4372, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1181975603103638, "rewards/margins": 1.965226173400879, "rewards/rejected": -3.083423614501953, "step": 4080 }, { "epoch": 0.4124543048027228, "grad_norm": 37.95014190673828, "learning_rate": 5.876777251184834e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.5006408691406, "logps/rejected": -411.3868103027344, "loss": 0.5753, "rewards/accuracies": 0.625, "rewards/chosen": -1.1917355060577393, "rewards/margins": 1.7602119445800781, "rewards/rejected": -2.9519474506378174, "step": 4090 }, { "epoch": 0.4134627505357368, "grad_norm": 97.14158630371094, "learning_rate": 5.866693556519108e-07, "logits/chosen": 1.5778579711914062, "logits/rejected": 1.3754836320877075, "logps/chosen": -379.03753662109375, "logps/rejected": -314.56549072265625, "loss": 0.7268, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.501950740814209, "rewards/margins": 1.1077344417572021, "rewards/rejected": -3.609684705734253, "step": 4100 }, { "epoch": 0.4144711962687508, "grad_norm": 181.57656860351562, "learning_rate": 5.856609861853383e-07, "logits/chosen": 1.263698935508728, "logits/rejected": 1.3991825580596924, "logps/chosen": -522.3001098632812, "logps/rejected": -405.28668212890625, "loss": 0.485, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.451515793800354, "rewards/margins": 1.8299888372421265, "rewards/rejected": -3.2815048694610596, "step": 4110 }, { "epoch": 0.4154796420017648, "grad_norm": 122.43319702148438, "learning_rate": 5.846526167187657e-07, "logits/chosen": 1.6333427429199219, "logits/rejected": NaN, "logps/chosen": -617.8841552734375, "logps/rejected": -398.8130187988281, "loss": 0.4585, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7418090105056763, "rewards/margins": 1.9400484561920166, "rewards/rejected": -2.6818573474884033, "step": 4120 }, { "epoch": 0.41648808773477874, "grad_norm": 100.00398254394531, "learning_rate": 5.836442472521932e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -583.6390380859375, "logps/rejected": -407.5420837402344, "loss": 0.4805, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7779208421707153, "rewards/margins": 1.7694175243377686, "rewards/rejected": -2.5473382472991943, "step": 4130 }, { "epoch": 0.41749653346779275, "grad_norm": 137.33670043945312, "learning_rate": 5.826358777856206e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -478.35711669921875, "logps/rejected": -430.98388671875, "loss": 0.566, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4705655574798584, "rewards/margins": 1.5935211181640625, "rewards/rejected": -3.064086437225342, "step": 4140 }, { "epoch": 0.41850497920080676, "grad_norm": 109.06259155273438, "learning_rate": 5.816275083190481e-07, "logits/chosen": 1.5967411994934082, "logits/rejected": NaN, "logps/chosen": -656.5986328125, "logps/rejected": -407.042724609375, "loss": 0.4611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1769273281097412, "rewards/margins": 2.3525924682617188, "rewards/rejected": -3.529520034790039, "step": 4150 }, { "epoch": 0.41951342493382077, "grad_norm": 144.8237762451172, "learning_rate": 5.806191388524755e-07, "logits/chosen": 1.5965038537979126, "logits/rejected": NaN, "logps/chosen": -555.91015625, "logps/rejected": -402.7421875, "loss": 0.4875, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5031068325042725, "rewards/margins": 1.750393271446228, "rewards/rejected": -3.253500461578369, "step": 4160 }, { "epoch": 0.4205218706668347, "grad_norm": 152.27493286132812, "learning_rate": 5.796107693859029e-07, "logits/chosen": 1.3254958391189575, "logits/rejected": NaN, "logps/chosen": -426.2080993652344, "logps/rejected": -352.7328796386719, "loss": 0.4514, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4947086572647095, "rewards/margins": 1.615459680557251, "rewards/rejected": -3.110168695449829, "step": 4170 }, { "epoch": 0.4215303163998487, "grad_norm": 43.365413665771484, "learning_rate": 5.786023999193305e-07, "logits/chosen": 1.5577284097671509, "logits/rejected": NaN, "logps/chosen": -571.78173828125, "logps/rejected": -293.7032165527344, "loss": 0.4249, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6563616991043091, "rewards/margins": 1.8456289768218994, "rewards/rejected": -2.501990556716919, "step": 4180 }, { "epoch": 0.42253876213286273, "grad_norm": 35.5644416809082, "learning_rate": 5.775940304527579e-07, "logits/chosen": 1.4292614459991455, "logits/rejected": NaN, "logps/chosen": -610.0234375, "logps/rejected": -346.844970703125, "loss": 0.4394, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8061065673828125, "rewards/margins": 2.127896785736084, "rewards/rejected": -2.9340033531188965, "step": 4190 }, { "epoch": 0.42354720786587674, "grad_norm": 52.95961380004883, "learning_rate": 5.765856609861853e-07, "logits/chosen": 1.5529544353485107, "logits/rejected": NaN, "logps/chosen": -533.760009765625, "logps/rejected": -371.79058837890625, "loss": 0.4512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4031693935394287, "rewards/margins": 1.8525946140289307, "rewards/rejected": -3.2557640075683594, "step": 4200 }, { "epoch": 0.4245556535988907, "grad_norm": 66.0121078491211, "learning_rate": 5.755772915196127e-07, "logits/chosen": NaN, "logits/rejected": 1.5616247653961182, "logps/chosen": -513.6884765625, "logps/rejected": -512.3397216796875, "loss": 0.3099, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7389770746231079, "rewards/margins": 2.5257909297943115, "rewards/rejected": -3.26476788520813, "step": 4210 }, { "epoch": 0.4255640993319047, "grad_norm": 157.05810546875, "learning_rate": 5.745689220530402e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -600.3240966796875, "logps/rejected": -363.29327392578125, "loss": 0.4029, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6818476915359497, "rewards/margins": 2.7657899856567383, "rewards/rejected": -3.4476375579833984, "step": 4220 }, { "epoch": 0.4265725450649187, "grad_norm": 151.86996459960938, "learning_rate": 5.735605525864677e-07, "logits/chosen": 1.5301154851913452, "logits/rejected": NaN, "logps/chosen": -566.4988403320312, "logps/rejected": -329.52423095703125, "loss": 0.471, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2719032764434814, "rewards/margins": 2.152406692504883, "rewards/rejected": -3.4243102073669434, "step": 4230 }, { "epoch": 0.42758099079793266, "grad_norm": 174.85569763183594, "learning_rate": 5.725521831198951e-07, "logits/chosen": 1.6469628810882568, "logits/rejected": NaN, "logps/chosen": -457.0606384277344, "logps/rejected": -397.58819580078125, "loss": 0.6413, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.2452739477157593, "rewards/margins": 1.254475474357605, "rewards/rejected": -2.4997494220733643, "step": 4240 }, { "epoch": 0.42858943653094667, "grad_norm": 58.41826629638672, "learning_rate": 5.715438136533226e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.6073608398438, "logps/rejected": -433.80694580078125, "loss": 0.459, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1171821355819702, "rewards/margins": 1.8525562286376953, "rewards/rejected": -2.969738483428955, "step": 4250 }, { "epoch": 0.4295978822639607, "grad_norm": 129.27159118652344, "learning_rate": 5.7053544418675e-07, "logits/chosen": 1.6527026891708374, "logits/rejected": NaN, "logps/chosen": -508.5929260253906, "logps/rejected": -391.79376220703125, "loss": 0.352, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3613532781600952, "rewards/margins": 1.9072084426879883, "rewards/rejected": -3.268561601638794, "step": 4260 }, { "epoch": 0.4306063279969747, "grad_norm": 9.675626754760742, "learning_rate": 5.695270747201774e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.9437866210938, "logps/rejected": -353.5788879394531, "loss": 0.5185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4921672344207764, "rewards/margins": 1.9716295003890991, "rewards/rejected": -3.463797092437744, "step": 4270 }, { "epoch": 0.43161477372998863, "grad_norm": 64.62217712402344, "learning_rate": 5.685187052536049e-07, "logits/chosen": 1.5155948400497437, "logits/rejected": NaN, "logps/chosen": -536.0640258789062, "logps/rejected": -328.10015869140625, "loss": 0.5167, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.783372163772583, "rewards/margins": 1.995296835899353, "rewards/rejected": -3.7786688804626465, "step": 4280 }, { "epoch": 0.43262321946300264, "grad_norm": 85.75222778320312, "learning_rate": 5.675103357870323e-07, "logits/chosen": 1.3476461172103882, "logits/rejected": NaN, "logps/chosen": -512.7155151367188, "logps/rejected": -302.65484619140625, "loss": 0.5604, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6254361867904663, "rewards/margins": 2.018463611602783, "rewards/rejected": -3.6439003944396973, "step": 4290 }, { "epoch": 0.43363166519601665, "grad_norm": 63.727935791015625, "learning_rate": 5.665019663204598e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -510.27545166015625, "logps/rejected": -381.00115966796875, "loss": 0.4353, "rewards/accuracies": 0.75, "rewards/chosen": -1.3049139976501465, "rewards/margins": 2.477607250213623, "rewards/rejected": -3.7825214862823486, "step": 4300 }, { "epoch": 0.43464011092903065, "grad_norm": 140.1616973876953, "learning_rate": 5.654935968538872e-07, "logits/chosen": 1.6106706857681274, "logits/rejected": NaN, "logps/chosen": -608.1015625, "logps/rejected": -319.47357177734375, "loss": 0.4076, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9157381057739258, "rewards/margins": 2.337883472442627, "rewards/rejected": -3.2536215782165527, "step": 4310 }, { "epoch": 0.4356485566620446, "grad_norm": 89.08137512207031, "learning_rate": 5.644852273873146e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -423.23602294921875, "logps/rejected": -424.0057067871094, "loss": 0.5154, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4783896207809448, "rewards/margins": 1.7338167428970337, "rewards/rejected": -3.2122066020965576, "step": 4320 }, { "epoch": 0.4366570023950586, "grad_norm": 194.09494018554688, "learning_rate": 5.634768579207422e-07, "logits/chosen": 1.505573034286499, "logits/rejected": NaN, "logps/chosen": -491.3028259277344, "logps/rejected": -328.41680908203125, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": -1.6686432361602783, "rewards/margins": 1.6620012521743774, "rewards/rejected": -3.330644130706787, "step": 4330 }, { "epoch": 0.4376654481280726, "grad_norm": 214.2198944091797, "learning_rate": 5.624684884541696e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -555.1901245117188, "logps/rejected": -468.8614196777344, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3777920007705688, "rewards/margins": 2.2957611083984375, "rewards/rejected": -3.673552989959717, "step": 4340 }, { "epoch": 0.43867389386108663, "grad_norm": 45.32879638671875, "learning_rate": 5.61460118987597e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -528.4771728515625, "logps/rejected": -381.917724609375, "loss": 0.411, "rewards/accuracies": 0.75, "rewards/chosen": -0.9477904438972473, "rewards/margins": 2.0054335594177246, "rewards/rejected": -2.953223943710327, "step": 4350 }, { "epoch": 0.4396823395941006, "grad_norm": 28.005102157592773, "learning_rate": 5.604517495210244e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -489.9596252441406, "logps/rejected": -406.16497802734375, "loss": 0.3866, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6705982089042664, "rewards/margins": 2.794673442840576, "rewards/rejected": -3.465271472930908, "step": 4360 }, { "epoch": 0.4406907853271146, "grad_norm": 190.82984924316406, "learning_rate": 5.59443380054452e-07, "logits/chosen": 1.4894737005233765, "logits/rejected": NaN, "logps/chosen": -547.3883666992188, "logps/rejected": -385.62353515625, "loss": 0.3881, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9850652813911438, "rewards/margins": 1.9379165172576904, "rewards/rejected": -2.9229817390441895, "step": 4370 }, { "epoch": 0.4416992310601286, "grad_norm": 201.71046447753906, "learning_rate": 5.584350105878794e-07, "logits/chosen": 1.6291601657867432, "logits/rejected": NaN, "logps/chosen": -605.591552734375, "logps/rejected": -458.922607421875, "loss": 0.5381, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9121249914169312, "rewards/margins": 2.003429889678955, "rewards/rejected": -2.9155545234680176, "step": 4380 }, { "epoch": 0.44270767679314255, "grad_norm": 193.94400024414062, "learning_rate": 5.574266411213068e-07, "logits/chosen": 1.772377371788025, "logits/rejected": NaN, "logps/chosen": -541.2896118164062, "logps/rejected": -430.02020263671875, "loss": 0.7119, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4005473852157593, "rewards/margins": 1.3201513290405273, "rewards/rejected": -2.720698595046997, "step": 4390 }, { "epoch": 0.44371612252615655, "grad_norm": 83.08052062988281, "learning_rate": 5.564182716547343e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -604.8464965820312, "logps/rejected": -414.16265869140625, "loss": 0.3948, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0906370878219604, "rewards/margins": 1.6544996500015259, "rewards/rejected": -2.7451367378234863, "step": 4400 }, { "epoch": 0.44472456825917056, "grad_norm": 167.84817504882812, "learning_rate": 5.554099021881617e-07, "logits/chosen": 1.1320059299468994, "logits/rejected": NaN, "logps/chosen": -483.31842041015625, "logps/rejected": -240.21542358398438, "loss": 0.4302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6944326162338257, "rewards/margins": 2.266078472137451, "rewards/rejected": -2.9605114459991455, "step": 4410 }, { "epoch": 0.44573301399218457, "grad_norm": 102.63973236083984, "learning_rate": 5.544015327215892e-07, "logits/chosen": 1.3714110851287842, "logits/rejected": NaN, "logps/chosen": -518.7888793945312, "logps/rejected": -347.1249084472656, "loss": 0.4794, "rewards/accuracies": 0.75, "rewards/chosen": -1.375427007675171, "rewards/margins": 1.576852798461914, "rewards/rejected": -2.952279806137085, "step": 4420 }, { "epoch": 0.4467414597251985, "grad_norm": 184.66270446777344, "learning_rate": 5.533931632550166e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -556.7737426757812, "logps/rejected": -440.8456115722656, "loss": 0.684, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.1934192180633545, "rewards/margins": 1.663583517074585, "rewards/rejected": -2.8570027351379395, "step": 4430 }, { "epoch": 0.4477499054582125, "grad_norm": 49.795631408691406, "learning_rate": 5.523847937884441e-07, "logits/chosen": 1.739628791809082, "logits/rejected": NaN, "logps/chosen": -598.7860717773438, "logps/rejected": -391.4283142089844, "loss": 0.2914, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.051284670829773, "rewards/margins": 2.565117120742798, "rewards/rejected": -3.6164021492004395, "step": 4440 }, { "epoch": 0.44875835119122653, "grad_norm": 207.968994140625, "learning_rate": 5.513764243218715e-07, "logits/chosen": NaN, "logits/rejected": 1.4489808082580566, "logps/chosen": -418.6727600097656, "logps/rejected": -363.54791259765625, "loss": 0.5615, "rewards/accuracies": 0.625, "rewards/chosen": -1.84619140625, "rewards/margins": 1.27815842628479, "rewards/rejected": -3.124350070953369, "step": 4450 }, { "epoch": 0.44976679692424054, "grad_norm": 96.91659545898438, "learning_rate": 5.503680548552989e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -618.4803466796875, "logps/rejected": -361.6723327636719, "loss": 0.5774, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.165570616722107, "rewards/margins": 1.7326768636703491, "rewards/rejected": -2.898247718811035, "step": 4460 }, { "epoch": 0.4507752426572545, "grad_norm": 141.5798797607422, "learning_rate": 5.493596853887265e-07, "logits/chosen": 1.4284366369247437, "logits/rejected": NaN, "logps/chosen": -471.50213623046875, "logps/rejected": -349.6930236816406, "loss": 0.454, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.1783478260040283, "rewards/margins": 2.209446430206299, "rewards/rejected": -4.387794017791748, "step": 4470 }, { "epoch": 0.4517836883902685, "grad_norm": 9.472570419311523, "learning_rate": 5.483513159221539e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.42724609375, "logps/rejected": -392.6160583496094, "loss": 0.4874, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2431248426437378, "rewards/margins": 1.9787790775299072, "rewards/rejected": -3.2219040393829346, "step": 4480 }, { "epoch": 0.4527921341232825, "grad_norm": 197.8785858154297, "learning_rate": 5.473429464555813e-07, "logits/chosen": 1.4443845748901367, "logits/rejected": 1.4630683660507202, "logps/chosen": -456.0572204589844, "logps/rejected": -422.71270751953125, "loss": 0.549, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2432594299316406, "rewards/margins": 1.8402074575424194, "rewards/rejected": -3.0834670066833496, "step": 4490 }, { "epoch": 0.45380057985629646, "grad_norm": 63.911659240722656, "learning_rate": 5.463345769890087e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -569.6239013671875, "logps/rejected": -419.1607360839844, "loss": 0.552, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4448232650756836, "rewards/margins": 1.5667154788970947, "rewards/rejected": -3.0115389823913574, "step": 4500 }, { "epoch": 0.45480902558931047, "grad_norm": 68.13927459716797, "learning_rate": 5.453262075224361e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -614.5451049804688, "logps/rejected": -440.1087951660156, "loss": 0.3735, "rewards/accuracies": 0.75, "rewards/chosen": -1.0357359647750854, "rewards/margins": 2.1475539207458496, "rewards/rejected": -3.1832897663116455, "step": 4510 }, { "epoch": 0.4558174713223245, "grad_norm": 31.663148880004883, "learning_rate": 5.443178380558637e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -598.2615966796875, "logps/rejected": -405.9281311035156, "loss": 0.6448, "rewards/accuracies": 0.625, "rewards/chosen": -1.1693931818008423, "rewards/margins": 1.885074257850647, "rewards/rejected": -3.0544674396514893, "step": 4520 }, { "epoch": 0.4568259170553385, "grad_norm": 123.67877197265625, "learning_rate": 5.433094685892911e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -537.5609741210938, "logps/rejected": -349.1253967285156, "loss": 0.3135, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6321659088134766, "rewards/margins": 2.3676795959472656, "rewards/rejected": -3.999845504760742, "step": 4530 }, { "epoch": 0.45783436278835243, "grad_norm": 61.56708908081055, "learning_rate": 5.423010991227185e-07, "logits/chosen": 1.4102832078933716, "logits/rejected": NaN, "logps/chosen": -518.025634765625, "logps/rejected": -346.26416015625, "loss": 0.3399, "rewards/accuracies": 0.75, "rewards/chosen": -1.6391569375991821, "rewards/margins": 1.9021146297454834, "rewards/rejected": -3.541271686553955, "step": 4540 }, { "epoch": 0.45884280852136644, "grad_norm": 66.14163208007812, "learning_rate": 5.41292729656146e-07, "logits/chosen": 1.6005128622055054, "logits/rejected": NaN, "logps/chosen": -555.7031860351562, "logps/rejected": -388.61151123046875, "loss": 0.3556, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5323232412338257, "rewards/margins": 2.2010836601257324, "rewards/rejected": -3.7334067821502686, "step": 4550 }, { "epoch": 0.45985125425438045, "grad_norm": 91.23912811279297, "learning_rate": 5.402843601895734e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -492.8399353027344, "logps/rejected": -426.27313232421875, "loss": 0.6275, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3474109172821045, "rewards/margins": 1.5076382160186768, "rewards/rejected": -2.8550493717193604, "step": 4560 }, { "epoch": 0.46085969998739446, "grad_norm": 112.01329040527344, "learning_rate": 5.392759907230009e-07, "logits/chosen": 1.435485601425171, "logits/rejected": NaN, "logps/chosen": -446.0341796875, "logps/rejected": -302.8335876464844, "loss": 0.6821, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2419875860214233, "rewards/margins": 1.6708062887191772, "rewards/rejected": -2.9127936363220215, "step": 4570 }, { "epoch": 0.4618681457204084, "grad_norm": 95.28421020507812, "learning_rate": 5.382676212564283e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -465.76739501953125, "logps/rejected": -382.57305908203125, "loss": 0.3908, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6999707221984863, "rewards/margins": 2.775273323059082, "rewards/rejected": -3.4752438068389893, "step": 4580 }, { "epoch": 0.4628765914534224, "grad_norm": 118.12139892578125, "learning_rate": 5.372592517898558e-07, "logits/chosen": 1.5125601291656494, "logits/rejected": NaN, "logps/chosen": -614.3297729492188, "logps/rejected": -258.90301513671875, "loss": 0.4272, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5639606714248657, "rewards/margins": 1.8652210235595703, "rewards/rejected": -2.4291815757751465, "step": 4590 }, { "epoch": 0.4638850371864364, "grad_norm": 76.14102172851562, "learning_rate": 5.362508823232832e-07, "logits/chosen": 1.5377191305160522, "logits/rejected": NaN, "logps/chosen": -572.2455444335938, "logps/rejected": -302.4320983886719, "loss": 0.3499, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6010004281997681, "rewards/margins": 2.563817262649536, "rewards/rejected": -3.1648173332214355, "step": 4600 }, { "epoch": 0.4648934829194504, "grad_norm": 130.32102966308594, "learning_rate": 5.352425128567106e-07, "logits/chosen": 1.535927176475525, "logits/rejected": NaN, "logps/chosen": -532.5988159179688, "logps/rejected": -369.0326232910156, "loss": 0.5488, "rewards/accuracies": 0.75, "rewards/chosen": -0.7795279026031494, "rewards/margins": 1.747097373008728, "rewards/rejected": -2.526625633239746, "step": 4610 }, { "epoch": 0.4659019286524644, "grad_norm": 133.86636352539062, "learning_rate": 5.342341433901382e-07, "logits/chosen": 1.711208701133728, "logits/rejected": NaN, "logps/chosen": -715.4234619140625, "logps/rejected": -395.22430419921875, "loss": 0.5611, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.210668921470642, "rewards/margins": 1.7535184621810913, "rewards/rejected": -2.9641873836517334, "step": 4620 }, { "epoch": 0.4669103743854784, "grad_norm": 132.37612915039062, "learning_rate": 5.332257739235656e-07, "logits/chosen": 1.632071852684021, "logits/rejected": NaN, "logps/chosen": -541.2522583007812, "logps/rejected": -297.0335693359375, "loss": 0.3584, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9346321225166321, "rewards/margins": 2.474665880203247, "rewards/rejected": -3.4092979431152344, "step": 4630 }, { "epoch": 0.4679188201184924, "grad_norm": 71.59102630615234, "learning_rate": 5.32217404456993e-07, "logits/chosen": 1.5613911151885986, "logits/rejected": NaN, "logps/chosen": -595.2586669921875, "logps/rejected": -382.55206298828125, "loss": 0.421, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3343405723571777, "rewards/margins": 2.347477912902832, "rewards/rejected": -3.6818184852600098, "step": 4640 }, { "epoch": 0.46892726585150635, "grad_norm": 0.14062656462192535, "learning_rate": 5.312090349904204e-07, "logits/chosen": 1.503885269165039, "logits/rejected": NaN, "logps/chosen": -589.4260864257812, "logps/rejected": -401.29931640625, "loss": 0.4664, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7781537771224976, "rewards/margins": 2.0480191707611084, "rewards/rejected": -3.8261730670928955, "step": 4650 }, { "epoch": 0.46993571158452035, "grad_norm": 71.74249267578125, "learning_rate": 5.30200665523848e-07, "logits/chosen": 1.6884628534317017, "logits/rejected": NaN, "logps/chosen": -411.76348876953125, "logps/rejected": -402.2933654785156, "loss": 0.5741, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4376205205917358, "rewards/margins": 2.2933847904205322, "rewards/rejected": -3.7310051918029785, "step": 4660 }, { "epoch": 0.47094415731753436, "grad_norm": 166.36819458007812, "learning_rate": 5.291922960572754e-07, "logits/chosen": 1.0896258354187012, "logits/rejected": NaN, "logps/chosen": -596.0105590820312, "logps/rejected": -312.630859375, "loss": 0.5079, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6633340716362, "rewards/margins": 2.0983474254608154, "rewards/rejected": -2.76168155670166, "step": 4670 }, { "epoch": 0.47195260305054837, "grad_norm": 135.00390625, "learning_rate": 5.281839265907028e-07, "logits/chosen": 1.6991668939590454, "logits/rejected": 1.734618902206421, "logps/chosen": -506.3812561035156, "logps/rejected": -432.89007568359375, "loss": 0.4415, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.593691110610962, "rewards/margins": 1.9533478021621704, "rewards/rejected": -3.547039031982422, "step": 4680 }, { "epoch": 0.4729610487835623, "grad_norm": 45.54102325439453, "learning_rate": 5.271755571241303e-07, "logits/chosen": 1.351874828338623, "logits/rejected": NaN, "logps/chosen": -567.28564453125, "logps/rejected": -313.68463134765625, "loss": 0.6435, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5476908683776855, "rewards/margins": 1.1629033088684082, "rewards/rejected": -2.7105941772460938, "step": 4690 }, { "epoch": 0.47396949451657633, "grad_norm": 1.7519561052322388, "learning_rate": 5.261671876575577e-07, "logits/chosen": 1.6516551971435547, "logits/rejected": NaN, "logps/chosen": -612.2337036132812, "logps/rejected": -390.03802490234375, "loss": 0.4069, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.507053017616272, "rewards/margins": 2.3623180389404297, "rewards/rejected": -3.869370937347412, "step": 4700 }, { "epoch": 0.47497794024959034, "grad_norm": 90.22859954833984, "learning_rate": 5.251588181909852e-07, "logits/chosen": 1.5262432098388672, "logits/rejected": NaN, "logps/chosen": -560.3521728515625, "logps/rejected": -313.284423828125, "loss": 0.5646, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.121290922164917, "rewards/margins": 1.5452945232391357, "rewards/rejected": -2.6665854454040527, "step": 4710 }, { "epoch": 0.4759863859826043, "grad_norm": 83.0454330444336, "learning_rate": 5.241504487244126e-07, "logits/chosen": 1.7650039196014404, "logits/rejected": NaN, "logps/chosen": -584.6044921875, "logps/rejected": -334.3898620605469, "loss": 0.5189, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5392427444458008, "rewards/margins": 1.8708646297454834, "rewards/rejected": -3.410107374191284, "step": 4720 }, { "epoch": 0.4769948317156183, "grad_norm": 98.08614349365234, "learning_rate": 5.231420792578401e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -664.5535278320312, "logps/rejected": -285.24346923828125, "loss": 0.5284, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.349707841873169, "rewards/margins": 1.5165375471115112, "rewards/rejected": -2.8662455081939697, "step": 4730 }, { "epoch": 0.4780032774486323, "grad_norm": 142.93386840820312, "learning_rate": 5.221337097912675e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -617.0046997070312, "logps/rejected": -340.21661376953125, "loss": 0.4182, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7107416391372681, "rewards/margins": 2.1294426918029785, "rewards/rejected": -2.840184450149536, "step": 4740 }, { "epoch": 0.4790117231816463, "grad_norm": 98.20552825927734, "learning_rate": 5.211253403246949e-07, "logits/chosen": 1.6893653869628906, "logits/rejected": NaN, "logps/chosen": -655.6539306640625, "logps/rejected": -375.9500427246094, "loss": 0.4941, "rewards/accuracies": 0.75, "rewards/chosen": -0.7809749841690063, "rewards/margins": 2.231017589569092, "rewards/rejected": -3.0119926929473877, "step": 4750 }, { "epoch": 0.48002016891466026, "grad_norm": 121.78907775878906, "learning_rate": 5.201169708581224e-07, "logits/chosen": 1.5098079442977905, "logits/rejected": NaN, "logps/chosen": -570.4849853515625, "logps/rejected": -309.5720520019531, "loss": 0.3518, "rewards/accuracies": 0.75, "rewards/chosen": -0.7471938133239746, "rewards/margins": 2.1324493885040283, "rewards/rejected": -2.879642963409424, "step": 4760 }, { "epoch": 0.48102861464767427, "grad_norm": 77.54036712646484, "learning_rate": 5.191086013915499e-07, "logits/chosen": 1.4243053197860718, "logits/rejected": NaN, "logps/chosen": -585.0494384765625, "logps/rejected": -293.6543884277344, "loss": 0.5181, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9711127281188965, "rewards/margins": 2.1018359661102295, "rewards/rejected": -3.072948455810547, "step": 4770 }, { "epoch": 0.4820370603806883, "grad_norm": 80.83316040039062, "learning_rate": 5.181002319249773e-07, "logits/chosen": 1.658228874206543, "logits/rejected": NaN, "logps/chosen": -575.1292114257812, "logps/rejected": -393.7992248535156, "loss": 0.2909, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0511243343353271, "rewards/margins": 2.3971874713897705, "rewards/rejected": -3.4483120441436768, "step": 4780 }, { "epoch": 0.4830455061137023, "grad_norm": 41.34980773925781, "learning_rate": 5.170918624584047e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -483.4981384277344, "logps/rejected": -397.5052795410156, "loss": 0.644, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6370744705200195, "rewards/margins": 2.1218338012695312, "rewards/rejected": -3.75890851020813, "step": 4790 }, { "epoch": 0.48405395184671623, "grad_norm": 116.63592529296875, "learning_rate": 5.160834929918321e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -685.5931396484375, "logps/rejected": -476.26904296875, "loss": 0.4581, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5673896074295044, "rewards/margins": 1.6408703327178955, "rewards/rejected": -2.2082600593566895, "step": 4800 }, { "epoch": 0.48506239757973024, "grad_norm": 116.3240737915039, "learning_rate": 5.150751235252597e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -577.0780639648438, "logps/rejected": -414.46807861328125, "loss": 0.6315, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.936009168624878, "rewards/margins": 1.1578665971755981, "rewards/rejected": -3.0938756465911865, "step": 4810 }, { "epoch": 0.48607084331274425, "grad_norm": 222.77700805664062, "learning_rate": 5.140667540586871e-07, "logits/chosen": 1.6016252040863037, "logits/rejected": NaN, "logps/chosen": -475.14453125, "logps/rejected": -400.34295654296875, "loss": 0.6741, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7056472301483154, "rewards/margins": 1.8656022548675537, "rewards/rejected": -3.5712497234344482, "step": 4820 }, { "epoch": 0.4870792890457582, "grad_norm": 123.90250396728516, "learning_rate": 5.130583845921145e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.0984497070312, "logps/rejected": -511.169921875, "loss": 0.4666, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.4579750299453735, "rewards/margins": 1.8704640865325928, "rewards/rejected": -3.328439235687256, "step": 4830 }, { "epoch": 0.4880877347787722, "grad_norm": 119.58617401123047, "learning_rate": 5.12050015125542e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -515.5277709960938, "logps/rejected": -400.01678466796875, "loss": 0.3838, "rewards/accuracies": 0.75, "rewards/chosen": -1.4547395706176758, "rewards/margins": 2.6457011699676514, "rewards/rejected": -4.100440979003906, "step": 4840 }, { "epoch": 0.4890961805117862, "grad_norm": 93.78873443603516, "learning_rate": 5.110416456589694e-07, "logits/chosen": 1.5007143020629883, "logits/rejected": NaN, "logps/chosen": -697.775146484375, "logps/rejected": -325.8115234375, "loss": 0.5285, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9041898846626282, "rewards/margins": 2.0447373390197754, "rewards/rejected": -2.948927402496338, "step": 4850 }, { "epoch": 0.4901046262448002, "grad_norm": 35.77062225341797, "learning_rate": 5.100332761923969e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -616.4207153320312, "logps/rejected": -449.30865478515625, "loss": 0.2593, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0786030292510986, "rewards/margins": 2.250339984893799, "rewards/rejected": -3.3289427757263184, "step": 4860 }, { "epoch": 0.4911130719778142, "grad_norm": 159.10528564453125, "learning_rate": 5.090249067258243e-07, "logits/chosen": 1.449439287185669, "logits/rejected": NaN, "logps/chosen": -512.1224365234375, "logps/rejected": -351.5968017578125, "loss": 0.7345, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1341147422790527, "rewards/margins": 1.3273924589157104, "rewards/rejected": -3.4615070819854736, "step": 4870 }, { "epoch": 0.4921215177108282, "grad_norm": 64.3913803100586, "learning_rate": 5.080165372592518e-07, "logits/chosen": 1.5629721879959106, "logits/rejected": NaN, "logps/chosen": -555.4967041015625, "logps/rejected": -398.39654541015625, "loss": 0.4121, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5960861444473267, "rewards/margins": 1.7325074672698975, "rewards/rejected": -3.3285934925079346, "step": 4880 }, { "epoch": 0.4931299634438422, "grad_norm": 190.1175079345703, "learning_rate": 5.070081677926792e-07, "logits/chosen": NaN, "logits/rejected": 1.4825007915496826, "logps/chosen": -515.4973754882812, "logps/rejected": -368.3912658691406, "loss": 0.3856, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.895967721939087, "rewards/margins": 2.398519277572632, "rewards/rejected": -4.294487476348877, "step": 4890 }, { "epoch": 0.4941384091768562, "grad_norm": 138.611328125, "learning_rate": 5.059997983261066e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -512.2420654296875, "logps/rejected": -274.7344970703125, "loss": 0.5015, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1866602897644043, "rewards/margins": 1.8964509963989258, "rewards/rejected": -3.08311128616333, "step": 4900 }, { "epoch": 0.49514685490987015, "grad_norm": 129.1103973388672, "learning_rate": 5.049914288595342e-07, "logits/chosen": 1.5179369449615479, "logits/rejected": NaN, "logps/chosen": -544.9732055664062, "logps/rejected": -327.58758544921875, "loss": 0.4546, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.130180835723877, "rewards/margins": 2.059359550476074, "rewards/rejected": -3.189540386199951, "step": 4910 }, { "epoch": 0.49615530064288416, "grad_norm": 156.29075622558594, "learning_rate": 5.039830593929616e-07, "logits/chosen": 1.2946375608444214, "logits/rejected": NaN, "logps/chosen": -505.01116943359375, "logps/rejected": -330.7032775878906, "loss": 0.4455, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.7318496704101562, "rewards/margins": 1.9870895147323608, "rewards/rejected": -3.7189393043518066, "step": 4920 }, { "epoch": 0.49716374637589816, "grad_norm": 92.20761108398438, "learning_rate": 5.02974689926389e-07, "logits/chosen": 1.580910563468933, "logits/rejected": NaN, "logps/chosen": -495.4228515625, "logps/rejected": -390.4068908691406, "loss": 0.4188, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.8573291897773743, "rewards/margins": 2.213186264038086, "rewards/rejected": -3.0705158710479736, "step": 4930 }, { "epoch": 0.4981721921089121, "grad_norm": 148.29180908203125, "learning_rate": 5.019663204598164e-07, "logits/chosen": 1.4910881519317627, "logits/rejected": NaN, "logps/chosen": -649.0684204101562, "logps/rejected": -322.2546691894531, "loss": 0.2697, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.26669958233833313, "rewards/margins": 2.5489959716796875, "rewards/rejected": -2.8156955242156982, "step": 4940 }, { "epoch": 0.4991806378419261, "grad_norm": 32.384124755859375, "learning_rate": 5.009579509932439e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -483.87042236328125, "logps/rejected": -366.5899658203125, "loss": 0.6635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5863161087036133, "rewards/margins": 0.8127865791320801, "rewards/rejected": -2.3991026878356934, "step": 4950 }, { "epoch": 0.5001890835749401, "grad_norm": 66.619384765625, "learning_rate": 4.999495815266714e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.4326171875, "logps/rejected": -547.6048583984375, "loss": 0.4362, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9686318635940552, "rewards/margins": 2.405705690383911, "rewards/rejected": -3.3743374347686768, "step": 4960 }, { "epoch": 0.5011975293079541, "grad_norm": 130.77548217773438, "learning_rate": 4.989412120600988e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -583.9109497070312, "logps/rejected": -366.87811279296875, "loss": 0.2556, "rewards/accuracies": 0.875, "rewards/chosen": -1.0918896198272705, "rewards/margins": 2.524963617324829, "rewards/rejected": -3.6168532371520996, "step": 4970 }, { "epoch": 0.5022059750409681, "grad_norm": 194.6881866455078, "learning_rate": 4.979328425935262e-07, "logits/chosen": 1.4820185899734497, "logits/rejected": NaN, "logps/chosen": -496.77703857421875, "logps/rejected": -426.40643310546875, "loss": 0.5345, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5906732082366943, "rewards/margins": 1.8685106039047241, "rewards/rejected": -3.459183931350708, "step": 4980 }, { "epoch": 0.5032144207739822, "grad_norm": 19.164302825927734, "learning_rate": 4.969244731269537e-07, "logits/chosen": 1.7878084182739258, "logits/rejected": NaN, "logps/chosen": -625.1673583984375, "logps/rejected": -359.3379821777344, "loss": 0.2573, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7281805276870728, "rewards/margins": 3.0519933700561523, "rewards/rejected": -3.7801742553710938, "step": 4990 }, { "epoch": 0.5042228665069961, "grad_norm": 11.604955673217773, "learning_rate": 4.959161036603811e-07, "logits/chosen": 1.5056488513946533, "logits/rejected": NaN, "logps/chosen": -485.7456970214844, "logps/rejected": -352.07269287109375, "loss": 0.8143, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.995746374130249, "rewards/margins": 1.4381123781204224, "rewards/rejected": -3.433859348297119, "step": 5000 }, { "epoch": 0.50523131224001, "grad_norm": 12.939789772033691, "learning_rate": 4.949077341938086e-07, "logits/chosen": 1.49629807472229, "logits/rejected": NaN, "logps/chosen": -553.2758178710938, "logps/rejected": -319.9931945800781, "loss": 0.2865, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5564974546432495, "rewards/margins": 2.742676258087158, "rewards/rejected": -3.2991738319396973, "step": 5010 }, { "epoch": 0.5062397579730241, "grad_norm": 76.2354965209961, "learning_rate": 4.93899364727236e-07, "logits/chosen": 1.5016987323760986, "logits/rejected": NaN, "logps/chosen": -578.50244140625, "logps/rejected": -395.28228759765625, "loss": 0.3762, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3938745260238647, "rewards/margins": 2.4478464126586914, "rewards/rejected": -3.8417205810546875, "step": 5020 }, { "epoch": 0.5072482037060381, "grad_norm": 58.08552169799805, "learning_rate": 4.928909952606635e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -456.92657470703125, "logps/rejected": -346.05889892578125, "loss": 0.4567, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8290023803710938, "rewards/margins": 2.035109758377075, "rewards/rejected": -3.8641116619110107, "step": 5030 }, { "epoch": 0.508256649439052, "grad_norm": 182.8728485107422, "learning_rate": 4.918826257940909e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -719.0250854492188, "logps/rejected": -349.6186828613281, "loss": 0.4518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6906958222389221, "rewards/margins": 1.8215582370758057, "rewards/rejected": -2.512253999710083, "step": 5040 }, { "epoch": 0.5092650951720661, "grad_norm": 88.76933288574219, "learning_rate": 4.908742563275183e-07, "logits/chosen": 1.6246188879013062, "logits/rejected": NaN, "logps/chosen": -621.0450439453125, "logps/rejected": -410.12255859375, "loss": 0.4216, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2512952089309692, "rewards/margins": 2.107569456100464, "rewards/rejected": -3.3588645458221436, "step": 5050 }, { "epoch": 0.51027354090508, "grad_norm": 226.00955200195312, "learning_rate": 4.898658868609459e-07, "logits/chosen": 1.4954661130905151, "logits/rejected": NaN, "logps/chosen": -524.1861572265625, "logps/rejected": -345.79107666015625, "loss": 0.4844, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8535375595092773, "rewards/margins": 1.8973348140716553, "rewards/rejected": -3.750872850418091, "step": 5060 }, { "epoch": 0.511281986638094, "grad_norm": 138.3004608154297, "learning_rate": 4.888575173943733e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -569.0633544921875, "logps/rejected": -386.99945068359375, "loss": 0.561, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2394205331802368, "rewards/margins": 2.142503261566162, "rewards/rejected": -3.3819236755371094, "step": 5070 }, { "epoch": 0.512290432371108, "grad_norm": 3.170836925506592, "learning_rate": 4.878491479278007e-07, "logits/chosen": 1.3983629941940308, "logits/rejected": NaN, "logps/chosen": -511.10498046875, "logps/rejected": -325.0041809082031, "loss": 0.4025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2426965236663818, "rewards/margins": 2.243264675140381, "rewards/rejected": -3.485961437225342, "step": 5080 }, { "epoch": 0.513298878104122, "grad_norm": 51.892513275146484, "learning_rate": 4.868407784612281e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -644.8787231445312, "logps/rejected": -365.41534423828125, "loss": 0.7975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2407892942428589, "rewards/margins": 1.440056324005127, "rewards/rejected": -2.6808457374572754, "step": 5090 }, { "epoch": 0.5143073238371361, "grad_norm": 18.94330406188965, "learning_rate": 4.858324089946557e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.4161987304688, "logps/rejected": -355.55572509765625, "loss": 0.721, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9372611045837402, "rewards/margins": 1.631527304649353, "rewards/rejected": -2.5687882900238037, "step": 5100 }, { "epoch": 0.51531576957015, "grad_norm": 77.9837417602539, "learning_rate": 4.848240395280831e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -494.618408203125, "logps/rejected": -482.93206787109375, "loss": 0.6917, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.859309196472168, "rewards/margins": 1.4144580364227295, "rewards/rejected": -2.2737669944763184, "step": 5110 }, { "epoch": 0.516324215303164, "grad_norm": 7.890934467315674, "learning_rate": 4.838156700615105e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -462.6497497558594, "logps/rejected": -432.06036376953125, "loss": 0.4408, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2556214332580566, "rewards/margins": 2.0584449768066406, "rewards/rejected": -3.3140664100646973, "step": 5120 }, { "epoch": 0.517332661036178, "grad_norm": 106.94221496582031, "learning_rate": 4.828073005949379e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -477.2552795410156, "logps/rejected": -407.76959228515625, "loss": 0.3714, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7776157855987549, "rewards/margins": 2.653240203857422, "rewards/rejected": -3.430856227874756, "step": 5130 }, { "epoch": 0.518341106769192, "grad_norm": 92.1246109008789, "learning_rate": 4.817989311283654e-07, "logits/chosen": 1.580307960510254, "logits/rejected": NaN, "logps/chosen": -576.22314453125, "logps/rejected": -374.7902526855469, "loss": 0.3702, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2337887287139893, "rewards/margins": 1.6352390050888062, "rewards/rejected": -2.869027614593506, "step": 5140 }, { "epoch": 0.5193495525022059, "grad_norm": 212.90060424804688, "learning_rate": 4.807905616617929e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -467.9956970214844, "logps/rejected": -459.611083984375, "loss": 0.5011, "rewards/accuracies": 0.75, "rewards/chosen": -1.185688853263855, "rewards/margins": 1.8749158382415771, "rewards/rejected": -3.0606045722961426, "step": 5150 }, { "epoch": 0.52035799823522, "grad_norm": 108.2117691040039, "learning_rate": 4.797821921952203e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -547.1290283203125, "logps/rejected": -306.83758544921875, "loss": 0.4105, "rewards/accuracies": 0.75, "rewards/chosen": -0.6233733296394348, "rewards/margins": 1.7269690036773682, "rewards/rejected": -2.350342273712158, "step": 5160 }, { "epoch": 0.521366443968234, "grad_norm": 15.16324234008789, "learning_rate": 4.787738227286478e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -592.0725708007812, "logps/rejected": -352.962646484375, "loss": 0.382, "rewards/accuracies": 0.75, "rewards/chosen": -0.05839423090219498, "rewards/margins": 1.976319670677185, "rewards/rejected": -2.0347142219543457, "step": 5170 }, { "epoch": 0.5223748897012479, "grad_norm": 141.1808319091797, "learning_rate": 4.777654532620752e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -495.7547912597656, "logps/rejected": -384.0302429199219, "loss": 0.4302, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4530797004699707, "rewards/margins": 1.7693312168121338, "rewards/rejected": -2.2224109172821045, "step": 5180 }, { "epoch": 0.523383335434262, "grad_norm": 106.27081298828125, "learning_rate": 4.767570837955027e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.42803955078125, "logps/rejected": -377.5594482421875, "loss": 0.335, "rewards/accuracies": 0.75, "rewards/chosen": -1.085991621017456, "rewards/margins": 1.7737334966659546, "rewards/rejected": -2.8597252368927, "step": 5190 }, { "epoch": 0.5243917811672759, "grad_norm": 125.4769287109375, "learning_rate": 4.757487143289301e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -500.1826171875, "logps/rejected": -337.6241149902344, "loss": 0.5827, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.9040135145187378, "rewards/margins": 1.5070897340774536, "rewards/rejected": -2.4111032485961914, "step": 5200 }, { "epoch": 0.52540022690029, "grad_norm": 131.75917053222656, "learning_rate": 4.747403448623575e-07, "logits/chosen": 1.3940573930740356, "logits/rejected": NaN, "logps/chosen": -613.8397827148438, "logps/rejected": -331.96319580078125, "loss": 0.3839, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4011410176753998, "rewards/margins": 2.7331459522247314, "rewards/rejected": -3.134287118911743, "step": 5210 }, { "epoch": 0.5264086726333039, "grad_norm": 69.46240997314453, "learning_rate": 4.73731975395785e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -463.4794921875, "logps/rejected": -337.7441101074219, "loss": 0.271, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.47506046295166016, "rewards/margins": 2.943647623062134, "rewards/rejected": -3.418708086013794, "step": 5220 }, { "epoch": 0.5274171183663179, "grad_norm": 112.7025146484375, "learning_rate": 4.7272360592921247e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -517.5740356445312, "logps/rejected": -443.1964416503906, "loss": 0.5186, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4401100873947144, "rewards/margins": 1.6135807037353516, "rewards/rejected": -3.0536906719207764, "step": 5230 }, { "epoch": 0.528425564099332, "grad_norm": 163.07606506347656, "learning_rate": 4.717152364626399e-07, "logits/chosen": 1.4403951168060303, "logits/rejected": NaN, "logps/chosen": -626.0901489257812, "logps/rejected": -375.61114501953125, "loss": 0.6086, "rewards/accuracies": 0.625, "rewards/chosen": -1.2371574640274048, "rewards/margins": 1.6471033096313477, "rewards/rejected": -2.884260892868042, "step": 5240 }, { "epoch": 0.5294340098323459, "grad_norm": 133.9423828125, "learning_rate": 4.707068669960673e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -576.4547119140625, "logps/rejected": -382.8070068359375, "loss": 0.3016, "rewards/accuracies": 0.75, "rewards/chosen": -0.5715006589889526, "rewards/margins": 2.509988784790039, "rewards/rejected": -3.081489324569702, "step": 5250 }, { "epoch": 0.5304424555653598, "grad_norm": 312.5793151855469, "learning_rate": 4.6969849752949483e-07, "logits/chosen": 1.4582602977752686, "logits/rejected": NaN, "logps/chosen": -563.9193725585938, "logps/rejected": -276.8607177734375, "loss": 0.394, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.294992208480835, "rewards/margins": 2.254611015319824, "rewards/rejected": -3.549603223800659, "step": 5260 }, { "epoch": 0.5314509012983739, "grad_norm": 93.34130096435547, "learning_rate": 4.6869012806292225e-07, "logits/chosen": 1.5232326984405518, "logits/rejected": NaN, "logps/chosen": -540.199462890625, "logps/rejected": -390.4368896484375, "loss": 0.296, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3155330419540405, "rewards/margins": 2.5746994018554688, "rewards/rejected": -3.890232563018799, "step": 5270 }, { "epoch": 0.5324593470313879, "grad_norm": 149.86602783203125, "learning_rate": 4.6768175859634967e-07, "logits/chosen": 1.387062668800354, "logits/rejected": NaN, "logps/chosen": -527.5596313476562, "logps/rejected": -341.5891418457031, "loss": 0.454, "rewards/accuracies": 0.75, "rewards/chosen": -1.2105454206466675, "rewards/margins": 2.037257194519043, "rewards/rejected": -3.247802734375, "step": 5280 }, { "epoch": 0.5334677927644018, "grad_norm": 99.2481460571289, "learning_rate": 4.666733891297771e-07, "logits/chosen": 1.5852326154708862, "logits/rejected": NaN, "logps/chosen": -554.8801879882812, "logps/rejected": -347.62872314453125, "loss": 0.398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.650390863418579, "rewards/margins": 2.57560396194458, "rewards/rejected": -4.22599458694458, "step": 5290 }, { "epoch": 0.5344762384974159, "grad_norm": 460.9702453613281, "learning_rate": 4.656650196632046e-07, "logits/chosen": 1.5487719774246216, "logits/rejected": NaN, "logps/chosen": -561.2516479492188, "logps/rejected": -396.20281982421875, "loss": 0.9431, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.3060781955718994, "rewards/margins": 1.1681745052337646, "rewards/rejected": -3.4742531776428223, "step": 5300 }, { "epoch": 0.5354846842304298, "grad_norm": 109.97673797607422, "learning_rate": 4.6465665019663203e-07, "logits/chosen": 1.6293646097183228, "logits/rejected": NaN, "logps/chosen": -578.3991088867188, "logps/rejected": -358.35504150390625, "loss": 0.4761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4949332475662231, "rewards/margins": 2.190868377685547, "rewards/rejected": -3.6858017444610596, "step": 5310 }, { "epoch": 0.5364931299634439, "grad_norm": 136.93060302734375, "learning_rate": 4.6364828073005945e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -546.0794067382812, "logps/rejected": -416.3833923339844, "loss": 0.4666, "rewards/accuracies": 0.75, "rewards/chosen": -1.316915512084961, "rewards/margins": 1.6829662322998047, "rewards/rejected": -2.9998817443847656, "step": 5320 }, { "epoch": 0.5375015756964578, "grad_norm": 70.79891204833984, "learning_rate": 4.626399112634869e-07, "logits/chosen": 1.2798854112625122, "logits/rejected": NaN, "logps/chosen": -464.3966369628906, "logps/rejected": -367.29669189453125, "loss": 0.2876, "rewards/accuracies": 0.875, "rewards/chosen": -1.009230136871338, "rewards/margins": 2.5582118034362793, "rewards/rejected": -3.567441940307617, "step": 5330 }, { "epoch": 0.5385100214294718, "grad_norm": 161.58018493652344, "learning_rate": 4.616315417969144e-07, "logits/chosen": 1.574008822441101, "logits/rejected": NaN, "logps/chosen": -520.2398681640625, "logps/rejected": -236.364990234375, "loss": 0.3828, "rewards/accuracies": 0.75, "rewards/chosen": -1.3435717821121216, "rewards/margins": 2.2943098545074463, "rewards/rejected": -3.6378815174102783, "step": 5340 }, { "epoch": 0.5395184671624859, "grad_norm": 9.884313583374023, "learning_rate": 4.606231723303418e-07, "logits/chosen": 1.6278228759765625, "logits/rejected": NaN, "logps/chosen": -626.1756591796875, "logps/rejected": -355.2240905761719, "loss": 0.3355, "rewards/accuracies": 0.75, "rewards/chosen": -1.3616056442260742, "rewards/margins": 2.2871859073638916, "rewards/rejected": -3.648791790008545, "step": 5350 }, { "epoch": 0.5405269128954998, "grad_norm": 32.4987678527832, "learning_rate": 4.5961480286376923e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -580.7880859375, "logps/rejected": -276.9309387207031, "loss": 0.4112, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.354310154914856, "rewards/margins": 2.3459091186523438, "rewards/rejected": -3.7002193927764893, "step": 5360 }, { "epoch": 0.5415353586285138, "grad_norm": 167.16783142089844, "learning_rate": 4.5860643339719676e-07, "logits/chosen": 1.4091598987579346, "logits/rejected": NaN, "logps/chosen": -475.1588439941406, "logps/rejected": -354.1369934082031, "loss": 0.6435, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.882683515548706, "rewards/margins": 2.084280490875244, "rewards/rejected": -3.96696400642395, "step": 5370 }, { "epoch": 0.5425438043615278, "grad_norm": 174.51478576660156, "learning_rate": 4.575980639306242e-07, "logits/chosen": 1.6792351007461548, "logits/rejected": NaN, "logps/chosen": -673.5963745117188, "logps/rejected": -357.77197265625, "loss": 0.5274, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0520379543304443, "rewards/margins": 2.039754629135132, "rewards/rejected": -3.091792583465576, "step": 5380 }, { "epoch": 0.5435522500945418, "grad_norm": 87.40081024169922, "learning_rate": 4.565896944640516e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -515.6634521484375, "logps/rejected": -386.10064697265625, "loss": 0.5204, "rewards/accuracies": 0.75, "rewards/chosen": -1.7372716665267944, "rewards/margins": 2.394768238067627, "rewards/rejected": -4.132039546966553, "step": 5390 }, { "epoch": 0.5445606958275557, "grad_norm": 62.99828338623047, "learning_rate": 4.55581324997479e-07, "logits/chosen": 1.7522990703582764, "logits/rejected": NaN, "logps/chosen": -531.9653930664062, "logps/rejected": -350.7470703125, "loss": 0.3722, "rewards/accuracies": 0.75, "rewards/chosen": -1.7110475301742554, "rewards/margins": 1.9384253025054932, "rewards/rejected": -3.649472713470459, "step": 5400 }, { "epoch": 0.5455691415605698, "grad_norm": 76.07980346679688, "learning_rate": 4.5457295553090654e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -624.2928466796875, "logps/rejected": -441.3082580566406, "loss": 0.2681, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.29632478952407837, "rewards/margins": 2.731778860092163, "rewards/rejected": -3.028103828430176, "step": 5410 }, { "epoch": 0.5465775872935837, "grad_norm": 112.87594604492188, "learning_rate": 4.5356458606433396e-07, "logits/chosen": 1.4692130088806152, "logits/rejected": 1.5656216144561768, "logps/chosen": -478.3143005371094, "logps/rejected": -372.451171875, "loss": 0.643, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.786134958267212, "rewards/margins": 2.0170388221740723, "rewards/rejected": -3.803173780441284, "step": 5420 }, { "epoch": 0.5475860330265978, "grad_norm": 14.628725051879883, "learning_rate": 4.525562165977614e-07, "logits/chosen": 1.4642646312713623, "logits/rejected": NaN, "logps/chosen": -527.6561279296875, "logps/rejected": -349.61016845703125, "loss": 0.5813, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4608205556869507, "rewards/margins": 1.7972990274429321, "rewards/rejected": -3.258120059967041, "step": 5430 }, { "epoch": 0.5485944787596118, "grad_norm": 142.66787719726562, "learning_rate": 4.5154784713118885e-07, "logits/chosen": 1.4674328565597534, "logits/rejected": NaN, "logps/chosen": -557.5029907226562, "logps/rejected": -356.7451477050781, "loss": 0.6749, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.599450707435608, "rewards/margins": 1.7687466144561768, "rewards/rejected": -3.368196964263916, "step": 5440 }, { "epoch": 0.5496029244926257, "grad_norm": 147.8514404296875, "learning_rate": 4.505394776646163e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -610.7626953125, "logps/rejected": -360.74151611328125, "loss": 0.536, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8640590906143188, "rewards/margins": 1.937829613685608, "rewards/rejected": -2.8018887042999268, "step": 5450 }, { "epoch": 0.5506113702256398, "grad_norm": 163.09690856933594, "learning_rate": 4.4953110819804374e-07, "logits/chosen": 1.3996461629867554, "logits/rejected": NaN, "logps/chosen": -613.0535888671875, "logps/rejected": -410.76519775390625, "loss": 0.4974, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0643975734710693, "rewards/margins": 2.1699154376983643, "rewards/rejected": -3.2343132495880127, "step": 5460 }, { "epoch": 0.5516198159586537, "grad_norm": 49.68846893310547, "learning_rate": 4.4852273873147116e-07, "logits/chosen": 1.562331199645996, "logits/rejected": 1.461674451828003, "logps/chosen": -655.0260620117188, "logps/rejected": -356.4012756347656, "loss": 0.2453, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.022911787033081, "rewards/margins": 2.908877372741699, "rewards/rejected": -3.9317893981933594, "step": 5470 }, { "epoch": 0.5526282616916677, "grad_norm": 55.314151763916016, "learning_rate": 4.475143692648987e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -520.3985595703125, "logps/rejected": -365.9579162597656, "loss": 0.5103, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3321144580841064, "rewards/margins": 2.1410343647003174, "rewards/rejected": -3.473149061203003, "step": 5480 }, { "epoch": 0.5536367074246817, "grad_norm": 45.46970748901367, "learning_rate": 4.465059997983261e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -481.1856994628906, "logps/rejected": -377.8641662597656, "loss": 0.4882, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7726356983184814, "rewards/margins": 2.256129026412964, "rewards/rejected": -4.028764724731445, "step": 5490 }, { "epoch": 0.5546451531576957, "grad_norm": 124.68540954589844, "learning_rate": 4.454976303317535e-07, "logits/chosen": 1.5066319704055786, "logits/rejected": NaN, "logps/chosen": -567.6677856445312, "logps/rejected": -392.47418212890625, "loss": 0.3983, "rewards/accuracies": 0.75, "rewards/chosen": -1.195525884628296, "rewards/margins": 2.3950858116149902, "rewards/rejected": -3.590611696243286, "step": 5500 }, { "epoch": 0.5556535988907096, "grad_norm": 57.852664947509766, "learning_rate": 4.4448926086518094e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -555.3861083984375, "logps/rejected": -424.4261779785156, "loss": 0.7006, "rewards/accuracies": 0.625, "rewards/chosen": -1.5401825904846191, "rewards/margins": 1.443800926208496, "rewards/rejected": -2.9839835166931152, "step": 5510 }, { "epoch": 0.5566620446237237, "grad_norm": 5.784912109375, "learning_rate": 4.4348089139860846e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -522.0032958984375, "logps/rejected": -459.042724609375, "loss": 0.5406, "rewards/accuracies": 0.75, "rewards/chosen": -1.1193183660507202, "rewards/margins": 2.504992723464966, "rewards/rejected": -3.6243109703063965, "step": 5520 }, { "epoch": 0.5576704903567377, "grad_norm": 40.291751861572266, "learning_rate": 4.424725219320359e-07, "logits/chosen": 1.582343339920044, "logits/rejected": NaN, "logps/chosen": -455.0132751464844, "logps/rejected": -375.9971618652344, "loss": 0.7401, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7736730575561523, "rewards/margins": 1.7811334133148193, "rewards/rejected": -3.55480694770813, "step": 5530 }, { "epoch": 0.5586789360897517, "grad_norm": 58.209869384765625, "learning_rate": 4.414641524654633e-07, "logits/chosen": 1.7041378021240234, "logits/rejected": NaN, "logps/chosen": -649.9779052734375, "logps/rejected": -410.8213806152344, "loss": 0.4793, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2752068042755127, "rewards/margins": 2.071089267730713, "rewards/rejected": -3.3462958335876465, "step": 5540 }, { "epoch": 0.5596873818227657, "grad_norm": 63.420230865478516, "learning_rate": 4.4045578299889083e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.49755859375, "logps/rejected": -316.01641845703125, "loss": 0.6587, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.279412865638733, "rewards/margins": 1.926479697227478, "rewards/rejected": -3.205892562866211, "step": 5550 }, { "epoch": 0.5606958275557796, "grad_norm": 38.19316864013672, "learning_rate": 4.3944741353231825e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -487.587890625, "logps/rejected": -550.312255859375, "loss": 0.4455, "rewards/accuracies": 0.75, "rewards/chosen": -1.1957818269729614, "rewards/margins": 2.031649112701416, "rewards/rejected": -3.227430820465088, "step": 5560 }, { "epoch": 0.5617042732887937, "grad_norm": 47.344722747802734, "learning_rate": 4.3843904406574567e-07, "logits/chosen": 1.6005319356918335, "logits/rejected": NaN, "logps/chosen": -500.3111877441406, "logps/rejected": -309.01263427734375, "loss": 0.3347, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.001932144165039, "rewards/margins": 2.069929599761963, "rewards/rejected": -3.071861743927002, "step": 5570 }, { "epoch": 0.5627127190218076, "grad_norm": 154.09962463378906, "learning_rate": 4.374306745991731e-07, "logits/chosen": 1.4731172323226929, "logits/rejected": 1.6073983907699585, "logps/chosen": -543.6146240234375, "logps/rejected": -356.1114196777344, "loss": 0.4042, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.061952829360962, "rewards/margins": 2.66797137260437, "rewards/rejected": -3.729923963546753, "step": 5580 }, { "epoch": 0.5637211647548216, "grad_norm": 33.196292877197266, "learning_rate": 4.364223051326006e-07, "logits/chosen": 1.8539197444915771, "logits/rejected": NaN, "logps/chosen": -572.6976318359375, "logps/rejected": -299.44500732421875, "loss": 0.3325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.366126298904419, "rewards/margins": 2.320906162261963, "rewards/rejected": -3.687032699584961, "step": 5590 }, { "epoch": 0.5647296104878357, "grad_norm": 228.5960235595703, "learning_rate": 4.3541393566602803e-07, "logits/chosen": 1.5286890268325806, "logits/rejected": NaN, "logps/chosen": -602.0786743164062, "logps/rejected": -337.576171875, "loss": 0.6981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5202758312225342, "rewards/margins": 1.7456527948379517, "rewards/rejected": -3.2659287452697754, "step": 5600 }, { "epoch": 0.5657380562208496, "grad_norm": 124.27777862548828, "learning_rate": 4.3440556619945545e-07, "logits/chosen": 1.545620083808899, "logits/rejected": NaN, "logps/chosen": -543.8587646484375, "logps/rejected": -272.05694580078125, "loss": 0.3668, "rewards/accuracies": 0.75, "rewards/chosen": -1.839861512184143, "rewards/margins": 2.0936532020568848, "rewards/rejected": -3.933514356613159, "step": 5610 }, { "epoch": 0.5667465019538636, "grad_norm": 11.211885452270508, "learning_rate": 4.3339719673288287e-07, "logits/chosen": 1.7493116855621338, "logits/rejected": NaN, "logps/chosen": -655.4286499023438, "logps/rejected": -361.10986328125, "loss": 0.5137, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.160420298576355, "rewards/margins": 1.8591638803482056, "rewards/rejected": -3.0195841789245605, "step": 5620 }, { "epoch": 0.5677549476868776, "grad_norm": 233.7501220703125, "learning_rate": 4.323888272663104e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -504.27252197265625, "logps/rejected": -442.11883544921875, "loss": 0.6769, "rewards/accuracies": 0.75, "rewards/chosen": -1.2185219526290894, "rewards/margins": 1.5795886516571045, "rewards/rejected": -2.7981104850769043, "step": 5630 }, { "epoch": 0.5687633934198916, "grad_norm": 207.25350952148438, "learning_rate": 4.313804577997378e-07, "logits/chosen": 1.457437515258789, "logits/rejected": NaN, "logps/chosen": -542.3445434570312, "logps/rejected": -353.546875, "loss": 0.5454, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4018526077270508, "rewards/margins": 2.2234606742858887, "rewards/rejected": -3.6253135204315186, "step": 5640 }, { "epoch": 0.5697718391529056, "grad_norm": 88.5319595336914, "learning_rate": 4.3037208833316523e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.3360595703125, "logps/rejected": -458.58880615234375, "loss": 0.4366, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5612462759017944, "rewards/margins": 1.6166530847549438, "rewards/rejected": -3.177899122238159, "step": 5650 }, { "epoch": 0.5707802848859196, "grad_norm": 76.90045166015625, "learning_rate": 4.2936371886659275e-07, "logits/chosen": 1.6033130884170532, "logits/rejected": NaN, "logps/chosen": -517.4710083007812, "logps/rejected": -272.10528564453125, "loss": 0.3128, "rewards/accuracies": 0.75, "rewards/chosen": -1.3261501789093018, "rewards/margins": 2.395733594894409, "rewards/rejected": -3.721883773803711, "step": 5660 }, { "epoch": 0.5717887306189335, "grad_norm": 12.291156768798828, "learning_rate": 4.2835534940002017e-07, "logits/chosen": 1.6031672954559326, "logits/rejected": 1.4473755359649658, "logps/chosen": -552.6251220703125, "logps/rejected": -383.94622802734375, "loss": 0.5375, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7889333963394165, "rewards/margins": 1.6035964488983154, "rewards/rejected": -3.3925297260284424, "step": 5670 }, { "epoch": 0.5727971763519476, "grad_norm": 7.238757610321045, "learning_rate": 4.273469799334476e-07, "logits/chosen": 1.4769927263259888, "logits/rejected": NaN, "logps/chosen": -537.5015258789062, "logps/rejected": -409.3697204589844, "loss": 0.2528, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3447397947311401, "rewards/margins": 3.0892961025238037, "rewards/rejected": -4.434035778045654, "step": 5680 }, { "epoch": 0.5738056220849616, "grad_norm": 157.8751678466797, "learning_rate": 4.26338610466875e-07, "logits/chosen": 1.7255103588104248, "logits/rejected": NaN, "logps/chosen": -702.3285522460938, "logps/rejected": -307.12841796875, "loss": 0.4632, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.891945481300354, "rewards/margins": 2.123408317565918, "rewards/rejected": -3.0153536796569824, "step": 5690 }, { "epoch": 0.5748140678179755, "grad_norm": 85.11186218261719, "learning_rate": 4.2533024100030253e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -459.6272888183594, "logps/rejected": -348.6761474609375, "loss": 0.5906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7223682403564453, "rewards/margins": 2.081019878387451, "rewards/rejected": -3.8033881187438965, "step": 5700 }, { "epoch": 0.5758225135509896, "grad_norm": 0.7936001420021057, "learning_rate": 4.2432187153372995e-07, "logits/chosen": 1.3788378238677979, "logits/rejected": NaN, "logps/chosen": -528.5614624023438, "logps/rejected": -381.01361083984375, "loss": 0.3811, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1009317636489868, "rewards/margins": 3.1102046966552734, "rewards/rejected": -4.211136817932129, "step": 5710 }, { "epoch": 0.5768309592840035, "grad_norm": 22.472021102905273, "learning_rate": 4.2331350206715737e-07, "logits/chosen": 1.4319360256195068, "logits/rejected": NaN, "logps/chosen": -430.4296875, "logps/rejected": -319.4992370605469, "loss": 0.5505, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2936182022094727, "rewards/margins": 2.1201131343841553, "rewards/rejected": -4.413731098175049, "step": 5720 }, { "epoch": 0.5778394050170175, "grad_norm": 183.90228271484375, "learning_rate": 4.223051326005848e-07, "logits/chosen": 1.437326192855835, "logits/rejected": NaN, "logps/chosen": -517.2451171875, "logps/rejected": -372.38873291015625, "loss": 0.5389, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5150680541992188, "rewards/margins": 2.355113983154297, "rewards/rejected": -4.870182514190674, "step": 5730 }, { "epoch": 0.5788478507500315, "grad_norm": 77.69454193115234, "learning_rate": 4.212967631340123e-07, "logits/chosen": 1.5606542825698853, "logits/rejected": NaN, "logps/chosen": -693.6867065429688, "logps/rejected": -345.2734680175781, "loss": 0.5668, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0185331106185913, "rewards/margins": 2.269010066986084, "rewards/rejected": -3.2875430583953857, "step": 5740 }, { "epoch": 0.5798562964830455, "grad_norm": 88.75012969970703, "learning_rate": 4.2028839366743974e-07, "logits/chosen": 1.577502965927124, "logits/rejected": NaN, "logps/chosen": -463.189208984375, "logps/rejected": -517.5103759765625, "loss": 0.9554, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.4206063747406006, "rewards/margins": 0.9614374041557312, "rewards/rejected": -3.3820438385009766, "step": 5750 }, { "epoch": 0.5808647422160595, "grad_norm": 3.5266220569610596, "learning_rate": 4.1928002420086715e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.4071655273438, "logps/rejected": -454.60626220703125, "loss": 0.2622, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.1914851665496826, "rewards/margins": 2.6877834796905518, "rewards/rejected": -3.879268169403076, "step": 5760 }, { "epoch": 0.5818731879490735, "grad_norm": 148.56594848632812, "learning_rate": 4.1827165473429457e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -593.2947387695312, "logps/rejected": -380.4075012207031, "loss": 0.4464, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2245854139328003, "rewards/margins": 2.4941725730895996, "rewards/rejected": -3.7187581062316895, "step": 5770 }, { "epoch": 0.5828816336820875, "grad_norm": 116.7092514038086, "learning_rate": 4.172632852677221e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -455.09307861328125, "logps/rejected": -284.1521301269531, "loss": 0.4676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.265378952026367, "rewards/margins": 2.32051420211792, "rewards/rejected": -4.585893154144287, "step": 5780 }, { "epoch": 0.5838900794151015, "grad_norm": 178.8162841796875, "learning_rate": 4.162549158011495e-07, "logits/chosen": 1.3777204751968384, "logits/rejected": NaN, "logps/chosen": -562.9284057617188, "logps/rejected": -341.4028015136719, "loss": 0.4663, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8913061618804932, "rewards/margins": 2.3187084197998047, "rewards/rejected": -4.210014820098877, "step": 5790 }, { "epoch": 0.5848985251481155, "grad_norm": 7.007861137390137, "learning_rate": 4.1524654633457694e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -616.4417114257812, "logps/rejected": -355.77252197265625, "loss": 0.4627, "rewards/accuracies": 0.75, "rewards/chosen": -1.5295215845108032, "rewards/margins": 2.120459794998169, "rewards/rejected": -3.6499812602996826, "step": 5800 }, { "epoch": 0.5859069708811294, "grad_norm": 201.4776153564453, "learning_rate": 4.1423817686800446e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -482.8617248535156, "logps/rejected": -405.97918701171875, "loss": 0.6636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.909893274307251, "rewards/margins": 2.0498533248901367, "rewards/rejected": -3.9597465991973877, "step": 5810 }, { "epoch": 0.5869154166141435, "grad_norm": 175.20431518554688, "learning_rate": 4.132298074014319e-07, "logits/chosen": 1.485641598701477, "logits/rejected": NaN, "logps/chosen": -559.0929565429688, "logps/rejected": -337.46502685546875, "loss": 0.5265, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8899952173233032, "rewards/margins": 1.971197485923767, "rewards/rejected": -3.8611927032470703, "step": 5820 }, { "epoch": 0.5879238623471574, "grad_norm": 330.932373046875, "learning_rate": 4.122214379348593e-07, "logits/chosen": 1.354172945022583, "logits/rejected": NaN, "logps/chosen": -638.62109375, "logps/rejected": -349.6034851074219, "loss": 0.5157, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0112874507904053, "rewards/margins": 1.8010482788085938, "rewards/rejected": -3.81233549118042, "step": 5830 }, { "epoch": 0.5889323080801714, "grad_norm": 106.16050720214844, "learning_rate": 4.112130684682867e-07, "logits/chosen": 1.5537893772125244, "logits/rejected": 1.645416021347046, "logps/chosen": -516.225830078125, "logps/rejected": -381.6088562011719, "loss": 0.4788, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0695254802703857, "rewards/margins": 2.4163308143615723, "rewards/rejected": -4.485856056213379, "step": 5840 }, { "epoch": 0.5899407538131854, "grad_norm": 29.510086059570312, "learning_rate": 4.1020469900171424e-07, "logits/chosen": 1.5998036861419678, "logits/rejected": 1.337446928024292, "logps/chosen": -633.7794189453125, "logps/rejected": -371.9726867675781, "loss": 0.3207, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6934849619865417, "rewards/margins": 2.7435240745544434, "rewards/rejected": -3.43700909614563, "step": 5850 }, { "epoch": 0.5909491995461994, "grad_norm": 180.5288543701172, "learning_rate": 4.0919632953514166e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -654.2200927734375, "logps/rejected": -470.08184814453125, "loss": 0.6003, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8301399946212769, "rewards/margins": 1.2141293287277222, "rewards/rejected": -2.044269323348999, "step": 5860 }, { "epoch": 0.5919576452792135, "grad_norm": 19.62813377380371, "learning_rate": 4.081879600685691e-07, "logits/chosen": 1.203652262687683, "logits/rejected": NaN, "logps/chosen": -453.1578674316406, "logps/rejected": -350.0625915527344, "loss": 0.533, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.216203212738037, "rewards/margins": 1.6428625583648682, "rewards/rejected": -3.859065532684326, "step": 5870 }, { "epoch": 0.5929660910122274, "grad_norm": 32.87977600097656, "learning_rate": 4.0717959060199655e-07, "logits/chosen": 1.4807798862457275, "logits/rejected": NaN, "logps/chosen": -493.5775451660156, "logps/rejected": -365.16339111328125, "loss": 0.4023, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4877946376800537, "rewards/margins": 2.188040018081665, "rewards/rejected": -3.6758341789245605, "step": 5880 }, { "epoch": 0.5939745367452414, "grad_norm": 223.53460693359375, "learning_rate": 4.06171221135424e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -530.9434204101562, "logps/rejected": -437.0760192871094, "loss": 0.5379, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.0765732526779175, "rewards/margins": 3.0418334007263184, "rewards/rejected": -4.118406772613525, "step": 5890 }, { "epoch": 0.5949829824782554, "grad_norm": 131.48416137695312, "learning_rate": 4.0516285166885144e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -524.3309326171875, "logps/rejected": -371.3778381347656, "loss": 0.4054, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0138602256774902, "rewards/margins": 2.70161771774292, "rewards/rejected": -3.7154781818389893, "step": 5900 }, { "epoch": 0.5959914282112694, "grad_norm": 95.67144775390625, "learning_rate": 4.0415448220227886e-07, "logits/chosen": 1.6458743810653687, "logits/rejected": NaN, "logps/chosen": -645.0982666015625, "logps/rejected": -372.07269287109375, "loss": 0.2268, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8426273465156555, "rewards/margins": 2.9459896087646484, "rewards/rejected": -3.788616895675659, "step": 5910 }, { "epoch": 0.5969998739442833, "grad_norm": 0.1611224263906479, "learning_rate": 4.031461127357064e-07, "logits/chosen": 1.458755373954773, "logits/rejected": 1.3834812641143799, "logps/chosen": -583.9760131835938, "logps/rejected": -422.8934631347656, "loss": 0.4271, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2606089115142822, "rewards/margins": 2.8816070556640625, "rewards/rejected": -4.142216205596924, "step": 5920 }, { "epoch": 0.5980083196772974, "grad_norm": 99.30024719238281, "learning_rate": 4.021377432691338e-07, "logits/chosen": 1.453270673751831, "logits/rejected": NaN, "logps/chosen": -543.3842163085938, "logps/rejected": -374.9857482910156, "loss": 0.4919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.397733211517334, "rewards/margins": 1.9866588115692139, "rewards/rejected": -4.384392261505127, "step": 5930 }, { "epoch": 0.5990167654103113, "grad_norm": 279.98236083984375, "learning_rate": 4.011293738025612e-07, "logits/chosen": 1.3896909952163696, "logits/rejected": NaN, "logps/chosen": -583.6671752929688, "logps/rejected": -294.23907470703125, "loss": 0.3487, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1182421445846558, "rewards/margins": 2.9508042335510254, "rewards/rejected": -4.069046497344971, "step": 5940 }, { "epoch": 0.6000252111433253, "grad_norm": 39.053070068359375, "learning_rate": 4.0012100433598864e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -608.8438720703125, "logps/rejected": -363.2088317871094, "loss": 0.5776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0182867050170898, "rewards/margins": 2.2418222427368164, "rewards/rejected": -3.2601089477539062, "step": 5950 }, { "epoch": 0.6010336568763394, "grad_norm": 32.516544342041016, "learning_rate": 3.9911263486941617e-07, "logits/chosen": 1.4425281286239624, "logits/rejected": NaN, "logps/chosen": -630.2759399414062, "logps/rejected": -379.0508117675781, "loss": 0.3641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7708733081817627, "rewards/margins": 2.3636891841888428, "rewards/rejected": -4.1345624923706055, "step": 5960 }, { "epoch": 0.6020421026093533, "grad_norm": 5.37238073348999, "learning_rate": 3.981042654028436e-07, "logits/chosen": 1.1436803340911865, "logits/rejected": NaN, "logps/chosen": -468.1600646972656, "logps/rejected": -332.896728515625, "loss": 0.3775, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8298699855804443, "rewards/margins": 3.6059768199920654, "rewards/rejected": -5.43584680557251, "step": 5970 }, { "epoch": 0.6030505483423674, "grad_norm": 195.4139404296875, "learning_rate": 3.97095895936271e-07, "logits/chosen": 1.2969655990600586, "logits/rejected": NaN, "logps/chosen": -637.6253051757812, "logps/rejected": -369.75836181640625, "loss": 0.521, "rewards/accuracies": 0.75, "rewards/chosen": -1.4472105503082275, "rewards/margins": 1.9802173376083374, "rewards/rejected": -3.4274277687072754, "step": 5980 }, { "epoch": 0.6040589940753813, "grad_norm": 196.0348358154297, "learning_rate": 3.960875264696985e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -482.7102966308594, "logps/rejected": -395.50616455078125, "loss": 0.5451, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.161252737045288, "rewards/margins": 1.966848611831665, "rewards/rejected": -4.128101348876953, "step": 5990 }, { "epoch": 0.6050674398083953, "grad_norm": 1.8330445289611816, "learning_rate": 3.9507915700312595e-07, "logits/chosen": 1.5376890897750854, "logits/rejected": NaN, "logps/chosen": -537.2084350585938, "logps/rejected": -384.93194580078125, "loss": 0.2609, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8596226572990417, "rewards/margins": 3.327998638153076, "rewards/rejected": -4.187621116638184, "step": 6000 }, { "epoch": 0.6060758855414093, "grad_norm": 253.45452880859375, "learning_rate": 3.9407078753655337e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -507.1580505371094, "logps/rejected": -444.5716857910156, "loss": 0.4968, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6535139083862305, "rewards/margins": 1.775740623474121, "rewards/rejected": -3.4292545318603516, "step": 6010 }, { "epoch": 0.6070843312744233, "grad_norm": 79.26547241210938, "learning_rate": 3.930624180699808e-07, "logits/chosen": 1.476633071899414, "logits/rejected": NaN, "logps/chosen": -554.6206665039062, "logps/rejected": -361.450439453125, "loss": 0.3949, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9519367218017578, "rewards/margins": 2.811117172241211, "rewards/rejected": -3.763054370880127, "step": 6020 }, { "epoch": 0.6080927770074372, "grad_norm": 0.1502886414527893, "learning_rate": 3.920540486034083e-07, "logits/chosen": 1.132706642150879, "logits/rejected": NaN, "logps/chosen": -458.5147399902344, "logps/rejected": -333.7770690917969, "loss": 0.942, "rewards/accuracies": 0.625, "rewards/chosen": -2.247309923171997, "rewards/margins": 1.5652989149093628, "rewards/rejected": -3.8126087188720703, "step": 6030 }, { "epoch": 0.6091012227404513, "grad_norm": 23.128137588500977, "learning_rate": 3.9104567913683573e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -379.32623291015625, "logps/rejected": -355.065673828125, "loss": 0.4146, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5239267349243164, "rewards/margins": 2.451643466949463, "rewards/rejected": -3.9755702018737793, "step": 6040 }, { "epoch": 0.6101096684734653, "grad_norm": 82.37578582763672, "learning_rate": 3.9003730967026315e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -652.0755004882812, "logps/rejected": -347.79266357421875, "loss": 0.4914, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.2170791625976562, "rewards/margins": 1.7979984283447266, "rewards/rejected": -3.015077590942383, "step": 6050 }, { "epoch": 0.6111181142064792, "grad_norm": 56.50639724731445, "learning_rate": 3.8902894020369057e-07, "logits/chosen": 1.5726152658462524, "logits/rejected": NaN, "logps/chosen": -599.9457397460938, "logps/rejected": -262.7901611328125, "loss": 0.462, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9636222720146179, "rewards/margins": 2.2408313751220703, "rewards/rejected": -3.204453229904175, "step": 6060 }, { "epoch": 0.6121265599394933, "grad_norm": 116.4361801147461, "learning_rate": 3.880205707371181e-07, "logits/chosen": 1.3519474267959595, "logits/rejected": NaN, "logps/chosen": -504.44873046875, "logps/rejected": -327.9048156738281, "loss": 0.473, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7272756099700928, "rewards/margins": 2.5919783115386963, "rewards/rejected": -4.319254398345947, "step": 6070 }, { "epoch": 0.6131350056725072, "grad_norm": 35.2296028137207, "learning_rate": 3.870122012705455e-07, "logits/chosen": 1.4024962186813354, "logits/rejected": NaN, "logps/chosen": -566.8375854492188, "logps/rejected": -348.8834533691406, "loss": 0.5248, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3120309114456177, "rewards/margins": 2.0670857429504395, "rewards/rejected": -3.3791165351867676, "step": 6080 }, { "epoch": 0.6141434514055213, "grad_norm": 26.89258575439453, "learning_rate": 3.8600383180397293e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -618.640625, "logps/rejected": -503.69659423828125, "loss": 0.2647, "rewards/accuracies": 0.875, "rewards/chosen": -1.1724212169647217, "rewards/margins": 3.0467658042907715, "rewards/rejected": -4.219186782836914, "step": 6090 }, { "epoch": 0.6151518971385352, "grad_norm": 73.35809326171875, "learning_rate": 3.849954623374004e-07, "logits/chosen": 1.3056765794754028, "logits/rejected": NaN, "logps/chosen": -491.01995849609375, "logps/rejected": -367.5260314941406, "loss": 0.6079, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.939616322517395, "rewards/margins": 2.018688678741455, "rewards/rejected": -2.9583048820495605, "step": 6100 }, { "epoch": 0.6161603428715492, "grad_norm": 167.04856872558594, "learning_rate": 3.839870928708279e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.0488891601562, "logps/rejected": -341.485107421875, "loss": 0.4265, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7887910604476929, "rewards/margins": 2.127892255783081, "rewards/rejected": -2.9166831970214844, "step": 6110 }, { "epoch": 0.6171687886045633, "grad_norm": 64.7114028930664, "learning_rate": 3.829787234042553e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -443.48577880859375, "logps/rejected": -381.89825439453125, "loss": 0.7008, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3136035203933716, "rewards/margins": 1.7230329513549805, "rewards/rejected": -3.0366365909576416, "step": 6120 }, { "epoch": 0.6181772343375772, "grad_norm": 79.18092346191406, "learning_rate": 3.819703539376827e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -603.9635009765625, "logps/rejected": -436.1954650878906, "loss": 0.5702, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8107697367668152, "rewards/margins": 1.7416226863861084, "rewards/rejected": -2.5523924827575684, "step": 6130 }, { "epoch": 0.6191856800705912, "grad_norm": 230.57716369628906, "learning_rate": 3.8096198447111024e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -532.0602416992188, "logps/rejected": -372.1153259277344, "loss": 0.72, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.9626806378364563, "rewards/margins": 1.4144785404205322, "rewards/rejected": -2.3771588802337646, "step": 6140 }, { "epoch": 0.6201941258036052, "grad_norm": 17.553373336791992, "learning_rate": 3.7995361500453766e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -585.24462890625, "logps/rejected": -414.11102294921875, "loss": 0.3841, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9770393371582031, "rewards/margins": 2.7974090576171875, "rewards/rejected": -3.7744483947753906, "step": 6150 }, { "epoch": 0.6212025715366192, "grad_norm": 63.23377990722656, "learning_rate": 3.789452455379651e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -515.1998901367188, "logps/rejected": -503.7783203125, "loss": 0.5136, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.193414330482483, "rewards/margins": 1.6032289266586304, "rewards/rejected": -2.7966432571411133, "step": 6160 }, { "epoch": 0.6222110172696331, "grad_norm": 161.16574096679688, "learning_rate": 3.779368760713925e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -437.8575134277344, "logps/rejected": -387.2948913574219, "loss": 0.701, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1931874752044678, "rewards/margins": 1.7512248754501343, "rewards/rejected": -2.9444122314453125, "step": 6170 }, { "epoch": 0.6232194630026472, "grad_norm": 160.8400421142578, "learning_rate": 3.7692850660482e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -526.4771728515625, "logps/rejected": -415.7852478027344, "loss": 0.6149, "rewards/accuracies": 0.75, "rewards/chosen": -1.8828471899032593, "rewards/margins": 1.9611423015594482, "rewards/rejected": -3.843989610671997, "step": 6180 }, { "epoch": 0.6242279087356611, "grad_norm": 91.90988159179688, "learning_rate": 3.7592013713824744e-07, "logits/chosen": 1.6761198043823242, "logits/rejected": NaN, "logps/chosen": -452.23980712890625, "logps/rejected": -345.4059143066406, "loss": 0.5485, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.148512840270996, "rewards/margins": 1.4022786617279053, "rewards/rejected": -2.5507912635803223, "step": 6190 }, { "epoch": 0.6252363544686752, "grad_norm": 222.9679412841797, "learning_rate": 3.7491176767167486e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -424.8905334472656, "logps/rejected": -364.6341857910156, "loss": 0.8264, "rewards/accuracies": 0.625, "rewards/chosen": -2.1415908336639404, "rewards/margins": 0.902225136756897, "rewards/rejected": -3.043815851211548, "step": 6200 }, { "epoch": 0.6262448002016892, "grad_norm": 148.87942504882812, "learning_rate": 3.7390339820510233e-07, "logits/chosen": 1.4088588953018188, "logits/rejected": NaN, "logps/chosen": -522.0379638671875, "logps/rejected": -333.2518310546875, "loss": 0.3237, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2743192911148071, "rewards/margins": 3.6247124671936035, "rewards/rejected": -4.899031639099121, "step": 6210 }, { "epoch": 0.6272532459347031, "grad_norm": 74.4015884399414, "learning_rate": 3.728950287385298e-07, "logits/chosen": 1.3734455108642578, "logits/rejected": NaN, "logps/chosen": -532.4038696289062, "logps/rejected": -341.8404235839844, "loss": 0.3782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1890262365341187, "rewards/margins": 1.903922438621521, "rewards/rejected": -3.0929484367370605, "step": 6220 }, { "epoch": 0.6282616916677172, "grad_norm": 21.695077896118164, "learning_rate": 3.718866592719572e-07, "logits/chosen": 1.5245335102081299, "logits/rejected": NaN, "logps/chosen": -625.81103515625, "logps/rejected": -342.5049743652344, "loss": 0.3218, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7729543447494507, "rewards/margins": 2.4387612342834473, "rewards/rejected": -3.2117156982421875, "step": 6230 }, { "epoch": 0.6292701374007311, "grad_norm": 27.94418716430664, "learning_rate": 3.7087828980538464e-07, "logits/chosen": 1.3921048641204834, "logits/rejected": NaN, "logps/chosen": -460.135986328125, "logps/rejected": -374.34765625, "loss": 0.4242, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3492892980575562, "rewards/margins": 2.36887526512146, "rewards/rejected": -3.7181644439697266, "step": 6240 }, { "epoch": 0.6302785831337451, "grad_norm": 176.46270751953125, "learning_rate": 3.6986992033881216e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -518.4957885742188, "logps/rejected": -314.5762939453125, "loss": 0.8499, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.6818243265151978, "rewards/margins": 1.6122725009918213, "rewards/rejected": -3.2940971851348877, "step": 6250 }, { "epoch": 0.6312870288667591, "grad_norm": 79.83360290527344, "learning_rate": 3.688615508722396e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -620.8439331054688, "logps/rejected": -426.0428161621094, "loss": 0.4704, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5583305358886719, "rewards/margins": 1.9243135452270508, "rewards/rejected": -3.4826443195343018, "step": 6260 }, { "epoch": 0.6322954745997731, "grad_norm": 154.88658142089844, "learning_rate": 3.67853181405667e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -504.323974609375, "logps/rejected": -383.53363037109375, "loss": 0.3736, "rewards/accuracies": 0.75, "rewards/chosen": -1.6298496723175049, "rewards/margins": 2.4380459785461426, "rewards/rejected": -4.067895889282227, "step": 6270 }, { "epoch": 0.633303920332787, "grad_norm": 79.57142639160156, "learning_rate": 3.668448119390944e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -519.0630493164062, "logps/rejected": -390.31683349609375, "loss": 0.2459, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9410845637321472, "rewards/margins": 2.867730140686035, "rewards/rejected": -3.808814525604248, "step": 6280 }, { "epoch": 0.6343123660658011, "grad_norm": 145.498291015625, "learning_rate": 3.6583644247252194e-07, "logits/chosen": 1.382987380027771, "logits/rejected": NaN, "logps/chosen": -554.9293212890625, "logps/rejected": -307.7407531738281, "loss": 0.4356, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1709990501403809, "rewards/margins": 2.60821795463562, "rewards/rejected": -3.779217481613159, "step": 6290 }, { "epoch": 0.635320811798815, "grad_norm": 89.77687072753906, "learning_rate": 3.6482807300594936e-07, "logits/chosen": 1.1270208358764648, "logits/rejected": NaN, "logps/chosen": -551.189453125, "logps/rejected": -396.98773193359375, "loss": 0.2461, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.049957513809204, "rewards/margins": 2.962174415588379, "rewards/rejected": -4.012132167816162, "step": 6300 }, { "epoch": 0.6363292575318291, "grad_norm": 103.19253540039062, "learning_rate": 3.638197035393768e-07, "logits/chosen": 1.2740675210952759, "logits/rejected": NaN, "logps/chosen": -482.250244140625, "logps/rejected": -370.93560791015625, "loss": 0.5118, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.742323637008667, "rewards/margins": 1.91330087184906, "rewards/rejected": -3.6556239128112793, "step": 6310 }, { "epoch": 0.6373377032648431, "grad_norm": 21.96233558654785, "learning_rate": 3.6281133407280425e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -545.3389892578125, "logps/rejected": -325.59759521484375, "loss": 0.2933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9555495977401733, "rewards/margins": 2.7665090560913086, "rewards/rejected": -3.7220587730407715, "step": 6320 }, { "epoch": 0.638346148997857, "grad_norm": 57.42737579345703, "learning_rate": 3.6180296460623173e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -557.3653564453125, "logps/rejected": -384.56365966796875, "loss": 0.4136, "rewards/accuracies": 0.75, "rewards/chosen": -0.7611983418464661, "rewards/margins": 3.1040282249450684, "rewards/rejected": -3.8652267456054688, "step": 6330 }, { "epoch": 0.6393545947308711, "grad_norm": 19.426786422729492, "learning_rate": 3.6079459513965915e-07, "logits/chosen": 1.2237509489059448, "logits/rejected": NaN, "logps/chosen": -581.2462158203125, "logps/rejected": -360.44342041015625, "loss": 0.573, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.42097008228302, "rewards/margins": 2.336644172668457, "rewards/rejected": -3.7576141357421875, "step": 6340 }, { "epoch": 0.640363040463885, "grad_norm": 6.417221546173096, "learning_rate": 3.5978622567308656e-07, "logits/chosen": NaN, "logits/rejected": 1.3207776546478271, "logps/chosen": -498.30291748046875, "logps/rejected": -429.72369384765625, "loss": 0.4951, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7066166400909424, "rewards/margins": 2.4679317474365234, "rewards/rejected": -4.174548149108887, "step": 6350 }, { "epoch": 0.641371486196899, "grad_norm": 2.8071630001068115, "learning_rate": 3.587778562065141e-07, "logits/chosen": 1.432861089706421, "logits/rejected": 1.3240700960159302, "logps/chosen": -480.2035217285156, "logps/rejected": -319.13018798828125, "loss": 0.2609, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1936264038085938, "rewards/margins": 3.270289182662964, "rewards/rejected": -4.463915824890137, "step": 6360 }, { "epoch": 0.642379931929913, "grad_norm": 98.23338317871094, "learning_rate": 3.577694867399415e-07, "logits/chosen": 1.254350185394287, "logits/rejected": NaN, "logps/chosen": -609.2538452148438, "logps/rejected": -438.3121032714844, "loss": 0.3169, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1564069986343384, "rewards/margins": 2.6276638507843018, "rewards/rejected": -3.7840709686279297, "step": 6370 }, { "epoch": 0.643388377662927, "grad_norm": 117.20337677001953, "learning_rate": 3.5676111727336893e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -612.7037353515625, "logps/rejected": -351.73809814453125, "loss": 0.6, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9200423955917358, "rewards/margins": 2.4949793815612793, "rewards/rejected": -4.415021896362305, "step": 6380 }, { "epoch": 0.644396823395941, "grad_norm": 14.886820793151855, "learning_rate": 3.5575274780679635e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -575.1616821289062, "logps/rejected": -299.760009765625, "loss": 0.3398, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9372028112411499, "rewards/margins": 3.301304340362549, "rewards/rejected": -4.238507270812988, "step": 6390 }, { "epoch": 0.645405269128955, "grad_norm": 61.58344268798828, "learning_rate": 3.5474437834022387e-07, "logits/chosen": 1.12351655960083, "logits/rejected": NaN, "logps/chosen": -508.13934326171875, "logps/rejected": -283.9261169433594, "loss": 0.3728, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.850921392440796, "rewards/margins": 3.0395703315734863, "rewards/rejected": -4.890491485595703, "step": 6400 }, { "epoch": 0.646413714861969, "grad_norm": 23.993793487548828, "learning_rate": 3.537360088736513e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -536.8621215820312, "logps/rejected": -400.04461669921875, "loss": 0.3602, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6783040761947632, "rewards/margins": 3.0386581420898438, "rewards/rejected": -4.7169623374938965, "step": 6410 }, { "epoch": 0.647422160594983, "grad_norm": 59.04755783081055, "learning_rate": 3.527276394070787e-07, "logits/chosen": 1.4659414291381836, "logits/rejected": NaN, "logps/chosen": -501.573974609375, "logps/rejected": -325.1654968261719, "loss": 0.3568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7200391292572021, "rewards/margins": 3.0174999237060547, "rewards/rejected": -4.737539291381836, "step": 6420 }, { "epoch": 0.648430606327997, "grad_norm": 232.39389038085938, "learning_rate": 3.517192699405062e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -531.3758544921875, "logps/rejected": -337.7171325683594, "loss": 0.5036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9054162502288818, "rewards/margins": 2.229001998901367, "rewards/rejected": -4.13441801071167, "step": 6430 }, { "epoch": 0.6494390520610109, "grad_norm": 58.62845993041992, "learning_rate": 3.5071090047393365e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -513.2479858398438, "logps/rejected": -374.34417724609375, "loss": 0.3735, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9593952894210815, "rewards/margins": 2.7932369709014893, "rewards/rejected": -3.7526321411132812, "step": 6440 }, { "epoch": 0.650447497794025, "grad_norm": 22.515134811401367, "learning_rate": 3.4970253100736107e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -514.4745483398438, "logps/rejected": -444.06182861328125, "loss": 0.255, "rewards/accuracies": 0.875, "rewards/chosen": -1.289252519607544, "rewards/margins": 3.0981035232543945, "rewards/rejected": -4.387355804443359, "step": 6450 }, { "epoch": 0.651455943527039, "grad_norm": 141.6898956298828, "learning_rate": 3.486941615407885e-07, "logits/chosen": 1.5981035232543945, "logits/rejected": NaN, "logps/chosen": -558.7141723632812, "logps/rejected": -315.30657958984375, "loss": 0.5456, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6803967952728271, "rewards/margins": 1.7775938510894775, "rewards/rejected": -3.4579906463623047, "step": 6460 }, { "epoch": 0.6524643892600529, "grad_norm": 63.23856735229492, "learning_rate": 3.47685792074216e-07, "logits/chosen": 1.408369779586792, "logits/rejected": NaN, "logps/chosen": -624.0241088867188, "logps/rejected": -328.1123046875, "loss": 0.3621, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9106027483940125, "rewards/margins": 3.7060394287109375, "rewards/rejected": -4.616642475128174, "step": 6470 }, { "epoch": 0.653472834993067, "grad_norm": 5.489939212799072, "learning_rate": 3.4667742260764343e-07, "logits/chosen": 1.257860541343689, "logits/rejected": NaN, "logps/chosen": -463.5283203125, "logps/rejected": -295.9234924316406, "loss": 0.3917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6713422536849976, "rewards/margins": 2.5847580432891846, "rewards/rejected": -4.256100654602051, "step": 6480 }, { "epoch": 0.6544812807260809, "grad_norm": 215.36888122558594, "learning_rate": 3.4566905314107085e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -477.8563537597656, "logps/rejected": -330.9757995605469, "loss": 0.4442, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2341079711914062, "rewards/margins": 2.6056597232818604, "rewards/rejected": -4.839767932891846, "step": 6490 }, { "epoch": 0.6554897264590949, "grad_norm": 28.894901275634766, "learning_rate": 3.446606836744983e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -613.3446044921875, "logps/rejected": -463.8262634277344, "loss": 0.4293, "rewards/accuracies": 0.75, "rewards/chosen": -1.4873559474945068, "rewards/margins": 2.07712721824646, "rewards/rejected": -3.564483165740967, "step": 6500 }, { "epoch": 0.6564981721921089, "grad_norm": 168.58535766601562, "learning_rate": 3.436523142079258e-07, "logits/chosen": 1.2935831546783447, "logits/rejected": NaN, "logps/chosen": -556.9571533203125, "logps/rejected": -349.42236328125, "loss": 0.3548, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6192991733551025, "rewards/margins": 2.711993932723999, "rewards/rejected": -4.331293106079102, "step": 6510 }, { "epoch": 0.6575066179251229, "grad_norm": 75.59309387207031, "learning_rate": 3.426439447413532e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -588.3040771484375, "logps/rejected": -344.8088684082031, "loss": 0.4483, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2067291736602783, "rewards/margins": 1.5185537338256836, "rewards/rejected": -3.725283145904541, "step": 6520 }, { "epoch": 0.658515063658137, "grad_norm": 18.54936981201172, "learning_rate": 3.4163557527478063e-07, "logits/chosen": 1.513332724571228, "logits/rejected": NaN, "logps/chosen": -650.6130981445312, "logps/rejected": -408.96795654296875, "loss": 0.7016, "rewards/accuracies": 0.625, "rewards/chosen": -2.5168068408966064, "rewards/margins": 1.6160110235214233, "rewards/rejected": -4.13281774520874, "step": 6530 }, { "epoch": 0.6595235093911509, "grad_norm": 189.23533630371094, "learning_rate": 3.406272058082081e-07, "logits/chosen": 1.5759923458099365, "logits/rejected": NaN, "logps/chosen": -640.6053466796875, "logps/rejected": -408.1158752441406, "loss": 0.3929, "rewards/accuracies": 0.75, "rewards/chosen": -2.042698860168457, "rewards/margins": 3.0324318408966064, "rewards/rejected": -5.075129985809326, "step": 6540 }, { "epoch": 0.6605319551241648, "grad_norm": 24.990766525268555, "learning_rate": 3.396188363416356e-07, "logits/chosen": 1.353453278541565, "logits/rejected": NaN, "logps/chosen": -520.77001953125, "logps/rejected": -302.5209655761719, "loss": 0.4776, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.263267993927002, "rewards/margins": 2.3727152347564697, "rewards/rejected": -4.635983467102051, "step": 6550 }, { "epoch": 0.6615404008571789, "grad_norm": 123.77323913574219, "learning_rate": 3.38610466875063e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -497.216552734375, "logps/rejected": -390.76806640625, "loss": 0.4437, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1237258911132812, "rewards/margins": 2.4722094535827637, "rewards/rejected": -4.595934867858887, "step": 6560 }, { "epoch": 0.6625488465901929, "grad_norm": 16.770042419433594, "learning_rate": 3.376020974084904e-07, "logits/chosen": 1.4024583101272583, "logits/rejected": NaN, "logps/chosen": -529.6685791015625, "logps/rejected": -377.95037841796875, "loss": 0.667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.798921823501587, "rewards/margins": 1.793150544166565, "rewards/rejected": -3.592072010040283, "step": 6570 }, { "epoch": 0.6635572923232068, "grad_norm": 34.665016174316406, "learning_rate": 3.3659372794191794e-07, "logits/chosen": 1.4293023347854614, "logits/rejected": NaN, "logps/chosen": -556.5626220703125, "logps/rejected": -404.2066345214844, "loss": 0.6527, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3232123851776123, "rewards/margins": 1.983026146888733, "rewards/rejected": -3.3062386512756348, "step": 6580 }, { "epoch": 0.6645657380562209, "grad_norm": 51.51560592651367, "learning_rate": 3.3558535847534536e-07, "logits/chosen": 1.7841250896453857, "logits/rejected": NaN, "logps/chosen": -652.1134033203125, "logps/rejected": -365.6336975097656, "loss": 0.3809, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.857321560382843, "rewards/margins": 2.3876047134399414, "rewards/rejected": -3.2449259757995605, "step": 6590 }, { "epoch": 0.6655741837892348, "grad_norm": 125.40693664550781, "learning_rate": 3.345769890087728e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -600.2990112304688, "logps/rejected": -301.563720703125, "loss": 0.367, "rewards/accuracies": 0.75, "rewards/chosen": -1.417234182357788, "rewards/margins": 2.579805612564087, "rewards/rejected": -3.997039318084717, "step": 6600 }, { "epoch": 0.6665826295222488, "grad_norm": 25.631118774414062, "learning_rate": 3.3356861954220025e-07, "logits/chosen": 1.3907793760299683, "logits/rejected": NaN, "logps/chosen": -604.1616821289062, "logps/rejected": -390.59661865234375, "loss": 0.6227, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0760533809661865, "rewards/margins": 2.2477245330810547, "rewards/rejected": -3.3237781524658203, "step": 6610 }, { "epoch": 0.6675910752552628, "grad_norm": 260.72613525390625, "learning_rate": 3.325602500756277e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -498.603515625, "logps/rejected": -394.8013916015625, "loss": 0.454, "rewards/accuracies": 0.75, "rewards/chosen": -1.931079626083374, "rewards/margins": 2.7632155418395996, "rewards/rejected": -4.6942949295043945, "step": 6620 }, { "epoch": 0.6685995209882768, "grad_norm": 194.39956665039062, "learning_rate": 3.3155188060905514e-07, "logits/chosen": 1.2854211330413818, "logits/rejected": NaN, "logps/chosen": -551.8187866210938, "logps/rejected": -373.05609130859375, "loss": 0.5566, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.7370107173919678, "rewards/margins": 2.109788179397583, "rewards/rejected": -3.84679913520813, "step": 6630 }, { "epoch": 0.6696079667212909, "grad_norm": 12.483172416687012, "learning_rate": 3.3054351114248256e-07, "logits/chosen": 1.4875667095184326, "logits/rejected": NaN, "logps/chosen": -624.18017578125, "logps/rejected": -355.33502197265625, "loss": 0.4371, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9162508249282837, "rewards/margins": 2.6478426456451416, "rewards/rejected": -3.5640933513641357, "step": 6640 }, { "epoch": 0.6706164124543048, "grad_norm": 141.923583984375, "learning_rate": 3.2953514167591003e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -556.8746948242188, "logps/rejected": -429.0423889160156, "loss": 0.3508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.435289978981018, "rewards/margins": 3.026458263397217, "rewards/rejected": -4.461748123168945, "step": 6650 }, { "epoch": 0.6716248581873188, "grad_norm": 197.44898986816406, "learning_rate": 3.285267722093375e-07, "logits/chosen": 1.171194314956665, "logits/rejected": NaN, "logps/chosen": -625.148193359375, "logps/rejected": -409.5237731933594, "loss": 0.5436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3987010717391968, "rewards/margins": 2.396263599395752, "rewards/rejected": -3.794964551925659, "step": 6660 }, { "epoch": 0.6726333039203328, "grad_norm": 12.730628967285156, "learning_rate": 3.275184027427649e-07, "logits/chosen": 1.7062451839447021, "logits/rejected": 1.472400426864624, "logps/chosen": -584.0323486328125, "logps/rejected": -411.74151611328125, "loss": 0.5723, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9978510737419128, "rewards/margins": 2.7709286212921143, "rewards/rejected": -3.768779754638672, "step": 6670 }, { "epoch": 0.6736417496533468, "grad_norm": 21.024721145629883, "learning_rate": 3.2651003327619234e-07, "logits/chosen": NaN, "logits/rejected": 1.3620094060897827, "logps/chosen": -513.6355590820312, "logps/rejected": -437.08746337890625, "loss": 0.4459, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6503782272338867, "rewards/margins": 2.6481127738952637, "rewards/rejected": -4.29849100112915, "step": 6680 }, { "epoch": 0.6746501953863607, "grad_norm": 33.27141189575195, "learning_rate": 3.2550166380961987e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -540.9634399414062, "logps/rejected": -391.195068359375, "loss": 0.3414, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5692874193191528, "rewards/margins": 2.513904094696045, "rewards/rejected": -4.083191394805908, "step": 6690 }, { "epoch": 0.6756586411193748, "grad_norm": 23.889436721801758, "learning_rate": 3.244932943430473e-07, "logits/chosen": 1.2836248874664307, "logits/rejected": NaN, "logps/chosen": -577.1373901367188, "logps/rejected": -357.5973205566406, "loss": 0.5317, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7585052251815796, "rewards/margins": 2.630232334136963, "rewards/rejected": -4.388737678527832, "step": 6700 }, { "epoch": 0.6766670868523887, "grad_norm": 36.83075714111328, "learning_rate": 3.234849248764747e-07, "logits/chosen": 1.3418269157409668, "logits/rejected": NaN, "logps/chosen": -478.3866271972656, "logps/rejected": -477.3351135253906, "loss": 0.5046, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.916438102722168, "rewards/margins": 1.661935806274414, "rewards/rejected": -3.578373670578003, "step": 6710 }, { "epoch": 0.6776755325854027, "grad_norm": 58.86091995239258, "learning_rate": 3.224765554099022e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -549.278076171875, "logps/rejected": -451.72015380859375, "loss": 0.3153, "rewards/accuracies": 0.75, "rewards/chosen": -1.133437156677246, "rewards/margins": 2.6383793354034424, "rewards/rejected": -3.7718167304992676, "step": 6720 }, { "epoch": 0.6786839783184168, "grad_norm": 191.3727569580078, "learning_rate": 3.2146818594332965e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -553.3494873046875, "logps/rejected": -325.99517822265625, "loss": 0.6704, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.706378698348999, "rewards/margins": 2.4333291053771973, "rewards/rejected": -4.139707088470459, "step": 6730 }, { "epoch": 0.6796924240514307, "grad_norm": 184.11822509765625, "learning_rate": 3.2045981647675707e-07, "logits/chosen": 1.3332656621932983, "logits/rejected": NaN, "logps/chosen": -431.40484619140625, "logps/rejected": -356.55755615234375, "loss": 0.5282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9182147979736328, "rewards/margins": 2.3449082374572754, "rewards/rejected": -4.263123512268066, "step": 6740 }, { "epoch": 0.6807008697844448, "grad_norm": 177.8693389892578, "learning_rate": 3.194514470101845e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -507.60491943359375, "logps/rejected": -335.28863525390625, "loss": 1.0066, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.7633941173553467, "rewards/margins": 1.2009127140045166, "rewards/rejected": -3.964306592941284, "step": 6750 }, { "epoch": 0.6817093155174587, "grad_norm": 51.87424850463867, "learning_rate": 3.1844307754361196e-07, "logits/chosen": 1.4870102405548096, "logits/rejected": NaN, "logps/chosen": -586.3523559570312, "logps/rejected": -388.3658447265625, "loss": 0.4809, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0458625555038452, "rewards/margins": 3.3572514057159424, "rewards/rejected": -4.40311336517334, "step": 6760 }, { "epoch": 0.6827177612504727, "grad_norm": 16.734664916992188, "learning_rate": 3.1743470807703943e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -616.5909423828125, "logps/rejected": -391.2740173339844, "loss": 0.414, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9184285998344421, "rewards/margins": 2.3514416217803955, "rewards/rejected": -3.2698700428009033, "step": 6770 }, { "epoch": 0.6837262069834867, "grad_norm": 7.614058017730713, "learning_rate": 3.1642633861046685e-07, "logits/chosen": NaN, "logits/rejected": 1.2024328708648682, "logps/chosen": -465.28875732421875, "logps/rejected": -385.085693359375, "loss": 0.4712, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6019500494003296, "rewards/margins": 2.5972750186920166, "rewards/rejected": -4.199224472045898, "step": 6780 }, { "epoch": 0.6847346527165007, "grad_norm": 72.1489486694336, "learning_rate": 3.1541796914389427e-07, "logits/chosen": NaN, "logits/rejected": 1.554685354232788, "logps/chosen": -428.2252502441406, "logps/rejected": -425.290283203125, "loss": 0.5331, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4657222032546997, "rewards/margins": 2.311079978942871, "rewards/rejected": -3.7768025398254395, "step": 6790 }, { "epoch": 0.6857430984495146, "grad_norm": 30.946765899658203, "learning_rate": 3.144095996773218e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -522.2420654296875, "logps/rejected": -377.34661865234375, "loss": 0.5907, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7974188327789307, "rewards/margins": 2.163522243499756, "rewards/rejected": -3.9609408378601074, "step": 6800 }, { "epoch": 0.6867515441825287, "grad_norm": 264.0340270996094, "learning_rate": 3.134012302107492e-07, "logits/chosen": 1.4198144674301147, "logits/rejected": NaN, "logps/chosen": -446.91668701171875, "logps/rejected": -330.69061279296875, "loss": 0.5036, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5657860040664673, "rewards/margins": 2.3246772289276123, "rewards/rejected": -3.890463352203369, "step": 6810 }, { "epoch": 0.6877599899155427, "grad_norm": 12.799182891845703, "learning_rate": 3.1239286074417663e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -443.0331115722656, "logps/rejected": -308.84503173828125, "loss": 0.4706, "rewards/accuracies": 0.75, "rewards/chosen": -2.1940696239471436, "rewards/margins": 1.9948896169662476, "rewards/rejected": -4.188960075378418, "step": 6820 }, { "epoch": 0.6887684356485566, "grad_norm": 27.858440399169922, "learning_rate": 3.113844912776041e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -475.58447265625, "logps/rejected": -380.1007080078125, "loss": 0.792, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.36283278465271, "rewards/margins": 1.3521257638931274, "rewards/rejected": -3.7149581909179688, "step": 6830 }, { "epoch": 0.6897768813815707, "grad_norm": 51.720985412597656, "learning_rate": 3.103761218110316e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -583.2789306640625, "logps/rejected": -357.98828125, "loss": 0.351, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7837697267532349, "rewards/margins": 3.0602214336395264, "rewards/rejected": -3.8439908027648926, "step": 6840 }, { "epoch": 0.6907853271145846, "grad_norm": 163.65936279296875, "learning_rate": 3.09367752344459e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -494.8663024902344, "logps/rejected": -382.2219543457031, "loss": 0.5199, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4893274307250977, "rewards/margins": 2.4646263122558594, "rewards/rejected": -3.953953981399536, "step": 6850 }, { "epoch": 0.6917937728475987, "grad_norm": 64.72606658935547, "learning_rate": 3.083593828778864e-07, "logits/chosen": 1.1857428550720215, "logits/rejected": NaN, "logps/chosen": -508.151123046875, "logps/rejected": -384.4705505371094, "loss": 0.5916, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7053626775741577, "rewards/margins": 1.7910692691802979, "rewards/rejected": -3.496431827545166, "step": 6860 }, { "epoch": 0.6928022185806126, "grad_norm": 127.66056823730469, "learning_rate": 3.073510134113139e-07, "logits/chosen": 1.5205800533294678, "logits/rejected": 1.573103904724121, "logps/chosen": -572.5715942382812, "logps/rejected": -450.53436279296875, "loss": 0.6895, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6541465520858765, "rewards/margins": 1.6359879970550537, "rewards/rejected": -3.2901344299316406, "step": 6870 }, { "epoch": 0.6938106643136266, "grad_norm": 173.02508544921875, "learning_rate": 3.0634264394474136e-07, "logits/chosen": 1.3843778371810913, "logits/rejected": NaN, "logps/chosen": -524.8762817382812, "logps/rejected": -384.42864990234375, "loss": 0.3143, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0693600177764893, "rewards/margins": 4.416786193847656, "rewards/rejected": -5.486145973205566, "step": 6880 }, { "epoch": 0.6948191100466407, "grad_norm": 5.854733943939209, "learning_rate": 3.053342744781688e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -535.6717529296875, "logps/rejected": -457.9689025878906, "loss": 0.3245, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3194546699523926, "rewards/margins": 2.969550848007202, "rewards/rejected": -4.289005756378174, "step": 6890 }, { "epoch": 0.6958275557796546, "grad_norm": 162.8943328857422, "learning_rate": 3.043259050115962e-07, "logits/chosen": 1.1115591526031494, "logits/rejected": NaN, "logps/chosen": -415.680908203125, "logps/rejected": -360.2997131347656, "loss": 0.5556, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.036339044570923, "rewards/margins": 1.8176389932632446, "rewards/rejected": -3.853977918624878, "step": 6900 }, { "epoch": 0.6968360015126686, "grad_norm": 148.38816833496094, "learning_rate": 3.033175355450237e-07, "logits/chosen": 1.450005054473877, "logits/rejected": NaN, "logps/chosen": -562.7156982421875, "logps/rejected": -373.21435546875, "loss": 0.3258, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8405739068984985, "rewards/margins": 2.9276885986328125, "rewards/rejected": -4.7682623863220215, "step": 6910 }, { "epoch": 0.6978444472456826, "grad_norm": 238.5601348876953, "learning_rate": 3.0230916607845114e-07, "logits/chosen": 1.198157548904419, "logits/rejected": NaN, "logps/chosen": -510.0144958496094, "logps/rejected": -337.29669189453125, "loss": 0.5012, "rewards/accuracies": 0.625, "rewards/chosen": -1.110830545425415, "rewards/margins": 2.393911600112915, "rewards/rejected": -3.504742383956909, "step": 6920 }, { "epoch": 0.6988528929786966, "grad_norm": 134.5264892578125, "learning_rate": 3.0130079661187856e-07, "logits/chosen": 1.149045467376709, "logits/rejected": NaN, "logps/chosen": -532.5479736328125, "logps/rejected": -291.40557861328125, "loss": 0.5338, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.04097843170166, "rewards/margins": 2.174938440322876, "rewards/rejected": -4.215916633605957, "step": 6930 }, { "epoch": 0.6998613387117105, "grad_norm": 50.49470901489258, "learning_rate": 3.0029242714530603e-07, "logits/chosen": 1.5311704874038696, "logits/rejected": NaN, "logps/chosen": -587.8017578125, "logps/rejected": -398.8173828125, "loss": 0.3464, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.414517879486084, "rewards/margins": 2.0326387882232666, "rewards/rejected": -3.4471564292907715, "step": 6940 }, { "epoch": 0.7008697844447246, "grad_norm": 19.904003143310547, "learning_rate": 2.992840576787335e-07, "logits/chosen": 1.252253770828247, "logits/rejected": NaN, "logps/chosen": -662.811279296875, "logps/rejected": -380.81591796875, "loss": 0.4776, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5824798345565796, "rewards/margins": 2.4092259407043457, "rewards/rejected": -3.9917054176330566, "step": 6950 }, { "epoch": 0.7018782301777385, "grad_norm": 270.4064025878906, "learning_rate": 2.982756882121609e-07, "logits/chosen": 1.5825055837631226, "logits/rejected": NaN, "logps/chosen": -509.24884033203125, "logps/rejected": -385.6839294433594, "loss": 0.7066, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.184311866760254, "rewards/margins": 1.556431531906128, "rewards/rejected": -3.7407431602478027, "step": 6960 }, { "epoch": 0.7028866759107526, "grad_norm": 260.428466796875, "learning_rate": 2.9726731874558834e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -663.0402221679688, "logps/rejected": -460.31231689453125, "loss": 0.4743, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1033222675323486, "rewards/margins": 2.60957932472229, "rewards/rejected": -3.7129015922546387, "step": 6970 }, { "epoch": 0.7038951216437666, "grad_norm": 209.64881896972656, "learning_rate": 2.962589492790158e-07, "logits/chosen": 1.1809906959533691, "logits/rejected": NaN, "logps/chosen": -526.9281005859375, "logps/rejected": -401.33038330078125, "loss": 0.5238, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.054842710494995, "rewards/margins": 2.3495750427246094, "rewards/rejected": -4.404417991638184, "step": 6980 }, { "epoch": 0.7049035673767805, "grad_norm": 78.62494659423828, "learning_rate": 2.952505798124433e-07, "logits/chosen": 1.2351653575897217, "logits/rejected": NaN, "logps/chosen": -569.7028198242188, "logps/rejected": -294.07257080078125, "loss": 0.3595, "rewards/accuracies": 0.75, "rewards/chosen": -1.6113876104354858, "rewards/margins": 3.0750749111175537, "rewards/rejected": -4.686462879180908, "step": 6990 }, { "epoch": 0.7059120131097946, "grad_norm": 1.036977767944336, "learning_rate": 2.942422103458707e-07, "logits/chosen": 1.4737255573272705, "logits/rejected": NaN, "logps/chosen": -571.7430419921875, "logps/rejected": -278.9844665527344, "loss": 0.3731, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6659168004989624, "rewards/margins": 2.3811919689178467, "rewards/rejected": -4.0471086502075195, "step": 7000 }, { "epoch": 0.7069204588428085, "grad_norm": 290.5852355957031, "learning_rate": 2.932338408792981e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -534.7539672851562, "logps/rejected": -384.91217041015625, "loss": 0.5282, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.1804492473602295, "rewards/margins": 2.614532470703125, "rewards/rejected": -4.794981956481934, "step": 7010 }, { "epoch": 0.7079289045758225, "grad_norm": 33.95231628417969, "learning_rate": 2.9222547141272564e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -508.7525939941406, "logps/rejected": -467.83673095703125, "loss": 0.3486, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7137165069580078, "rewards/margins": 2.525357723236084, "rewards/rejected": -4.239074230194092, "step": 7020 }, { "epoch": 0.7089373503088365, "grad_norm": 158.44728088378906, "learning_rate": 2.9121710194615306e-07, "logits/chosen": NaN, "logits/rejected": 1.4408549070358276, "logps/chosen": -545.3702392578125, "logps/rejected": -446.054931640625, "loss": 0.5033, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.887677550315857, "rewards/margins": 2.1173553466796875, "rewards/rejected": -4.005033016204834, "step": 7030 }, { "epoch": 0.7099457960418505, "grad_norm": 122.18669128417969, "learning_rate": 2.902087324795805e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -513.5863037109375, "logps/rejected": -443.46954345703125, "loss": 0.3234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.460033655166626, "rewards/margins": 2.7332804203033447, "rewards/rejected": -4.1933135986328125, "step": 7040 }, { "epoch": 0.7109542417748644, "grad_norm": 39.46737289428711, "learning_rate": 2.8920036301300795e-07, "logits/chosen": NaN, "logits/rejected": 1.2593272924423218, "logps/chosen": -511.69586181640625, "logps/rejected": -441.9366149902344, "loss": 0.4279, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9705705642700195, "rewards/margins": 2.563363552093506, "rewards/rejected": -4.533934593200684, "step": 7050 }, { "epoch": 0.7119626875078785, "grad_norm": 154.82473754882812, "learning_rate": 2.881919935464354e-07, "logits/chosen": 1.0433980226516724, "logits/rejected": NaN, "logps/chosen": -506.1114196777344, "logps/rejected": -355.8170471191406, "loss": 0.695, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2748475074768066, "rewards/margins": 1.1906331777572632, "rewards/rejected": -3.4654808044433594, "step": 7060 }, { "epoch": 0.7129711332408925, "grad_norm": 22.15567398071289, "learning_rate": 2.8718362407986284e-07, "logits/chosen": NaN, "logits/rejected": 1.0042169094085693, "logps/chosen": -443.3951110839844, "logps/rejected": -374.18634033203125, "loss": 0.236, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9906342029571533, "rewards/margins": 3.5663585662841797, "rewards/rejected": -5.556992530822754, "step": 7070 }, { "epoch": 0.7139795789739065, "grad_norm": 221.2344512939453, "learning_rate": 2.8617525461329026e-07, "logits/chosen": 1.46353280544281, "logits/rejected": NaN, "logps/chosen": -582.6649169921875, "logps/rejected": -328.5586853027344, "loss": 0.508, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8888483047485352, "rewards/margins": 2.8030645847320557, "rewards/rejected": -4.691912651062012, "step": 7080 }, { "epoch": 0.7149880247069205, "grad_norm": 64.14892578125, "learning_rate": 2.8516688514671773e-07, "logits/chosen": 1.394104242324829, "logits/rejected": NaN, "logps/chosen": -604.7718505859375, "logps/rejected": -467.5599670410156, "loss": 0.337, "rewards/accuracies": 0.75, "rewards/chosen": -0.9810327291488647, "rewards/margins": 3.2823376655578613, "rewards/rejected": -4.263370513916016, "step": 7090 }, { "epoch": 0.7159964704399344, "grad_norm": 91.28746795654297, "learning_rate": 2.841585156801452e-07, "logits/chosen": 1.1792573928833008, "logits/rejected": NaN, "logps/chosen": -574.4544677734375, "logps/rejected": -330.0129699707031, "loss": 0.3342, "rewards/accuracies": 0.875, "rewards/chosen": -1.2849408388137817, "rewards/margins": 2.5346195697784424, "rewards/rejected": -3.8195605278015137, "step": 7100 }, { "epoch": 0.7170049161729485, "grad_norm": 13.692255020141602, "learning_rate": 2.831501462135726e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -614.9964599609375, "logps/rejected": -392.5575866699219, "loss": 0.4268, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3754688501358032, "rewards/margins": 2.6029181480407715, "rewards/rejected": -3.9783871173858643, "step": 7110 }, { "epoch": 0.7180133619059624, "grad_norm": 53.20606231689453, "learning_rate": 2.8214177674700004e-07, "logits/chosen": 1.4281929731369019, "logits/rejected": NaN, "logps/chosen": -585.6585083007812, "logps/rejected": -327.7998962402344, "loss": 0.2856, "rewards/accuracies": 0.875, "rewards/chosen": -1.5759410858154297, "rewards/margins": 3.3592216968536377, "rewards/rejected": -4.9351630210876465, "step": 7120 }, { "epoch": 0.7190218076389764, "grad_norm": 51.66321563720703, "learning_rate": 2.8113340728042757e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -443.15106201171875, "logps/rejected": -355.23333740234375, "loss": 0.7718, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.355701446533203, "rewards/margins": 2.233799457550049, "rewards/rejected": -4.589500904083252, "step": 7130 }, { "epoch": 0.7200302533719904, "grad_norm": 157.12545776367188, "learning_rate": 2.80125037813855e-07, "logits/chosen": 1.3231937885284424, "logits/rejected": NaN, "logps/chosen": -633.0796508789062, "logps/rejected": -397.7785949707031, "loss": 0.4554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0445926189422607, "rewards/margins": 2.0852322578430176, "rewards/rejected": -4.129824638366699, "step": 7140 }, { "epoch": 0.7210386991050044, "grad_norm": 4.461909294128418, "learning_rate": 2.791166683472824e-07, "logits/chosen": 1.4723970890045166, "logits/rejected": NaN, "logps/chosen": -602.7531127929688, "logps/rejected": -354.03363037109375, "loss": 0.257, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4163758754730225, "rewards/margins": 3.5853495597839355, "rewards/rejected": -5.001725196838379, "step": 7150 }, { "epoch": 0.7220471448380184, "grad_norm": 84.48715209960938, "learning_rate": 2.781082988807099e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -490.9933166503906, "logps/rejected": -443.5127868652344, "loss": 0.4698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0527329444885254, "rewards/margins": 2.417823314666748, "rewards/rejected": -4.470556259155273, "step": 7160 }, { "epoch": 0.7230555905710324, "grad_norm": 70.86988830566406, "learning_rate": 2.7709992941413735e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -627.0740356445312, "logps/rejected": -467.291259765625, "loss": 0.4653, "rewards/accuracies": 0.75, "rewards/chosen": -2.4619221687316895, "rewards/margins": 2.242668867111206, "rewards/rejected": -4.704590797424316, "step": 7170 }, { "epoch": 0.7240640363040464, "grad_norm": 302.32373046875, "learning_rate": 2.7609155994756477e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -698.0704345703125, "logps/rejected": -404.55645751953125, "loss": 0.4232, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6966660618782043, "rewards/margins": 3.2277145385742188, "rewards/rejected": -3.92438006401062, "step": 7180 }, { "epoch": 0.7250724820370604, "grad_norm": 232.58932495117188, "learning_rate": 2.750831904809922e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.3472900390625, "logps/rejected": -389.5090637207031, "loss": 0.5814, "rewards/accuracies": 0.75, "rewards/chosen": -1.7160654067993164, "rewards/margins": 2.1905040740966797, "rewards/rejected": -3.906569719314575, "step": 7190 }, { "epoch": 0.7260809277700744, "grad_norm": 285.5714111328125, "learning_rate": 2.7407482101441966e-07, "logits/chosen": NaN, "logits/rejected": 1.160075068473816, "logps/chosen": -544.5111083984375, "logps/rejected": -422.745849609375, "loss": 0.8823, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8426082134246826, "rewards/margins": 1.8187326192855835, "rewards/rejected": -3.6613407135009766, "step": 7200 }, { "epoch": 0.7270893735030883, "grad_norm": 21.636634826660156, "learning_rate": 2.7306645154784713e-07, "logits/chosen": 1.2934560775756836, "logits/rejected": NaN, "logps/chosen": -566.1019287109375, "logps/rejected": -318.6329650878906, "loss": 0.1462, "rewards/accuracies": 0.875, "rewards/chosen": -0.8370157480239868, "rewards/margins": 3.5276381969451904, "rewards/rejected": -4.364653587341309, "step": 7210 }, { "epoch": 0.7280978192361024, "grad_norm": 43.88969421386719, "learning_rate": 2.7205808208127455e-07, "logits/chosen": 1.6070741415023804, "logits/rejected": NaN, "logps/chosen": -639.1605224609375, "logps/rejected": -415.9158630371094, "loss": 0.3779, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9562298059463501, "rewards/margins": 3.5106754302978516, "rewards/rejected": -4.466905117034912, "step": 7220 }, { "epoch": 0.7291062649691163, "grad_norm": 80.32674407958984, "learning_rate": 2.71049712614702e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -553.6338500976562, "logps/rejected": -403.89306640625, "loss": 0.6476, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5227036476135254, "rewards/margins": 2.5566165447235107, "rewards/rejected": -4.079320430755615, "step": 7230 }, { "epoch": 0.7301147107021303, "grad_norm": 8.077322959899902, "learning_rate": 2.700413431481295e-07, "logits/chosen": 1.344150424003601, "logits/rejected": NaN, "logps/chosen": -570.9910888671875, "logps/rejected": -325.8572692871094, "loss": 0.2923, "rewards/accuracies": 0.75, "rewards/chosen": -1.1611839532852173, "rewards/margins": 3.290895938873291, "rewards/rejected": -4.452079772949219, "step": 7240 }, { "epoch": 0.7311231564351444, "grad_norm": 0.8519035577774048, "learning_rate": 2.690329736815569e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -618.9288330078125, "logps/rejected": -353.8763122558594, "loss": 0.3502, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.295431137084961, "rewards/margins": 2.847658157348633, "rewards/rejected": -4.1430888175964355, "step": 7250 }, { "epoch": 0.7321316021681583, "grad_norm": 134.77249145507812, "learning_rate": 2.6802460421498433e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -430.3670349121094, "logps/rejected": -396.1773986816406, "loss": 0.5679, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.665449857711792, "rewards/margins": 2.814920425415039, "rewards/rejected": -4.48037052154541, "step": 7260 }, { "epoch": 0.7331400479011723, "grad_norm": 174.97691345214844, "learning_rate": 2.670162347484118e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -618.08984375, "logps/rejected": -478.36431884765625, "loss": 0.3016, "rewards/accuracies": 0.875, "rewards/chosen": -0.7459568977355957, "rewards/margins": 2.835864782333374, "rewards/rejected": -3.5818214416503906, "step": 7270 }, { "epoch": 0.7341484936341863, "grad_norm": 13.421360969543457, "learning_rate": 2.660078652818393e-07, "logits/chosen": 1.4710463285446167, "logits/rejected": NaN, "logps/chosen": -585.1713256835938, "logps/rejected": -330.8350524902344, "loss": 0.5501, "rewards/accuracies": 0.625, "rewards/chosen": -1.1225618124008179, "rewards/margins": 2.2296245098114014, "rewards/rejected": -3.352186679840088, "step": 7280 }, { "epoch": 0.7351569393672003, "grad_norm": 209.09841918945312, "learning_rate": 2.649994958152667e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -523.0386962890625, "logps/rejected": -383.76068115234375, "loss": 0.5532, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1331202983856201, "rewards/margins": 2.41521954536438, "rewards/rejected": -3.54833984375, "step": 7290 }, { "epoch": 0.7361653851002143, "grad_norm": 157.1387481689453, "learning_rate": 2.639911263486941e-07, "logits/chosen": 1.2483799457550049, "logits/rejected": NaN, "logps/chosen": -632.9013671875, "logps/rejected": -492.60662841796875, "loss": 0.3524, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1405532360076904, "rewards/margins": 2.699526309967041, "rewards/rejected": -3.8400797843933105, "step": 7300 }, { "epoch": 0.7371738308332283, "grad_norm": 43.976051330566406, "learning_rate": 2.629827568821216e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -488.142822265625, "logps/rejected": -441.5086364746094, "loss": 0.1688, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.0302788019180298, "rewards/margins": 3.97499418258667, "rewards/rejected": -5.00527286529541, "step": 7310 }, { "epoch": 0.7381822765662422, "grad_norm": 28.669174194335938, "learning_rate": 2.6197438741554906e-07, "logits/chosen": 1.2346031665802002, "logits/rejected": NaN, "logps/chosen": -501.93438720703125, "logps/rejected": -329.5439758300781, "loss": 0.3678, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6674699783325195, "rewards/margins": 3.050205945968628, "rewards/rejected": -4.717675685882568, "step": 7320 }, { "epoch": 0.7391907222992563, "grad_norm": 226.16390991210938, "learning_rate": 2.609660179489765e-07, "logits/chosen": 1.3460584878921509, "logits/rejected": NaN, "logps/chosen": -595.4815673828125, "logps/rejected": -396.2691345214844, "loss": 0.6632, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5964272022247314, "rewards/margins": 1.8982270956039429, "rewards/rejected": -3.4946541786193848, "step": 7330 }, { "epoch": 0.7401991680322703, "grad_norm": 206.91009521484375, "learning_rate": 2.5995764848240395e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -451.519775390625, "logps/rejected": -415.5347595214844, "loss": 0.4922, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9586232900619507, "rewards/margins": 2.59794020652771, "rewards/rejected": -4.556563377380371, "step": 7340 }, { "epoch": 0.7412076137652842, "grad_norm": 2.8977272510528564, "learning_rate": 2.589492790158314e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -502.01861572265625, "logps/rejected": -338.139892578125, "loss": 0.2841, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5958614349365234, "rewards/margins": 2.8307862281799316, "rewards/rejected": -4.426647663116455, "step": 7350 }, { "epoch": 0.7422160594982983, "grad_norm": 29.19446563720703, "learning_rate": 2.5794090954925884e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -492.9207458496094, "logps/rejected": -409.1618347167969, "loss": 0.5863, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3112993240356445, "rewards/margins": 2.611314296722412, "rewards/rejected": -3.9226131439208984, "step": 7360 }, { "epoch": 0.7432245052313122, "grad_norm": 19.209774017333984, "learning_rate": 2.5693254008268626e-07, "logits/chosen": 1.4014372825622559, "logits/rejected": NaN, "logps/chosen": -626.2469482421875, "logps/rejected": -399.8408203125, "loss": 0.6106, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2372448444366455, "rewards/margins": 1.93669855594635, "rewards/rejected": -3.173943042755127, "step": 7370 }, { "epoch": 0.7442329509643262, "grad_norm": 2.8771371841430664, "learning_rate": 2.5592417061611373e-07, "logits/chosen": 1.4728349447250366, "logits/rejected": NaN, "logps/chosen": -555.3797607421875, "logps/rejected": -396.0411376953125, "loss": 0.7218, "rewards/accuracies": 0.75, "rewards/chosen": -1.9254348278045654, "rewards/margins": 2.1925063133239746, "rewards/rejected": -4.117941379547119, "step": 7380 }, { "epoch": 0.7452413966973402, "grad_norm": 72.17713928222656, "learning_rate": 2.549158011495412e-07, "logits/chosen": 1.0813251733779907, "logits/rejected": NaN, "logps/chosen": -488.50390625, "logps/rejected": -365.46209716796875, "loss": 0.4043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6274478435516357, "rewards/margins": 2.828174591064453, "rewards/rejected": -4.455622673034668, "step": 7390 }, { "epoch": 0.7462498424303542, "grad_norm": 102.31511688232422, "learning_rate": 2.539074316829686e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -559.5438232421875, "logps/rejected": -415.5390625, "loss": 0.3455, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.286167621612549, "rewards/margins": 2.654029369354248, "rewards/rejected": -4.940196990966797, "step": 7400 }, { "epoch": 0.7472582881633683, "grad_norm": 32.334259033203125, "learning_rate": 2.5289906221639604e-07, "logits/chosen": 1.2920286655426025, "logits/rejected": NaN, "logps/chosen": -665.0955810546875, "logps/rejected": -378.0648193359375, "loss": 0.263, "rewards/accuracies": 0.875, "rewards/chosen": -0.548233151435852, "rewards/margins": 3.454500198364258, "rewards/rejected": -4.00273323059082, "step": 7410 }, { "epoch": 0.7482667338963822, "grad_norm": 276.6488037109375, "learning_rate": 2.518906927498235e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -616.62451171875, "logps/rejected": -441.8863220214844, "loss": 0.4698, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.758427619934082, "rewards/margins": 2.579624652862549, "rewards/rejected": -4.338051795959473, "step": 7420 }, { "epoch": 0.7492751796293962, "grad_norm": 85.47742462158203, "learning_rate": 2.50882323283251e-07, "logits/chosen": 1.386206030845642, "logits/rejected": NaN, "logps/chosen": -642.064453125, "logps/rejected": -338.9590759277344, "loss": 0.3516, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6096546649932861, "rewards/margins": 2.356110095977783, "rewards/rejected": -3.9657649993896484, "step": 7430 }, { "epoch": 0.7502836253624102, "grad_norm": 126.86175537109375, "learning_rate": 2.498739538166784e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -478.3641052246094, "logps/rejected": -404.3877868652344, "loss": 0.4367, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8057448863983154, "rewards/margins": 2.587409734725952, "rewards/rejected": -4.393155097961426, "step": 7440 }, { "epoch": 0.7512920710954242, "grad_norm": 211.68992614746094, "learning_rate": 2.488655843501059e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.9841918945312, "logps/rejected": -340.3852844238281, "loss": 0.5905, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0794870853424072, "rewards/margins": 2.6695899963378906, "rewards/rejected": -4.749076843261719, "step": 7450 }, { "epoch": 0.7523005168284381, "grad_norm": 121.24835205078125, "learning_rate": 2.478572148835333e-07, "logits/chosen": 1.510718822479248, "logits/rejected": NaN, "logps/chosen": -561.1539306640625, "logps/rejected": -371.6653137207031, "loss": 0.281, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.250438928604126, "rewards/margins": 3.4831290245056152, "rewards/rejected": -4.73356819152832, "step": 7460 }, { "epoch": 0.7533089625614522, "grad_norm": 114.05532836914062, "learning_rate": 2.4684884541696077e-07, "logits/chosen": 1.4387580156326294, "logits/rejected": NaN, "logps/chosen": -624.0140991210938, "logps/rejected": -505.66961669921875, "loss": 0.4115, "rewards/accuracies": 0.75, "rewards/chosen": -1.5114496946334839, "rewards/margins": 2.375624179840088, "rewards/rejected": -3.8870739936828613, "step": 7470 }, { "epoch": 0.7543174082944661, "grad_norm": 138.507080078125, "learning_rate": 2.458404759503882e-07, "logits/chosen": 1.4221339225769043, "logits/rejected": NaN, "logps/chosen": -541.8663330078125, "logps/rejected": -384.7060546875, "loss": 0.3709, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3814576864242554, "rewards/margins": 3.2611656188964844, "rewards/rejected": -4.642623424530029, "step": 7480 }, { "epoch": 0.7553258540274801, "grad_norm": 256.1872253417969, "learning_rate": 2.4483210648381566e-07, "logits/chosen": 1.3628588914871216, "logits/rejected": NaN, "logps/chosen": -521.6887817382812, "logps/rejected": -398.8039855957031, "loss": 0.478, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8987623453140259, "rewards/margins": 2.0570321083068848, "rewards/rejected": -3.9557945728302, "step": 7490 }, { "epoch": 0.7563342997604942, "grad_norm": 74.34307861328125, "learning_rate": 2.438237370172431e-07, "logits/chosen": 1.2709819078445435, "logits/rejected": NaN, "logps/chosen": -456.36944580078125, "logps/rejected": -305.7723693847656, "loss": 0.3863, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.247654914855957, "rewards/margins": 2.8456008434295654, "rewards/rejected": -4.093255519866943, "step": 7500 }, { "epoch": 0.7573427454935081, "grad_norm": 102.28872680664062, "learning_rate": 2.4281536755067055e-07, "logits/chosen": 1.5163867473602295, "logits/rejected": NaN, "logps/chosen": -515.4991455078125, "logps/rejected": -332.25213623046875, "loss": 0.4165, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2907311916351318, "rewards/margins": 3.062706470489502, "rewards/rejected": -4.353437423706055, "step": 7510 }, { "epoch": 0.7583511912265222, "grad_norm": 80.86771392822266, "learning_rate": 2.4180699808409797e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -504.15191650390625, "logps/rejected": -366.73583984375, "loss": 0.4243, "rewards/accuracies": 0.75, "rewards/chosen": -1.4334955215454102, "rewards/margins": 2.180549144744873, "rewards/rejected": -3.6140449047088623, "step": 7520 }, { "epoch": 0.7593596369595361, "grad_norm": 11.521836280822754, "learning_rate": 2.4079862861752544e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -533.41845703125, "logps/rejected": -389.3733825683594, "loss": 0.5222, "rewards/accuracies": 0.75, "rewards/chosen": -2.224616527557373, "rewards/margins": 2.1458468437194824, "rewards/rejected": -4.3704633712768555, "step": 7530 }, { "epoch": 0.7603680826925501, "grad_norm": 4.173572540283203, "learning_rate": 2.397902591509529e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -625.0285034179688, "logps/rejected": -400.3772277832031, "loss": 0.6149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2080460786819458, "rewards/margins": 1.9794355630874634, "rewards/rejected": -3.18748140335083, "step": 7540 }, { "epoch": 0.7613765284255641, "grad_norm": 39.09406280517578, "learning_rate": 2.3878188968438033e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -527.1708984375, "logps/rejected": -341.56292724609375, "loss": 0.2584, "rewards/accuracies": 0.875, "rewards/chosen": -0.7727364301681519, "rewards/margins": 3.139866352081299, "rewards/rejected": -3.9126029014587402, "step": 7550 }, { "epoch": 0.7623849741585781, "grad_norm": 23.646398544311523, "learning_rate": 2.377735202178078e-07, "logits/chosen": 1.325150966644287, "logits/rejected": 1.0885802507400513, "logps/chosen": -586.4305419921875, "logps/rejected": -426.7127380371094, "loss": 0.4188, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.9094429016113281, "rewards/margins": 3.0449697971343994, "rewards/rejected": -4.954412937164307, "step": 7560 }, { "epoch": 0.763393419891592, "grad_norm": 74.90028381347656, "learning_rate": 2.3676515075123525e-07, "logits/chosen": 1.2937357425689697, "logits/rejected": NaN, "logps/chosen": -648.4932861328125, "logps/rejected": -337.02935791015625, "loss": 0.4729, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2374306917190552, "rewards/margins": 2.6235756874084473, "rewards/rejected": -3.861006259918213, "step": 7570 }, { "epoch": 0.7644018656246061, "grad_norm": 74.0202407836914, "learning_rate": 2.357567812846627e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -493.665283203125, "logps/rejected": -333.66680908203125, "loss": 0.6289, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6156609058380127, "rewards/margins": 2.3931541442871094, "rewards/rejected": -4.008814811706543, "step": 7580 }, { "epoch": 0.76541031135762, "grad_norm": 201.4212646484375, "learning_rate": 2.3474841181809014e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -549.6376342773438, "logps/rejected": -356.9134826660156, "loss": 0.8226, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5024398565292358, "rewards/margins": 1.8985761404037476, "rewards/rejected": -3.4010162353515625, "step": 7590 }, { "epoch": 0.766418757090634, "grad_norm": 44.021629333496094, "learning_rate": 2.3374004235151758e-07, "logits/chosen": 1.2668837308883667, "logits/rejected": NaN, "logps/chosen": -556.4466552734375, "logps/rejected": -383.5392150878906, "loss": 0.4977, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3923927545547485, "rewards/margins": 2.9426841735839844, "rewards/rejected": -4.335076808929443, "step": 7600 }, { "epoch": 0.7674272028236481, "grad_norm": 28.258939743041992, "learning_rate": 2.3273167288494503e-07, "logits/chosen": 1.148958683013916, "logits/rejected": NaN, "logps/chosen": -527.5221557617188, "logps/rejected": -342.91143798828125, "loss": 0.3934, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3954365253448486, "rewards/margins": 3.0019686222076416, "rewards/rejected": -4.39740514755249, "step": 7610 }, { "epoch": 0.768435648556662, "grad_norm": 53.501983642578125, "learning_rate": 2.3172330341837247e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -512.8677978515625, "logps/rejected": -457.7418518066406, "loss": 0.3916, "rewards/accuracies": 0.75, "rewards/chosen": -1.2483813762664795, "rewards/margins": 2.647067070007324, "rewards/rejected": -3.8954482078552246, "step": 7620 }, { "epoch": 0.7694440942896761, "grad_norm": 154.58050537109375, "learning_rate": 2.3071493395179992e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -449.853759765625, "logps/rejected": -330.112548828125, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": -1.9930031299591064, "rewards/margins": 3.0677647590637207, "rewards/rejected": -5.06076717376709, "step": 7630 }, { "epoch": 0.77045254002269, "grad_norm": 182.11090087890625, "learning_rate": 2.297065644852274e-07, "logits/chosen": 1.4834855794906616, "logits/rejected": NaN, "logps/chosen": -645.1505126953125, "logps/rejected": -303.25628662109375, "loss": 0.2741, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.5204505920410156, "rewards/margins": 3.091580867767334, "rewards/rejected": -4.61203145980835, "step": 7640 }, { "epoch": 0.771460985755704, "grad_norm": 65.42035675048828, "learning_rate": 2.2869819501865484e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -462.03289794921875, "logps/rejected": -401.0262451171875, "loss": 0.4012, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9669873714447021, "rewards/margins": 2.047100067138672, "rewards/rejected": -4.014087200164795, "step": 7650 }, { "epoch": 0.772469431488718, "grad_norm": 241.79458618164062, "learning_rate": 2.2768982555208228e-07, "logits/chosen": 1.2540092468261719, "logits/rejected": NaN, "logps/chosen": -555.0641479492188, "logps/rejected": -396.2474060058594, "loss": 0.322, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0574710369110107, "rewards/margins": 3.371112108230591, "rewards/rejected": -4.42858362197876, "step": 7660 }, { "epoch": 0.773477877221732, "grad_norm": 216.1240997314453, "learning_rate": 2.2668145608550973e-07, "logits/chosen": 1.2874658107757568, "logits/rejected": NaN, "logps/chosen": -564.5521240234375, "logps/rejected": -413.3180236816406, "loss": 0.5912, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3090660572052002, "rewards/margins": 2.276660442352295, "rewards/rejected": -3.585726499557495, "step": 7670 }, { "epoch": 0.774486322954746, "grad_norm": 141.48443603515625, "learning_rate": 2.2567308661893717e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -519.0718994140625, "logps/rejected": -398.0462646484375, "loss": 0.5944, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0978617668151855, "rewards/margins": 2.2046570777893066, "rewards/rejected": -4.30251932144165, "step": 7680 }, { "epoch": 0.77549476868776, "grad_norm": 82.9463882446289, "learning_rate": 2.2466471715236462e-07, "logits/chosen": 1.3953626155853271, "logits/rejected": NaN, "logps/chosen": -476.53680419921875, "logps/rejected": -332.23370361328125, "loss": 0.5017, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.774409294128418, "rewards/margins": 2.033280849456787, "rewards/rejected": -3.807690382003784, "step": 7690 }, { "epoch": 0.776503214420774, "grad_norm": 92.70377349853516, "learning_rate": 2.2365634768579206e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -385.44561767578125, "logps/rejected": -427.9739685058594, "loss": 0.9281, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.6614081859588623, "rewards/margins": 1.1610486507415771, "rewards/rejected": -3.8224570751190186, "step": 7700 }, { "epoch": 0.7775116601537879, "grad_norm": 51.040279388427734, "learning_rate": 2.226479782192195e-07, "logits/chosen": 1.4248685836791992, "logits/rejected": NaN, "logps/chosen": -525.4703979492188, "logps/rejected": -312.01824951171875, "loss": 0.6385, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5609959363937378, "rewards/margins": 2.18875789642334, "rewards/rejected": -3.749753952026367, "step": 7710 }, { "epoch": 0.778520105886802, "grad_norm": 54.973506927490234, "learning_rate": 2.2163960875264695e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -662.613037109375, "logps/rejected": -407.2256164550781, "loss": 0.4617, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8849870562553406, "rewards/margins": 2.558242082595825, "rewards/rejected": -3.4432289600372314, "step": 7720 }, { "epoch": 0.7795285516198159, "grad_norm": 124.40599060058594, "learning_rate": 2.206312392860744e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -510.08978271484375, "logps/rejected": -492.24603271484375, "loss": 0.3097, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.7318106889724731, "rewards/margins": 2.701977252960205, "rewards/rejected": -3.4337878227233887, "step": 7730 }, { "epoch": 0.78053699735283, "grad_norm": 124.02118682861328, "learning_rate": 2.1962286981950184e-07, "logits/chosen": 1.5536335706710815, "logits/rejected": NaN, "logps/chosen": -669.4107055664062, "logps/rejected": -368.49334716796875, "loss": 0.4299, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8551338911056519, "rewards/margins": 2.4927706718444824, "rewards/rejected": -3.347904682159424, "step": 7740 }, { "epoch": 0.781545443085844, "grad_norm": 30.808317184448242, "learning_rate": 2.1861450035292932e-07, "logits/chosen": 1.1557958126068115, "logits/rejected": NaN, "logps/chosen": -593.427734375, "logps/rejected": -358.6719055175781, "loss": 0.4513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7418386936187744, "rewards/margins": 2.4279141426086426, "rewards/rejected": -4.169752597808838, "step": 7750 }, { "epoch": 0.7825538888188579, "grad_norm": 312.8758239746094, "learning_rate": 2.1760613088635676e-07, "logits/chosen": 1.4294843673706055, "logits/rejected": NaN, "logps/chosen": -609.7586059570312, "logps/rejected": -284.0500793457031, "loss": 0.4104, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.432022213935852, "rewards/margins": 2.2183408737182617, "rewards/rejected": -3.6503632068634033, "step": 7760 }, { "epoch": 0.783562334551872, "grad_norm": 158.09494018554688, "learning_rate": 2.165977614197842e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -524.121337890625, "logps/rejected": -363.2555236816406, "loss": 0.4828, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2588969469070435, "rewards/margins": 2.886733293533325, "rewards/rejected": -4.1456298828125, "step": 7770 }, { "epoch": 0.7845707802848859, "grad_norm": 218.08108520507812, "learning_rate": 2.1558939195321165e-07, "logits/chosen": 1.605324387550354, "logits/rejected": NaN, "logps/chosen": -606.8055419921875, "logps/rejected": -404.5296325683594, "loss": 0.5221, "rewards/accuracies": 0.75, "rewards/chosen": -1.2582685947418213, "rewards/margins": 2.2435765266418457, "rewards/rejected": -3.501845121383667, "step": 7780 }, { "epoch": 0.7855792260178999, "grad_norm": 50.87548828125, "learning_rate": 2.145810224866391e-07, "logits/chosen": 1.1142985820770264, "logits/rejected": NaN, "logps/chosen": -495.1842346191406, "logps/rejected": -371.9150390625, "loss": 0.5467, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.828421950340271, "rewards/margins": 1.9782183170318604, "rewards/rejected": -3.806640625, "step": 7790 }, { "epoch": 0.7865876717509139, "grad_norm": 124.46637725830078, "learning_rate": 2.1357265302006654e-07, "logits/chosen": 1.4716176986694336, "logits/rejected": NaN, "logps/chosen": -526.0120239257812, "logps/rejected": -375.25408935546875, "loss": 0.508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.8266643285751343, "rewards/margins": 1.9933960437774658, "rewards/rejected": -3.8200607299804688, "step": 7800 }, { "epoch": 0.7875961174839279, "grad_norm": 189.39585876464844, "learning_rate": 2.12564283553494e-07, "logits/chosen": 1.44776451587677, "logits/rejected": NaN, "logps/chosen": -572.4915771484375, "logps/rejected": -371.0738830566406, "loss": 0.4126, "rewards/accuracies": 0.75, "rewards/chosen": -0.9381662607192993, "rewards/margins": 1.959282636642456, "rewards/rejected": -2.897449016571045, "step": 7810 }, { "epoch": 0.7886045632169418, "grad_norm": 176.40945434570312, "learning_rate": 2.1155591408692143e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -664.86474609375, "logps/rejected": -385.84063720703125, "loss": 0.2951, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.45881277322769165, "rewards/margins": 2.978999137878418, "rewards/rejected": -3.437812328338623, "step": 7820 }, { "epoch": 0.7896130089499559, "grad_norm": 72.30253601074219, "learning_rate": 2.1054754462034888e-07, "logits/chosen": 1.2233315706253052, "logits/rejected": 1.3288192749023438, "logps/chosen": -480.31805419921875, "logps/rejected": -392.42974853515625, "loss": 0.6007, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9192899465560913, "rewards/margins": 2.4784998893737793, "rewards/rejected": -4.39778995513916, "step": 7830 }, { "epoch": 0.7906214546829698, "grad_norm": 177.0711212158203, "learning_rate": 2.0953917515377635e-07, "logits/chosen": 1.4234563112258911, "logits/rejected": NaN, "logps/chosen": -649.5838623046875, "logps/rejected": -367.64263916015625, "loss": 0.3557, "rewards/accuracies": 0.75, "rewards/chosen": -0.37149664759635925, "rewards/margins": 2.7606444358825684, "rewards/rejected": -3.13214111328125, "step": 7840 }, { "epoch": 0.7916299004159839, "grad_norm": 51.72471618652344, "learning_rate": 2.0853080568720377e-07, "logits/chosen": 1.1864405870437622, "logits/rejected": NaN, "logps/chosen": -569.1951904296875, "logps/rejected": -306.24896240234375, "loss": 0.3684, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8000138998031616, "rewards/margins": 2.6416513919830322, "rewards/rejected": -4.441665172576904, "step": 7850 }, { "epoch": 0.7926383461489979, "grad_norm": 118.53602600097656, "learning_rate": 2.0752243622063124e-07, "logits/chosen": 1.2984447479248047, "logits/rejected": NaN, "logps/chosen": -702.3104248046875, "logps/rejected": -416.5572814941406, "loss": 0.3616, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5438877940177917, "rewards/margins": 2.7435903549194336, "rewards/rejected": -3.287477970123291, "step": 7860 }, { "epoch": 0.7936467918820118, "grad_norm": 98.3089370727539, "learning_rate": 2.065140667540587e-07, "logits/chosen": 1.1101152896881104, "logits/rejected": NaN, "logps/chosen": -566.5203247070312, "logps/rejected": -274.1617431640625, "loss": 0.3671, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0507484674453735, "rewards/margins": 2.0250906944274902, "rewards/rejected": -3.0758392810821533, "step": 7870 }, { "epoch": 0.7946552376150259, "grad_norm": 56.01569366455078, "learning_rate": 2.0550569728748613e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -443.2359924316406, "logps/rejected": -428.4798278808594, "loss": 0.3231, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3114516735076904, "rewards/margins": 2.947496175765991, "rewards/rejected": -4.25894832611084, "step": 7880 }, { "epoch": 0.7956636833480398, "grad_norm": 125.52682495117188, "learning_rate": 2.0449732782091358e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -518.1143188476562, "logps/rejected": -440.58306884765625, "loss": 0.6667, "rewards/accuracies": 0.75, "rewards/chosen": -1.4093632698059082, "rewards/margins": 2.656242609024048, "rewards/rejected": -4.065606117248535, "step": 7890 }, { "epoch": 0.7966721290810538, "grad_norm": 97.47962188720703, "learning_rate": 2.0348895835434102e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -513.7648315429688, "logps/rejected": -354.33404541015625, "loss": 0.4101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5012996196746826, "rewards/margins": 2.2870612144470215, "rewards/rejected": -3.788361072540283, "step": 7900 }, { "epoch": 0.7976805748140678, "grad_norm": 58.481597900390625, "learning_rate": 2.0248058888776847e-07, "logits/chosen": 1.4327654838562012, "logits/rejected": NaN, "logps/chosen": -589.8261108398438, "logps/rejected": -324.690673828125, "loss": 0.5795, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6772197484970093, "rewards/margins": 2.4824156761169434, "rewards/rejected": -3.159635543823242, "step": 7910 }, { "epoch": 0.7986890205470818, "grad_norm": 3.9643375873565674, "learning_rate": 2.0147221942119591e-07, "logits/chosen": 1.5454295873641968, "logits/rejected": NaN, "logps/chosen": -559.5291137695312, "logps/rejected": -336.30487060546875, "loss": 0.4645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6180394887924194, "rewards/margins": 2.3961148262023926, "rewards/rejected": -4.014153480529785, "step": 7920 }, { "epoch": 0.7996974662800957, "grad_norm": 269.73858642578125, "learning_rate": 2.0046384995462336e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -571.8561401367188, "logps/rejected": -392.6446228027344, "loss": 0.3967, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2942414283752441, "rewards/margins": 2.8885533809661865, "rewards/rejected": -4.182794570922852, "step": 7930 }, { "epoch": 0.8007059120131098, "grad_norm": 34.85377883911133, "learning_rate": 1.994554804880508e-07, "logits/chosen": NaN, "logits/rejected": 1.370593786239624, "logps/chosen": -446.0674743652344, "logps/rejected": -392.88800048828125, "loss": 0.3418, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.752575159072876, "rewards/margins": 2.5257811546325684, "rewards/rejected": -4.278356075286865, "step": 7940 }, { "epoch": 0.8017143577461238, "grad_norm": 174.3632049560547, "learning_rate": 1.9844711102147828e-07, "logits/chosen": 1.1691820621490479, "logits/rejected": NaN, "logps/chosen": -498.49725341796875, "logps/rejected": -373.37408447265625, "loss": 0.2937, "rewards/accuracies": 0.875, "rewards/chosen": -1.4601024389266968, "rewards/margins": 3.646409273147583, "rewards/rejected": -5.10651159286499, "step": 7950 }, { "epoch": 0.8027228034791378, "grad_norm": 138.26513671875, "learning_rate": 1.974387415549057e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -501.12872314453125, "logps/rejected": -394.46685791015625, "loss": 0.2688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0983667373657227, "rewards/margins": 3.2050979137420654, "rewards/rejected": -4.303464412689209, "step": 7960 }, { "epoch": 0.8037312492121518, "grad_norm": 13.149581909179688, "learning_rate": 1.9643037208833317e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -554.7947387695312, "logps/rejected": -398.8082580566406, "loss": 0.4669, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2501085996627808, "rewards/margins": 2.83133864402771, "rewards/rejected": -4.081447124481201, "step": 7970 }, { "epoch": 0.8047396949451657, "grad_norm": 2.4320168495178223, "learning_rate": 1.954220026217606e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -530.6738891601562, "logps/rejected": -348.2565002441406, "loss": 0.2448, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1831125020980835, "rewards/margins": 3.1641268730163574, "rewards/rejected": -4.3472394943237305, "step": 7980 }, { "epoch": 0.8057481406781798, "grad_norm": 126.152587890625, "learning_rate": 1.9441363315518806e-07, "logits/chosen": 1.3015731573104858, "logits/rejected": NaN, "logps/chosen": -511.9236755371094, "logps/rejected": -314.6552734375, "loss": 0.5491, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6252765655517578, "rewards/margins": 2.1115691661834717, "rewards/rejected": -3.7368457317352295, "step": 7990 }, { "epoch": 0.8067565864111937, "grad_norm": 135.28199768066406, "learning_rate": 1.934052636886155e-07, "logits/chosen": 1.212005376815796, "logits/rejected": 1.2587730884552002, "logps/chosen": -541.5305786132812, "logps/rejected": -447.4697265625, "loss": 0.6352, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2665176391601562, "rewards/margins": 2.68599009513855, "rewards/rejected": -4.952507972717285, "step": 8000 }, { "epoch": 0.8077650321442077, "grad_norm": 13.08143138885498, "learning_rate": 1.9239689422204295e-07, "logits/chosen": 1.255826711654663, "logits/rejected": NaN, "logps/chosen": -513.0394897460938, "logps/rejected": -395.6827697753906, "loss": 0.5296, "rewards/accuracies": 0.75, "rewards/chosen": -2.244904041290283, "rewards/margins": 2.7955563068389893, "rewards/rejected": -5.040460586547852, "step": 8010 }, { "epoch": 0.8087734778772218, "grad_norm": 251.3086395263672, "learning_rate": 1.913885247554704e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -469.97418212890625, "logps/rejected": -442.24609375, "loss": 0.4271, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.124461054801941, "rewards/margins": 3.1335957050323486, "rewards/rejected": -4.258056640625, "step": 8020 }, { "epoch": 0.8097819236102357, "grad_norm": 303.0198669433594, "learning_rate": 1.9038015528889784e-07, "logits/chosen": 1.2517106533050537, "logits/rejected": NaN, "logps/chosen": -551.8768920898438, "logps/rejected": -328.2702941894531, "loss": 0.4973, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6441055536270142, "rewards/margins": 2.6610116958618164, "rewards/rejected": -4.305117607116699, "step": 8030 }, { "epoch": 0.8107903693432497, "grad_norm": 8.285945892333984, "learning_rate": 1.8937178582232528e-07, "logits/chosen": 1.4363871812820435, "logits/rejected": NaN, "logps/chosen": -543.33935546875, "logps/rejected": -367.1376037597656, "loss": 0.4769, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2011969089508057, "rewards/margins": 2.4650113582611084, "rewards/rejected": -3.666208267211914, "step": 8040 }, { "epoch": 0.8117988150762637, "grad_norm": 33.888587951660156, "learning_rate": 1.8836341635575273e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -514.8185424804688, "logps/rejected": -383.6392517089844, "loss": 0.311, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.090641736984253, "rewards/margins": 2.750021457672119, "rewards/rejected": -3.840662717819214, "step": 8050 }, { "epoch": 0.8128072608092777, "grad_norm": 181.0938262939453, "learning_rate": 1.873550468891802e-07, "logits/chosen": 1.228882074356079, "logits/rejected": NaN, "logps/chosen": -513.5469970703125, "logps/rejected": -338.71148681640625, "loss": 0.4418, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6686341762542725, "rewards/margins": 2.9657700061798096, "rewards/rejected": -4.634404182434082, "step": 8060 }, { "epoch": 0.8138157065422917, "grad_norm": 86.51502227783203, "learning_rate": 1.8634667742260762e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -471.6944274902344, "logps/rejected": -456.6875915527344, "loss": 0.5369, "rewards/accuracies": 0.75, "rewards/chosen": -1.508042335510254, "rewards/margins": 2.5296542644500732, "rewards/rejected": -4.037696838378906, "step": 8070 }, { "epoch": 0.8148241522753057, "grad_norm": 141.82571411132812, "learning_rate": 1.853383079560351e-07, "logits/chosen": 1.2851946353912354, "logits/rejected": NaN, "logps/chosen": -594.3504638671875, "logps/rejected": -330.6008605957031, "loss": 0.4117, "rewards/accuracies": 0.75, "rewards/chosen": -1.963446021080017, "rewards/margins": 2.904629945755005, "rewards/rejected": -4.868076324462891, "step": 8080 }, { "epoch": 0.8158325980083196, "grad_norm": 104.56779479980469, "learning_rate": 1.8432993848946254e-07, "logits/chosen": 1.490864872932434, "logits/rejected": NaN, "logps/chosen": -707.23681640625, "logps/rejected": -390.23333740234375, "loss": 0.321, "rewards/accuracies": 0.75, "rewards/chosen": -0.705654501914978, "rewards/margins": 3.087526559829712, "rewards/rejected": -3.7931809425354004, "step": 8090 }, { "epoch": 0.8168410437413337, "grad_norm": 219.0856475830078, "learning_rate": 1.8332156902288998e-07, "logits/chosen": 1.4323586225509644, "logits/rejected": NaN, "logps/chosen": -591.95703125, "logps/rejected": -409.02288818359375, "loss": 0.7244, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2378268241882324, "rewards/margins": 2.3225529193878174, "rewards/rejected": -4.560379981994629, "step": 8100 }, { "epoch": 0.8178494894743477, "grad_norm": 3.4652955532073975, "learning_rate": 1.8231319955631743e-07, "logits/chosen": 1.124257206916809, "logits/rejected": NaN, "logps/chosen": -511.77740478515625, "logps/rejected": -387.05462646484375, "loss": 0.5879, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5306557416915894, "rewards/margins": 2.4167442321777344, "rewards/rejected": -3.947399854660034, "step": 8110 }, { "epoch": 0.8188579352073616, "grad_norm": 187.7091064453125, "learning_rate": 1.8130483008974487e-07, "logits/chosen": 1.236798882484436, "logits/rejected": 1.0457655191421509, "logps/chosen": -521.9114990234375, "logps/rejected": -336.33306884765625, "loss": 0.723, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2079739570617676, "rewards/margins": 2.0932843685150146, "rewards/rejected": -4.3012590408325195, "step": 8120 }, { "epoch": 0.8198663809403757, "grad_norm": 23.66530990600586, "learning_rate": 1.8029646062317232e-07, "logits/chosen": 1.2586616277694702, "logits/rejected": NaN, "logps/chosen": -601.6995239257812, "logps/rejected": -409.310546875, "loss": 0.4285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8036762475967407, "rewards/margins": 2.5148329734802246, "rewards/rejected": -4.318509101867676, "step": 8130 }, { "epoch": 0.8208748266733896, "grad_norm": 62.792808532714844, "learning_rate": 1.7928809115659977e-07, "logits/chosen": 1.318830966949463, "logits/rejected": NaN, "logps/chosen": -497.4949645996094, "logps/rejected": -335.09637451171875, "loss": 0.7889, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1403074264526367, "rewards/margins": 1.3942108154296875, "rewards/rejected": -3.534518003463745, "step": 8140 }, { "epoch": 0.8218832724064036, "grad_norm": 19.618398666381836, "learning_rate": 1.782797216900272e-07, "logits/chosen": 1.5507906675338745, "logits/rejected": NaN, "logps/chosen": -589.4019775390625, "logps/rejected": -406.50042724609375, "loss": 0.5281, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5916690826416016, "rewards/margins": 2.423452377319336, "rewards/rejected": -4.0151214599609375, "step": 8150 }, { "epoch": 0.8228917181394176, "grad_norm": 70.81237030029297, "learning_rate": 1.7727135222345466e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -522.9805908203125, "logps/rejected": -425.14532470703125, "loss": 0.4382, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8748185634613037, "rewards/margins": 2.1948275566101074, "rewards/rejected": -4.069645881652832, "step": 8160 }, { "epoch": 0.8239001638724316, "grad_norm": 131.76499938964844, "learning_rate": 1.7626298275688213e-07, "logits/chosen": 1.4998846054077148, "logits/rejected": NaN, "logps/chosen": -566.4273681640625, "logps/rejected": -353.4342346191406, "loss": 0.4532, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4530006647109985, "rewards/margins": 2.4603967666625977, "rewards/rejected": -3.9133973121643066, "step": 8170 }, { "epoch": 0.8249086096054457, "grad_norm": 19.031557083129883, "learning_rate": 1.7525461329030955e-07, "logits/chosen": 1.0798604488372803, "logits/rejected": NaN, "logps/chosen": -572.9620361328125, "logps/rejected": -377.7325439453125, "loss": 0.4431, "rewards/accuracies": 0.75, "rewards/chosen": -1.615537405014038, "rewards/margins": 2.2389626502990723, "rewards/rejected": -3.8544998168945312, "step": 8180 }, { "epoch": 0.8259170553384596, "grad_norm": 4.246763229370117, "learning_rate": 1.7424624382373702e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -597.414794921875, "logps/rejected": -473.6275939941406, "loss": 0.4094, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7034368515014648, "rewards/margins": 3.4176509380340576, "rewards/rejected": -5.121088027954102, "step": 8190 }, { "epoch": 0.8269255010714736, "grad_norm": 30.14617919921875, "learning_rate": 1.7323787435716446e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -748.19091796875, "logps/rejected": -378.2371826171875, "loss": 0.5293, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8194640874862671, "rewards/margins": 2.952305316925049, "rewards/rejected": -3.7717692852020264, "step": 8200 }, { "epoch": 0.8279339468044876, "grad_norm": 227.75161743164062, "learning_rate": 1.722295048905919e-07, "logits/chosen": 1.4204576015472412, "logits/rejected": NaN, "logps/chosen": -627.1295166015625, "logps/rejected": -431.9143981933594, "loss": 0.6461, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0076513290405273, "rewards/margins": 2.6818063259124756, "rewards/rejected": -4.689457893371582, "step": 8210 }, { "epoch": 0.8289423925375016, "grad_norm": 82.74388885498047, "learning_rate": 1.7122113542401935e-07, "logits/chosen": 1.2979477643966675, "logits/rejected": NaN, "logps/chosen": -634.6468505859375, "logps/rejected": -415.310546875, "loss": 0.3513, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1884150505065918, "rewards/margins": 3.311117649078369, "rewards/rejected": -4.499532699584961, "step": 8220 }, { "epoch": 0.8299508382705155, "grad_norm": 66.27422332763672, "learning_rate": 1.702127659574468e-07, "logits/chosen": 1.3187530040740967, "logits/rejected": NaN, "logps/chosen": -542.83447265625, "logps/rejected": -322.1431579589844, "loss": 0.2545, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1877866983413696, "rewards/margins": 3.333501100540161, "rewards/rejected": -4.521287441253662, "step": 8230 }, { "epoch": 0.8309592840035296, "grad_norm": 196.0778045654297, "learning_rate": 1.6920439649087425e-07, "logits/chosen": 1.305849313735962, "logits/rejected": NaN, "logps/chosen": -591.6474609375, "logps/rejected": -288.0565185546875, "loss": 0.7701, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6233844757080078, "rewards/margins": 1.7112886905670166, "rewards/rejected": -3.3346734046936035, "step": 8240 }, { "epoch": 0.8319677297365435, "grad_norm": 96.40045166015625, "learning_rate": 1.681960270243017e-07, "logits/chosen": 1.3794500827789307, "logits/rejected": NaN, "logps/chosen": -496.15972900390625, "logps/rejected": -381.20477294921875, "loss": 0.5709, "rewards/accuracies": 0.625, "rewards/chosen": -2.7540652751922607, "rewards/margins": 2.0188097953796387, "rewards/rejected": -4.7728753089904785, "step": 8250 }, { "epoch": 0.8329761754695575, "grad_norm": 29.227581024169922, "learning_rate": 1.6718765755772916e-07, "logits/chosen": 1.277820110321045, "logits/rejected": NaN, "logps/chosen": -650.5760498046875, "logps/rejected": -428.5926818847656, "loss": 0.3455, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0900661945343018, "rewards/margins": 2.3653769493103027, "rewards/rejected": -3.4554431438446045, "step": 8260 }, { "epoch": 0.8339846212025716, "grad_norm": 121.17410278320312, "learning_rate": 1.6617928809115658e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -500.3058166503906, "logps/rejected": -440.67034912109375, "loss": 0.8906, "rewards/accuracies": 0.625, "rewards/chosen": -1.803006887435913, "rewards/margins": 1.4897782802581787, "rewards/rejected": -3.2927849292755127, "step": 8270 }, { "epoch": 0.8349930669355855, "grad_norm": 2.8291051387786865, "learning_rate": 1.6517091862458405e-07, "logits/chosen": 1.5246641635894775, "logits/rejected": NaN, "logps/chosen": -588.2095947265625, "logps/rejected": -328.9344787597656, "loss": 0.383, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7683519124984741, "rewards/margins": 2.6751770973205566, "rewards/rejected": -4.443528652191162, "step": 8280 }, { "epoch": 0.8360015126685996, "grad_norm": 152.87876892089844, "learning_rate": 1.6416254915801147e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -470.2386779785156, "logps/rejected": -408.6388244628906, "loss": 0.5212, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8684698343276978, "rewards/margins": 2.412717580795288, "rewards/rejected": -4.281187057495117, "step": 8290 }, { "epoch": 0.8370099584016135, "grad_norm": 207.78347778320312, "learning_rate": 1.6315417969143894e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -619.1025390625, "logps/rejected": -409.36370849609375, "loss": 0.3749, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8963649272918701, "rewards/margins": 2.601672649383545, "rewards/rejected": -4.498037815093994, "step": 8300 }, { "epoch": 0.8380184041346275, "grad_norm": 111.7822494506836, "learning_rate": 1.621458102248664e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -480.3690490722656, "logps/rejected": -314.6206359863281, "loss": 0.5124, "rewards/accuracies": 0.75, "rewards/chosen": -1.520829200744629, "rewards/margins": 2.5987565517425537, "rewards/rejected": -4.119585990905762, "step": 8310 }, { "epoch": 0.8390268498676415, "grad_norm": 115.1094741821289, "learning_rate": 1.6113744075829384e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -726.0597534179688, "logps/rejected": -419.36083984375, "loss": 0.3964, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.7559757828712463, "rewards/margins": 2.673915386199951, "rewards/rejected": -3.4298903942108154, "step": 8320 }, { "epoch": 0.8400352956006555, "grad_norm": 11.211020469665527, "learning_rate": 1.6012907129172128e-07, "logits/chosen": 1.59041166305542, "logits/rejected": 1.3823574781417847, "logps/chosen": -538.17529296875, "logps/rejected": -387.9873962402344, "loss": 0.3945, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3141148090362549, "rewards/margins": 3.5412395000457764, "rewards/rejected": -4.8553547859191895, "step": 8330 }, { "epoch": 0.8410437413336694, "grad_norm": 122.51263427734375, "learning_rate": 1.5912070182514873e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -522.8004150390625, "logps/rejected": -363.5242614746094, "loss": 0.2659, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1836879253387451, "rewards/margins": 3.012805938720703, "rewards/rejected": -4.196493148803711, "step": 8340 }, { "epoch": 0.8420521870666835, "grad_norm": 208.3848114013672, "learning_rate": 1.5811233235857617e-07, "logits/chosen": 1.348746657371521, "logits/rejected": NaN, "logps/chosen": -540.8243408203125, "logps/rejected": -331.22918701171875, "loss": 0.646, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.9912761449813843, "rewards/margins": 2.5247321128845215, "rewards/rejected": -4.516008377075195, "step": 8350 }, { "epoch": 0.8430606327996975, "grad_norm": 35.90250778198242, "learning_rate": 1.5710396289200362e-07, "logits/chosen": 1.1361439228057861, "logits/rejected": 1.1984970569610596, "logps/chosen": -454.05438232421875, "logps/rejected": -296.5288391113281, "loss": 0.4079, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0193028450012207, "rewards/margins": 2.209594249725342, "rewards/rejected": -4.228897571563721, "step": 8360 }, { "epoch": 0.8440690785327114, "grad_norm": 95.97454071044922, "learning_rate": 1.560955934254311e-07, "logits/chosen": 1.2267601490020752, "logits/rejected": NaN, "logps/chosen": -523.6048583984375, "logps/rejected": -343.2007141113281, "loss": 0.4328, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.062736749649048, "rewards/margins": 2.790703296661377, "rewards/rejected": -4.853440284729004, "step": 8370 }, { "epoch": 0.8450775242657255, "grad_norm": 289.0679931640625, "learning_rate": 1.550872239588585e-07, "logits/chosen": 1.5604432821273804, "logits/rejected": NaN, "logps/chosen": -582.113525390625, "logps/rejected": -314.300048828125, "loss": 0.8434, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.8267650604248047, "rewards/margins": 1.4889771938323975, "rewards/rejected": -4.315742015838623, "step": 8380 }, { "epoch": 0.8460859699987394, "grad_norm": 179.50872802734375, "learning_rate": 1.5407885449228598e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -646.34716796875, "logps/rejected": -389.22283935546875, "loss": 0.3741, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9640658497810364, "rewards/margins": 2.4505443572998047, "rewards/rejected": -3.4146103858947754, "step": 8390 }, { "epoch": 0.8470944157317535, "grad_norm": 116.79316711425781, "learning_rate": 1.530704850257134e-07, "logits/chosen": 1.3261789083480835, "logits/rejected": NaN, "logps/chosen": -592.0455932617188, "logps/rejected": -282.3583679199219, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -1.6400206089019775, "rewards/margins": 2.640972852706909, "rewards/rejected": -4.280993938446045, "step": 8400 }, { "epoch": 0.8481028614647674, "grad_norm": 13.27502155303955, "learning_rate": 1.5206211555914087e-07, "logits/chosen": 1.361731767654419, "logits/rejected": NaN, "logps/chosen": -559.0557861328125, "logps/rejected": -398.6104736328125, "loss": 0.5224, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1913673877716064, "rewards/margins": 2.8971219062805176, "rewards/rejected": -4.088489055633545, "step": 8410 }, { "epoch": 0.8491113071977814, "grad_norm": 2.3290085792541504, "learning_rate": 1.5105374609256832e-07, "logits/chosen": 1.3517669439315796, "logits/rejected": NaN, "logps/chosen": -494.0711364746094, "logps/rejected": -327.96319580078125, "loss": 0.6071, "rewards/accuracies": 0.625, "rewards/chosen": -2.2014102935791016, "rewards/margins": 2.0245325565338135, "rewards/rejected": -4.225943088531494, "step": 8420 }, { "epoch": 0.8501197529307954, "grad_norm": 292.47015380859375, "learning_rate": 1.5004537662599576e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -474.76385498046875, "logps/rejected": -435.83160400390625, "loss": 0.5605, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4733778238296509, "rewards/margins": 2.167741298675537, "rewards/rejected": -3.6411190032958984, "step": 8430 }, { "epoch": 0.8511281986638094, "grad_norm": 26.384361267089844, "learning_rate": 1.490370071594232e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -608.834228515625, "logps/rejected": -519.8109741210938, "loss": 0.5007, "rewards/accuracies": 0.75, "rewards/chosen": -1.530571460723877, "rewards/margins": 2.514812469482422, "rewards/rejected": -4.045383930206299, "step": 8440 }, { "epoch": 0.8521366443968234, "grad_norm": 0.08604435622692108, "learning_rate": 1.4802863769285065e-07, "logits/chosen": 1.3642421960830688, "logits/rejected": NaN, "logps/chosen": -566.4822387695312, "logps/rejected": -429.36138916015625, "loss": 0.5184, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3738213777542114, "rewards/margins": 3.4914183616638184, "rewards/rejected": -4.86523962020874, "step": 8450 }, { "epoch": 0.8531450901298374, "grad_norm": 224.59405517578125, "learning_rate": 1.470202682262781e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -468.2710876464844, "logps/rejected": -414.88427734375, "loss": 0.647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3115627765655518, "rewards/margins": 2.1710925102233887, "rewards/rejected": -3.4826552867889404, "step": 8460 }, { "epoch": 0.8541535358628514, "grad_norm": 52.56248092651367, "learning_rate": 1.4601189875970554e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -568.4177856445312, "logps/rejected": -365.6427917480469, "loss": 0.5533, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.996789276599884, "rewards/margins": 2.4560415744781494, "rewards/rejected": -3.4528307914733887, "step": 8470 }, { "epoch": 0.8551619815958653, "grad_norm": 6.229449272155762, "learning_rate": 1.4500352929313301e-07, "logits/chosen": 1.4022973775863647, "logits/rejected": NaN, "logps/chosen": -661.896240234375, "logps/rejected": -293.3689880371094, "loss": 0.3121, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3956620693206787, "rewards/margins": 2.3570797443389893, "rewards/rejected": -3.752741575241089, "step": 8480 }, { "epoch": 0.8561704273288794, "grad_norm": 6.79275369644165, "learning_rate": 1.4399515982656043e-07, "logits/chosen": NaN, "logits/rejected": 1.5365525484085083, "logps/chosen": -436.3828125, "logps/rejected": -461.59808349609375, "loss": 0.3429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6638362407684326, "rewards/margins": 2.430692434310913, "rewards/rejected": -4.094529151916504, "step": 8490 }, { "epoch": 0.8571788730618933, "grad_norm": 138.2994842529297, "learning_rate": 1.429867903599879e-07, "logits/chosen": 1.3129445314407349, "logits/rejected": NaN, "logps/chosen": -527.8194580078125, "logps/rejected": -346.9879150390625, "loss": 0.3163, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0167725086212158, "rewards/margins": 2.4063327312469482, "rewards/rejected": -3.423104763031006, "step": 8500 }, { "epoch": 0.8581873187949074, "grad_norm": 202.46888732910156, "learning_rate": 1.4197842089341532e-07, "logits/chosen": 1.3857146501541138, "logits/rejected": NaN, "logps/chosen": -493.88519287109375, "logps/rejected": -438.23114013671875, "loss": 0.408, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0860286951065063, "rewards/margins": 3.183978796005249, "rewards/rejected": -4.270007133483887, "step": 8510 }, { "epoch": 0.8591957645279213, "grad_norm": 66.49409484863281, "learning_rate": 1.409700514268428e-07, "logits/chosen": 1.5267709493637085, "logits/rejected": NaN, "logps/chosen": -528.7257690429688, "logps/rejected": -320.33184814453125, "loss": 0.3318, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0112366676330566, "rewards/margins": 3.1728222370147705, "rewards/rejected": -4.184058666229248, "step": 8520 }, { "epoch": 0.8602042102609353, "grad_norm": 104.89572143554688, "learning_rate": 1.3996168196027024e-07, "logits/chosen": 1.5368298292160034, "logits/rejected": NaN, "logps/chosen": -623.4102783203125, "logps/rejected": -327.61163330078125, "loss": 0.6985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1255433559417725, "rewards/margins": 1.8909581899642944, "rewards/rejected": -3.0165019035339355, "step": 8530 }, { "epoch": 0.8612126559939494, "grad_norm": 30.62982940673828, "learning_rate": 1.3895331249369769e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -511.48834228515625, "logps/rejected": -342.35528564453125, "loss": 0.6285, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5849393606185913, "rewards/margins": 2.245983839035034, "rewards/rejected": -3.830923557281494, "step": 8540 }, { "epoch": 0.8622211017269633, "grad_norm": 209.38198852539062, "learning_rate": 1.3794494302712513e-07, "logits/chosen": 1.4750689268112183, "logits/rejected": NaN, "logps/chosen": -560.9714965820312, "logps/rejected": -418.8634338378906, "loss": 0.6024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8114614486694336, "rewards/margins": 2.0718631744384766, "rewards/rejected": -3.883324384689331, "step": 8550 }, { "epoch": 0.8632295474599773, "grad_norm": 204.8344268798828, "learning_rate": 1.3693657356055258e-07, "logits/chosen": 1.5387074947357178, "logits/rejected": NaN, "logps/chosen": -682.771240234375, "logps/rejected": -348.6722106933594, "loss": 0.4615, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8892038464546204, "rewards/margins": 3.295124053955078, "rewards/rejected": -4.184328079223633, "step": 8560 }, { "epoch": 0.8642379931929913, "grad_norm": 4.446771144866943, "learning_rate": 1.3592820409398005e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -515.9931030273438, "logps/rejected": -368.46148681640625, "loss": 0.5713, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0005111694335938, "rewards/margins": 1.9527114629745483, "rewards/rejected": -3.9532227516174316, "step": 8570 }, { "epoch": 0.8652464389260053, "grad_norm": 2.9560813903808594, "learning_rate": 1.3491983462740747e-07, "logits/chosen": 1.2693067789077759, "logits/rejected": NaN, "logps/chosen": -438.1006774902344, "logps/rejected": -307.5976257324219, "loss": 0.3482, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.42154061794281, "rewards/margins": 3.389305591583252, "rewards/rejected": -4.810846328735352, "step": 8580 }, { "epoch": 0.8662548846590193, "grad_norm": 9.299932479858398, "learning_rate": 1.3391146516083494e-07, "logits/chosen": 1.3644795417785645, "logits/rejected": NaN, "logps/chosen": -508.4331970214844, "logps/rejected": -272.9469299316406, "loss": 0.4012, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1286704540252686, "rewards/margins": 2.959615468978882, "rewards/rejected": -4.08828592300415, "step": 8590 }, { "epoch": 0.8672633303920333, "grad_norm": 259.32958984375, "learning_rate": 1.3290309569426236e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -609.2679443359375, "logps/rejected": -375.79718017578125, "loss": 0.382, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1886597871780396, "rewards/margins": 2.2796826362609863, "rewards/rejected": -3.4683425426483154, "step": 8600 }, { "epoch": 0.8682717761250472, "grad_norm": 193.69158935546875, "learning_rate": 1.3189472622768983e-07, "logits/chosen": 1.2013041973114014, "logits/rejected": NaN, "logps/chosen": -563.8181762695312, "logps/rejected": -320.06402587890625, "loss": 0.4115, "rewards/accuracies": 0.75, "rewards/chosen": -1.6035617589950562, "rewards/margins": 2.654860019683838, "rewards/rejected": -4.258421897888184, "step": 8610 }, { "epoch": 0.8692802218580613, "grad_norm": 120.16075134277344, "learning_rate": 1.3088635676111725e-07, "logits/chosen": 1.375438928604126, "logits/rejected": NaN, "logps/chosen": -581.2388916015625, "logps/rejected": -359.941650390625, "loss": 0.5779, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2576782703399658, "rewards/margins": 2.5380666255950928, "rewards/rejected": -3.7957451343536377, "step": 8620 }, { "epoch": 0.8702886675910753, "grad_norm": 218.8374786376953, "learning_rate": 1.2987798729454472e-07, "logits/chosen": 1.3441765308380127, "logits/rejected": NaN, "logps/chosen": -635.183349609375, "logps/rejected": -478.590576171875, "loss": 0.4712, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3104984760284424, "rewards/margins": 3.247734546661377, "rewards/rejected": -4.55823278427124, "step": 8630 }, { "epoch": 0.8712971133240892, "grad_norm": 50.71980285644531, "learning_rate": 1.2886961782797217e-07, "logits/chosen": 1.3110581636428833, "logits/rejected": NaN, "logps/chosen": -522.8035278320312, "logps/rejected": -450.37469482421875, "loss": 0.6392, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6283988952636719, "rewards/margins": 2.4698870182037354, "rewards/rejected": -4.098286151885986, "step": 8640 }, { "epoch": 0.8723055590571033, "grad_norm": 60.54728317260742, "learning_rate": 1.278612483613996e-07, "logits/chosen": 1.5356099605560303, "logits/rejected": NaN, "logps/chosen": -513.87548828125, "logps/rejected": -295.8777770996094, "loss": 0.2412, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6385595798492432, "rewards/margins": 3.3138809204101562, "rewards/rejected": -4.95244026184082, "step": 8650 }, { "epoch": 0.8733140047901172, "grad_norm": 191.0852508544922, "learning_rate": 1.2685287889482706e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -540.8341064453125, "logps/rejected": -388.80950927734375, "loss": 0.4758, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5285649299621582, "rewards/margins": 2.558866500854492, "rewards/rejected": -4.087431907653809, "step": 8660 }, { "epoch": 0.8743224505231312, "grad_norm": 24.49887466430664, "learning_rate": 1.258445094282545e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -566.1243896484375, "logps/rejected": -397.8572998046875, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": -0.7597999572753906, "rewards/margins": 2.36128830909729, "rewards/rejected": -3.1210882663726807, "step": 8670 }, { "epoch": 0.8753308962561452, "grad_norm": 238.9630126953125, "learning_rate": 1.2483613996168195e-07, "logits/chosen": 1.4144121408462524, "logits/rejected": NaN, "logps/chosen": -483.82196044921875, "logps/rejected": -359.2357177734375, "loss": 0.5208, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.561306357383728, "rewards/margins": 2.3385512828826904, "rewards/rejected": -3.89985728263855, "step": 8680 }, { "epoch": 0.8763393419891592, "grad_norm": 89.159423828125, "learning_rate": 1.238277704951094e-07, "logits/chosen": 1.37838613986969, "logits/rejected": NaN, "logps/chosen": -524.5528564453125, "logps/rejected": -284.2425537109375, "loss": 0.4426, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7586714029312134, "rewards/margins": 2.7221996784210205, "rewards/rejected": -4.480871200561523, "step": 8690 }, { "epoch": 0.8773477877221733, "grad_norm": 168.784423828125, "learning_rate": 1.2281940102853687e-07, "logits/chosen": 1.3564541339874268, "logits/rejected": NaN, "logps/chosen": -545.0909423828125, "logps/rejected": -322.850830078125, "loss": 0.4447, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4250669479370117, "rewards/margins": 2.679561138153076, "rewards/rejected": -4.104628086090088, "step": 8700 }, { "epoch": 0.8783562334551872, "grad_norm": 263.19622802734375, "learning_rate": 1.218110315619643e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -525.6681518554688, "logps/rejected": -353.65887451171875, "loss": 0.7506, "rewards/accuracies": 0.5, "rewards/chosen": -2.459197759628296, "rewards/margins": 1.5193570852279663, "rewards/rejected": -3.978555202484131, "step": 8710 }, { "epoch": 0.8793646791882012, "grad_norm": 188.57505798339844, "learning_rate": 1.2080266209539176e-07, "logits/chosen": 1.6118524074554443, "logits/rejected": NaN, "logps/chosen": -642.9658203125, "logps/rejected": -415.99365234375, "loss": 0.3988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1756763458251953, "rewards/margins": 2.4078075885772705, "rewards/rejected": -3.583483934402466, "step": 8720 }, { "epoch": 0.8803731249212152, "grad_norm": 114.84732818603516, "learning_rate": 1.197942926288192e-07, "logits/chosen": 1.390998363494873, "logits/rejected": NaN, "logps/chosen": -420.97698974609375, "logps/rejected": -277.1137390136719, "loss": 0.483, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0634777545928955, "rewards/margins": 2.17257022857666, "rewards/rejected": -4.236047744750977, "step": 8730 }, { "epoch": 0.8813815706542292, "grad_norm": 107.58872985839844, "learning_rate": 1.1878592316224665e-07, "logits/chosen": 1.2566334009170532, "logits/rejected": NaN, "logps/chosen": -590.5297241210938, "logps/rejected": -336.845947265625, "loss": 0.3721, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0594079494476318, "rewards/margins": 2.9538016319274902, "rewards/rejected": -4.013209819793701, "step": 8740 }, { "epoch": 0.8823900163872431, "grad_norm": 176.91856384277344, "learning_rate": 1.1777755369567409e-07, "logits/chosen": 1.6921049356460571, "logits/rejected": NaN, "logps/chosen": -551.56103515625, "logps/rejected": -367.87042236328125, "loss": 0.5116, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4988815784454346, "rewards/margins": 2.2651233673095703, "rewards/rejected": -3.764004945755005, "step": 8750 }, { "epoch": 0.8833984621202572, "grad_norm": 82.8668212890625, "learning_rate": 1.1676918422910154e-07, "logits/chosen": 1.5597890615463257, "logits/rejected": NaN, "logps/chosen": -536.2977294921875, "logps/rejected": -314.06524658203125, "loss": 0.2831, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3834867477416992, "rewards/margins": 3.0234766006469727, "rewards/rejected": -4.406962871551514, "step": 8760 }, { "epoch": 0.8844069078532711, "grad_norm": 149.29696655273438, "learning_rate": 1.1576081476252898e-07, "logits/chosen": 1.454813838005066, "logits/rejected": NaN, "logps/chosen": -552.8738403320312, "logps/rejected": -402.768798828125, "loss": 0.5236, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6908366680145264, "rewards/margins": 2.359163761138916, "rewards/rejected": -4.050000190734863, "step": 8770 }, { "epoch": 0.8854153535862851, "grad_norm": 42.68349838256836, "learning_rate": 1.1475244529595643e-07, "logits/chosen": 1.4400161504745483, "logits/rejected": NaN, "logps/chosen": -609.2666625976562, "logps/rejected": -274.33245849609375, "loss": 0.2137, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4944143295288086, "rewards/margins": 3.560666561126709, "rewards/rejected": -4.055081367492676, "step": 8780 }, { "epoch": 0.8864237993192992, "grad_norm": 32.67169189453125, "learning_rate": 1.1374407582938387e-07, "logits/chosen": 1.7709732055664062, "logits/rejected": NaN, "logps/chosen": -672.9063720703125, "logps/rejected": -414.513916015625, "loss": 0.4147, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8768621683120728, "rewards/margins": 2.707180976867676, "rewards/rejected": -3.584043025970459, "step": 8790 }, { "epoch": 0.8874322450523131, "grad_norm": 137.8654327392578, "learning_rate": 1.1273570636281132e-07, "logits/chosen": 1.1986639499664307, "logits/rejected": NaN, "logps/chosen": -475.2950134277344, "logps/rejected": -330.8153381347656, "loss": 0.6284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.1064887046813965, "rewards/margins": 2.799989700317383, "rewards/rejected": -4.906478404998779, "step": 8800 }, { "epoch": 0.8884406907853272, "grad_norm": 2.009772777557373, "learning_rate": 1.1172733689623878e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -591.454345703125, "logps/rejected": -387.70037841796875, "loss": 0.4372, "rewards/accuracies": 0.75, "rewards/chosen": -1.4577832221984863, "rewards/margins": 2.4990744590759277, "rewards/rejected": -3.956857204437256, "step": 8810 }, { "epoch": 0.8894491365183411, "grad_norm": 4.005239009857178, "learning_rate": 1.1071896742966622e-07, "logits/chosen": 1.2421221733093262, "logits/rejected": NaN, "logps/chosen": -494.95843505859375, "logps/rejected": -432.4629821777344, "loss": 0.5857, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8187671899795532, "rewards/margins": 2.5255444049835205, "rewards/rejected": -4.344311714172363, "step": 8820 }, { "epoch": 0.8904575822513551, "grad_norm": 0.6940796971321106, "learning_rate": 1.0971059796309368e-07, "logits/chosen": 1.3259942531585693, "logits/rejected": NaN, "logps/chosen": -546.400390625, "logps/rejected": -382.6554260253906, "loss": 0.4101, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.418977975845337, "rewards/margins": 2.486889123916626, "rewards/rejected": -3.905867338180542, "step": 8830 }, { "epoch": 0.8914660279843691, "grad_norm": 12.031754493713379, "learning_rate": 1.0870222849652113e-07, "logits/chosen": 1.0948941707611084, "logits/rejected": 1.0722075700759888, "logps/chosen": -482.42926025390625, "logps/rejected": -486.935791015625, "loss": 0.4087, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5652921199798584, "rewards/margins": 3.0157346725463867, "rewards/rejected": -4.581027030944824, "step": 8840 }, { "epoch": 0.8924744737173831, "grad_norm": 43.81049346923828, "learning_rate": 1.0769385902994857e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -560.08935546875, "logps/rejected": -313.6084899902344, "loss": 0.5484, "rewards/accuracies": 0.75, "rewards/chosen": -2.0042006969451904, "rewards/margins": 2.020162582397461, "rewards/rejected": -4.0243635177612305, "step": 8850 }, { "epoch": 0.893482919450397, "grad_norm": 9.995402336120605, "learning_rate": 1.0668548956337602e-07, "logits/chosen": 1.5012110471725464, "logits/rejected": NaN, "logps/chosen": -588.5028686523438, "logps/rejected": -414.3243103027344, "loss": 0.8346, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -3.1675472259521484, "rewards/margins": 1.2568758726119995, "rewards/rejected": -4.4244232177734375, "step": 8860 }, { "epoch": 0.8944913651834111, "grad_norm": 9.869577407836914, "learning_rate": 1.0567712009680346e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -478.99835205078125, "logps/rejected": -313.8074645996094, "loss": 0.81, "rewards/accuracies": 0.625, "rewards/chosen": -2.219780683517456, "rewards/margins": 1.5886200666427612, "rewards/rejected": -3.8084006309509277, "step": 8870 }, { "epoch": 0.895499810916425, "grad_norm": 18.697174072265625, "learning_rate": 1.0466875063023091e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -549.2710571289062, "logps/rejected": -378.4424743652344, "loss": 0.4003, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7034868001937866, "rewards/margins": 2.47304105758667, "rewards/rejected": -4.176527976989746, "step": 8880 }, { "epoch": 0.896508256649439, "grad_norm": 175.4088134765625, "learning_rate": 1.0366038116365835e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -472.02374267578125, "logps/rejected": -369.50457763671875, "loss": 0.8713, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.32767653465271, "rewards/margins": 1.4237571954727173, "rewards/rejected": -3.751433849334717, "step": 8890 }, { "epoch": 0.8975167023824531, "grad_norm": 180.0683135986328, "learning_rate": 1.026520116970858e-07, "logits/chosen": 1.1481364965438843, "logits/rejected": NaN, "logps/chosen": -558.3416748046875, "logps/rejected": -390.7160339355469, "loss": 0.6755, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6273072957992554, "rewards/margins": 2.1272642612457275, "rewards/rejected": -3.7545719146728516, "step": 8900 }, { "epoch": 0.898525148115467, "grad_norm": 181.85903930664062, "learning_rate": 1.0164364223051325e-07, "logits/chosen": 1.4755562543869019, "logits/rejected": NaN, "logps/chosen": -564.26708984375, "logps/rejected": -429.63916015625, "loss": 0.5155, "rewards/accuracies": 0.625, "rewards/chosen": -1.017952799797058, "rewards/margins": 2.114309310913086, "rewards/rejected": -3.1322624683380127, "step": 8910 }, { "epoch": 0.8995335938484811, "grad_norm": 94.49153137207031, "learning_rate": 1.006352727639407e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -447.64178466796875, "logps/rejected": -386.3350524902344, "loss": 0.6511, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.9122902154922485, "rewards/margins": 1.9534841775894165, "rewards/rejected": -3.865774154663086, "step": 8920 }, { "epoch": 0.900542039581495, "grad_norm": 165.80703735351562, "learning_rate": 9.962690329736816e-08, "logits/chosen": 1.307084321975708, "logits/rejected": NaN, "logps/chosen": -574.6239013671875, "logps/rejected": -310.83831787109375, "loss": 0.3989, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8934838175773621, "rewards/margins": 2.6532645225524902, "rewards/rejected": -3.546748399734497, "step": 8930 }, { "epoch": 0.901550485314509, "grad_norm": 33.91143035888672, "learning_rate": 9.861853383079561e-08, "logits/chosen": 1.195193886756897, "logits/rejected": NaN, "logps/chosen": -554.8931884765625, "logps/rejected": -297.0273132324219, "loss": 0.3943, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4649077653884888, "rewards/margins": 2.9339027404785156, "rewards/rejected": -4.398810386657715, "step": 8940 }, { "epoch": 0.902558931047523, "grad_norm": 57.40180206298828, "learning_rate": 9.761016436422305e-08, "logits/chosen": 1.4107997417449951, "logits/rejected": 1.3695188760757446, "logps/chosen": -515.4632568359375, "logps/rejected": -315.887939453125, "loss": 0.5217, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9121745824813843, "rewards/margins": 3.0018343925476074, "rewards/rejected": -4.914009094238281, "step": 8950 }, { "epoch": 0.903567376780537, "grad_norm": 19.417890548706055, "learning_rate": 9.66017948976505e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -533.85498046875, "logps/rejected": -461.1097717285156, "loss": 0.5593, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5732746124267578, "rewards/margins": 2.663957118988037, "rewards/rejected": -4.237232208251953, "step": 8960 }, { "epoch": 0.904575822513551, "grad_norm": 14.467521667480469, "learning_rate": 9.559342543107794e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -539.0198974609375, "logps/rejected": -406.87921142578125, "loss": 0.3904, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3646247386932373, "rewards/margins": 2.426529884338379, "rewards/rejected": -3.791154384613037, "step": 8970 }, { "epoch": 0.905584268246565, "grad_norm": 63.98080062866211, "learning_rate": 9.458505596450539e-08, "logits/chosen": 1.3252742290496826, "logits/rejected": 1.1640480756759644, "logps/chosen": -429.81634521484375, "logps/rejected": -398.22222900390625, "loss": 0.4994, "rewards/accuracies": 0.75, "rewards/chosen": -1.6696672439575195, "rewards/margins": 1.8329483270645142, "rewards/rejected": -3.502615451812744, "step": 8980 }, { "epoch": 0.906592713979579, "grad_norm": 62.951499938964844, "learning_rate": 9.357668649793284e-08, "logits/chosen": 1.517341136932373, "logits/rejected": NaN, "logps/chosen": -537.2692260742188, "logps/rejected": -374.8719482421875, "loss": 0.45, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.462612509727478, "rewards/margins": 2.1922154426574707, "rewards/rejected": -3.654827833175659, "step": 8990 }, { "epoch": 0.9076011597125929, "grad_norm": 0.6337027549743652, "learning_rate": 9.256831703136028e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -555.7584838867188, "logps/rejected": -341.49676513671875, "loss": 0.4657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6370372772216797, "rewards/margins": 2.792696714401245, "rewards/rejected": -4.429734230041504, "step": 9000 }, { "epoch": 0.908609605445607, "grad_norm": 89.793701171875, "learning_rate": 9.155994756478773e-08, "logits/chosen": 1.4793472290039062, "logits/rejected": NaN, "logps/chosen": -567.7770385742188, "logps/rejected": -336.2908630371094, "loss": 0.4423, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6726789474487305, "rewards/margins": 2.453594446182251, "rewards/rejected": -4.126273155212402, "step": 9010 }, { "epoch": 0.9096180511786209, "grad_norm": 188.0178680419922, "learning_rate": 9.055157809821517e-08, "logits/chosen": NaN, "logits/rejected": 1.5546802282333374, "logps/chosen": -549.0169067382812, "logps/rejected": -457.88458251953125, "loss": 0.3759, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4873149394989014, "rewards/margins": 2.364131450653076, "rewards/rejected": -3.8514466285705566, "step": 9020 }, { "epoch": 0.910626496911635, "grad_norm": 152.02085876464844, "learning_rate": 8.954320863164263e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -672.3857421875, "logps/rejected": -372.69476318359375, "loss": 0.4882, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7940438389778137, "rewards/margins": 2.5120744705200195, "rewards/rejected": -3.3061184883117676, "step": 9030 }, { "epoch": 0.911634942644649, "grad_norm": 119.32661437988281, "learning_rate": 8.853483916507009e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -559.7059326171875, "logps/rejected": -500.46307373046875, "loss": 0.6396, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5761739015579224, "rewards/margins": 1.968753457069397, "rewards/rejected": -3.5449271202087402, "step": 9040 }, { "epoch": 0.9126433883776629, "grad_norm": 29.843032836914062, "learning_rate": 8.752646969849753e-08, "logits/chosen": 1.4583637714385986, "logits/rejected": NaN, "logps/chosen": -649.4757690429688, "logps/rejected": -319.21868896484375, "loss": 0.3801, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3751647472381592, "rewards/margins": 2.303933620452881, "rewards/rejected": -3.679098129272461, "step": 9050 }, { "epoch": 0.913651834110677, "grad_norm": 21.044513702392578, "learning_rate": 8.651810023192498e-08, "logits/chosen": 1.4908130168914795, "logits/rejected": 1.2535879611968994, "logps/chosen": -467.785888671875, "logps/rejected": -339.8295593261719, "loss": 0.4659, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.690281867980957, "rewards/margins": 3.052429437637329, "rewards/rejected": -4.742711067199707, "step": 9060 }, { "epoch": 0.9146602798436909, "grad_norm": 21.272811889648438, "learning_rate": 8.550973076535242e-08, "logits/chosen": 1.3637406826019287, "logits/rejected": NaN, "logps/chosen": -564.7250366210938, "logps/rejected": -328.559814453125, "loss": 0.3709, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.8344917297363281, "rewards/margins": 3.3245208263397217, "rewards/rejected": -5.159013271331787, "step": 9070 }, { "epoch": 0.9156687255767049, "grad_norm": 147.69654846191406, "learning_rate": 8.450136129877987e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -576.0225219726562, "logps/rejected": -426.3824157714844, "loss": 0.5065, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1904791593551636, "rewards/margins": 2.015995740890503, "rewards/rejected": -3.206474781036377, "step": 9080 }, { "epoch": 0.9166771713097189, "grad_norm": 0.8301156759262085, "learning_rate": 8.349299183220732e-08, "logits/chosen": 1.4222745895385742, "logits/rejected": NaN, "logps/chosen": -556.2261352539062, "logps/rejected": -320.4910888671875, "loss": 0.4761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4689304828643799, "rewards/margins": 2.9239444732666016, "rewards/rejected": -4.392874717712402, "step": 9090 }, { "epoch": 0.9176856170427329, "grad_norm": 221.32821655273438, "learning_rate": 8.248462236563476e-08, "logits/chosen": 1.566599726676941, "logits/rejected": NaN, "logps/chosen": -661.3073120117188, "logps/rejected": -322.56591796875, "loss": 0.3496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2340878248214722, "rewards/margins": 2.8977625370025635, "rewards/rejected": -4.131850242614746, "step": 9100 }, { "epoch": 0.9186940627757468, "grad_norm": 9.800461769104004, "learning_rate": 8.14762528990622e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -548.1616821289062, "logps/rejected": -380.61083984375, "loss": 0.5087, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7605657577514648, "rewards/margins": 2.537370204925537, "rewards/rejected": -4.297935485839844, "step": 9110 }, { "epoch": 0.9197025085087609, "grad_norm": 94.53682708740234, "learning_rate": 8.046788343248965e-08, "logits/chosen": 1.596928596496582, "logits/rejected": NaN, "logps/chosen": -569.3401489257812, "logps/rejected": -338.50482177734375, "loss": 0.5995, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.577665090560913, "rewards/margins": 1.935767412185669, "rewards/rejected": -3.513432741165161, "step": 9120 }, { "epoch": 0.9207109542417748, "grad_norm": 8.400388717651367, "learning_rate": 7.94595139659171e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -434.49462890625, "logps/rejected": -396.23114013671875, "loss": 0.657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.952082633972168, "rewards/margins": 2.154625415802002, "rewards/rejected": -4.106708526611328, "step": 9130 }, { "epoch": 0.9217193999747889, "grad_norm": 84.8908920288086, "learning_rate": 7.845114449934457e-08, "logits/chosen": 1.3952075242996216, "logits/rejected": NaN, "logps/chosen": -587.2257690429688, "logps/rejected": -292.682373046875, "loss": 0.3463, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.837665855884552, "rewards/margins": 3.2057392597198486, "rewards/rejected": -4.043405055999756, "step": 9140 }, { "epoch": 0.9227278457078029, "grad_norm": 197.246826171875, "learning_rate": 7.744277503277201e-08, "logits/chosen": 1.2749310731887817, "logits/rejected": NaN, "logps/chosen": -650.3246459960938, "logps/rejected": -309.12054443359375, "loss": 0.4739, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.05872642993927, "rewards/margins": 2.6391122341156006, "rewards/rejected": -3.6978390216827393, "step": 9150 }, { "epoch": 0.9237362914408168, "grad_norm": 139.67002868652344, "learning_rate": 7.643440556619946e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -614.3273315429688, "logps/rejected": -366.16162109375, "loss": 0.3359, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6997923851013184, "rewards/margins": 2.797621965408325, "rewards/rejected": -3.4974143505096436, "step": 9160 }, { "epoch": 0.9247447371738309, "grad_norm": 36.750972747802734, "learning_rate": 7.54260360996269e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -542.0679931640625, "logps/rejected": -355.3759460449219, "loss": 0.3894, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4902878999710083, "rewards/margins": 2.7766804695129395, "rewards/rejected": -4.2669677734375, "step": 9170 }, { "epoch": 0.9257531829068448, "grad_norm": 299.01025390625, "learning_rate": 7.441766663305435e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -499.16485595703125, "logps/rejected": -417.24365234375, "loss": 0.9236, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.9379615783691406, "rewards/margins": 1.335378885269165, "rewards/rejected": -3.2733397483825684, "step": 9180 }, { "epoch": 0.9267616286398588, "grad_norm": 0.2546979486942291, "learning_rate": 7.34092971664818e-08, "logits/chosen": 1.4956692457199097, "logits/rejected": NaN, "logps/chosen": -517.9393310546875, "logps/rejected": -414.1587829589844, "loss": 0.4593, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3447647094726562, "rewards/margins": 2.3910529613494873, "rewards/rejected": -3.7358176708221436, "step": 9190 }, { "epoch": 0.9277700743728728, "grad_norm": 137.61404418945312, "learning_rate": 7.240092769990924e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -488.5502014160156, "logps/rejected": -433.44024658203125, "loss": 0.7587, "rewards/accuracies": 0.75, "rewards/chosen": -1.4920413494110107, "rewards/margins": 1.8213481903076172, "rewards/rejected": -3.313389539718628, "step": 9200 }, { "epoch": 0.9287785201058868, "grad_norm": 14.035024642944336, "learning_rate": 7.139255823333669e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -627.0167846679688, "logps/rejected": -369.670654296875, "loss": 0.2408, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9718162417411804, "rewards/margins": 3.3854422569274902, "rewards/rejected": -4.3572587966918945, "step": 9210 }, { "epoch": 0.9297869658389007, "grad_norm": 1.6017898321151733, "learning_rate": 7.038418876676413e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -647.627197265625, "logps/rejected": -351.80450439453125, "loss": 0.3416, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.066913366317749, "rewards/margins": 2.924858570098877, "rewards/rejected": -3.991772174835205, "step": 9220 }, { "epoch": 0.9307954115719148, "grad_norm": 6.873948097229004, "learning_rate": 6.937581930019158e-08, "logits/chosen": 1.3104326725006104, "logits/rejected": NaN, "logps/chosen": -547.0677490234375, "logps/rejected": -354.56829833984375, "loss": 0.5005, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2099125385284424, "rewards/margins": 2.674123764038086, "rewards/rejected": -3.88403582572937, "step": 9230 }, { "epoch": 0.9318038573049288, "grad_norm": 101.31409454345703, "learning_rate": 6.836744983361902e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -472.23638916015625, "logps/rejected": -440.79522705078125, "loss": 0.3783, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9329389333724976, "rewards/margins": 3.523350238800049, "rewards/rejected": -4.456289291381836, "step": 9240 }, { "epoch": 0.9328123030379428, "grad_norm": 49.19733810424805, "learning_rate": 6.73590803670465e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -422.0565490722656, "logps/rejected": -423.60302734375, "loss": 0.2506, "rewards/accuracies": 0.875, "rewards/chosen": -1.5237648487091064, "rewards/margins": 3.03407621383667, "rewards/rejected": -4.5578413009643555, "step": 9250 }, { "epoch": 0.9338207487709568, "grad_norm": 105.39127349853516, "learning_rate": 6.635071090047394e-08, "logits/chosen": 1.2934852838516235, "logits/rejected": 1.2278354167938232, "logps/chosen": -516.948974609375, "logps/rejected": -454.3348693847656, "loss": 0.5097, "rewards/accuracies": 0.75, "rewards/chosen": -2.2148590087890625, "rewards/margins": 2.668937921524048, "rewards/rejected": -4.883796691894531, "step": 9260 }, { "epoch": 0.9348291945039707, "grad_norm": 49.05742263793945, "learning_rate": 6.534234143390139e-08, "logits/chosen": 1.350793719291687, "logits/rejected": NaN, "logps/chosen": -487.6669921875, "logps/rejected": -426.49066162109375, "loss": 0.9479, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.3802263736724854, "rewards/margins": 1.4694944620132446, "rewards/rejected": -3.8497207164764404, "step": 9270 }, { "epoch": 0.9358376402369848, "grad_norm": 98.7350845336914, "learning_rate": 6.433397196732883e-08, "logits/chosen": 1.311647891998291, "logits/rejected": NaN, "logps/chosen": -488.65850830078125, "logps/rejected": -358.48651123046875, "loss": 0.2952, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2914804220199585, "rewards/margins": 3.281757354736328, "rewards/rejected": -4.573237895965576, "step": 9280 }, { "epoch": 0.9368460859699987, "grad_norm": 254.75282287597656, "learning_rate": 6.332560250075628e-08, "logits/chosen": 1.3596370220184326, "logits/rejected": NaN, "logps/chosen": -702.1997680664062, "logps/rejected": -404.9862976074219, "loss": 0.3908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.138912320137024, "rewards/margins": 2.3611905574798584, "rewards/rejected": -3.500102996826172, "step": 9290 }, { "epoch": 0.9378545317030127, "grad_norm": 98.32270050048828, "learning_rate": 6.231723303418372e-08, "logits/chosen": 1.3338205814361572, "logits/rejected": NaN, "logps/chosen": -513.7178344726562, "logps/rejected": -269.7005615234375, "loss": 0.4263, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6024541854858398, "rewards/margins": 2.287015914916992, "rewards/rejected": -3.889469861984253, "step": 9300 }, { "epoch": 0.9388629774360268, "grad_norm": 147.4320831298828, "learning_rate": 6.130886356761117e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -520.8784790039062, "logps/rejected": -307.2264709472656, "loss": 0.7687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5286967754364014, "rewards/margins": 1.9055455923080444, "rewards/rejected": -3.4342427253723145, "step": 9310 }, { "epoch": 0.9398714231690407, "grad_norm": 31.534530639648438, "learning_rate": 6.030049410103861e-08, "logits/chosen": 1.2434722185134888, "logits/rejected": NaN, "logps/chosen": -511.72381591796875, "logps/rejected": -402.0461120605469, "loss": 0.58, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.53402578830719, "rewards/margins": 1.8919090032577515, "rewards/rejected": -3.4259345531463623, "step": 9320 }, { "epoch": 0.9408798689020547, "grad_norm": 56.984352111816406, "learning_rate": 5.929212463446607e-08, "logits/chosen": 1.4254885911941528, "logits/rejected": NaN, "logps/chosen": -696.8053588867188, "logps/rejected": -396.79547119140625, "loss": 0.5738, "rewards/accuracies": 0.75, "rewards/chosen": -0.9667094945907593, "rewards/margins": 2.212472438812256, "rewards/rejected": -3.1791820526123047, "step": 9330 }, { "epoch": 0.9418883146350687, "grad_norm": 83.13147735595703, "learning_rate": 5.8283755167893516e-08, "logits/chosen": 1.4278464317321777, "logits/rejected": NaN, "logps/chosen": -515.931884765625, "logps/rejected": -351.0133972167969, "loss": 0.4859, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1597071886062622, "rewards/margins": 2.3692822456359863, "rewards/rejected": -3.528989791870117, "step": 9340 }, { "epoch": 0.9428967603680827, "grad_norm": 155.033203125, "learning_rate": 5.727538570132096e-08, "logits/chosen": 1.2524205446243286, "logits/rejected": NaN, "logps/chosen": -547.89111328125, "logps/rejected": -351.72393798828125, "loss": 0.5157, "rewards/accuracies": 0.625, "rewards/chosen": -1.3617627620697021, "rewards/margins": 2.1228790283203125, "rewards/rejected": -3.4846420288085938, "step": 9350 }, { "epoch": 0.9439052061010967, "grad_norm": 143.2166290283203, "learning_rate": 5.626701623474841e-08, "logits/chosen": 1.609174370765686, "logits/rejected": NaN, "logps/chosen": -658.8812866210938, "logps/rejected": -442.3304748535156, "loss": 0.439, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9612255096435547, "rewards/margins": 2.4610416889190674, "rewards/rejected": -3.422267198562622, "step": 9360 }, { "epoch": 0.9449136518341107, "grad_norm": 253.04173278808594, "learning_rate": 5.525864676817585e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -575.1897583007812, "logps/rejected": -464.62701416015625, "loss": 0.4254, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8200836181640625, "rewards/margins": 1.9017713069915771, "rewards/rejected": -3.7218546867370605, "step": 9370 }, { "epoch": 0.9459220975671246, "grad_norm": 96.96712493896484, "learning_rate": 5.4250277301603304e-08, "logits/chosen": 1.097786545753479, "logits/rejected": NaN, "logps/chosen": -481.8631896972656, "logps/rejected": -369.58209228515625, "loss": 0.5536, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4211463928222656, "rewards/margins": 2.462031126022339, "rewards/rejected": -4.883177280426025, "step": 9380 }, { "epoch": 0.9469305433001387, "grad_norm": 102.40232849121094, "learning_rate": 5.3241907835030756e-08, "logits/chosen": 1.440932035446167, "logits/rejected": NaN, "logps/chosen": -574.1976318359375, "logps/rejected": -340.77166748046875, "loss": 0.2918, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5068124532699585, "rewards/margins": 3.1700193881988525, "rewards/rejected": -4.67683219909668, "step": 9390 }, { "epoch": 0.9479389890331527, "grad_norm": 51.46451187133789, "learning_rate": 5.22335383684582e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -660.3555297851562, "logps/rejected": -338.3406982421875, "loss": 0.3389, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7450598478317261, "rewards/margins": 2.5658066272735596, "rewards/rejected": -3.310866594314575, "step": 9400 }, { "epoch": 0.9489474347661666, "grad_norm": 70.72069549560547, "learning_rate": 5.122516890188565e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -454.9549865722656, "logps/rejected": -460.2415466308594, "loss": 0.6043, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.32573401927948, "rewards/margins": 2.5754895210266113, "rewards/rejected": -3.9012234210968018, "step": 9410 }, { "epoch": 0.9499558804991807, "grad_norm": 59.71761703491211, "learning_rate": 5.021679943531309e-08, "logits/chosen": 1.0795526504516602, "logits/rejected": NaN, "logps/chosen": -596.1949462890625, "logps/rejected": -454.98681640625, "loss": 0.3969, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1529386043548584, "rewards/margins": 3.116344451904297, "rewards/rejected": -4.269283294677734, "step": 9420 }, { "epoch": 0.9509643262321946, "grad_norm": 169.21466064453125, "learning_rate": 4.9208429968740545e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -502.062744140625, "logps/rejected": -445.260009765625, "loss": 0.7228, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6789785623550415, "rewards/margins": 1.8386170864105225, "rewards/rejected": -3.5175957679748535, "step": 9430 }, { "epoch": 0.9519727719652086, "grad_norm": 87.36554718017578, "learning_rate": 4.8200060502167997e-08, "logits/chosen": 1.6954683065414429, "logits/rejected": NaN, "logps/chosen": -643.3045654296875, "logps/rejected": -342.4455871582031, "loss": 0.3653, "rewards/accuracies": 0.875, "rewards/chosen": -0.9786726832389832, "rewards/margins": 2.970777750015259, "rewards/rejected": -3.9494502544403076, "step": 9440 }, { "epoch": 0.9529812176982226, "grad_norm": 158.3344268798828, "learning_rate": 4.719169103559544e-08, "logits/chosen": 1.3730494976043701, "logits/rejected": NaN, "logps/chosen": -568.244140625, "logps/rejected": -287.03314208984375, "loss": 0.448, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9873912334442139, "rewards/margins": 2.1887426376342773, "rewards/rejected": -4.176133632659912, "step": 9450 }, { "epoch": 0.9539896634312366, "grad_norm": 47.528594970703125, "learning_rate": 4.618332156902289e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -570.3289794921875, "logps/rejected": -364.846435546875, "loss": 0.4871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3318191766738892, "rewards/margins": 2.607056140899658, "rewards/rejected": -3.938875198364258, "step": 9460 }, { "epoch": 0.9549981091642507, "grad_norm": 173.63978576660156, "learning_rate": 4.517495210245033e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -555.6517333984375, "logps/rejected": -474.48272705078125, "loss": 0.7347, "rewards/accuracies": 0.625, "rewards/chosen": -1.8298985958099365, "rewards/margins": 1.7643358707427979, "rewards/rejected": -3.5942344665527344, "step": 9470 }, { "epoch": 0.9560065548972646, "grad_norm": 72.12725067138672, "learning_rate": 4.4166582635877785e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -461.42474365234375, "logps/rejected": -321.6534729003906, "loss": 0.2517, "rewards/accuracies": 0.875, "rewards/chosen": -1.202330231666565, "rewards/margins": 3.088892698287964, "rewards/rejected": -4.29122257232666, "step": 9480 }, { "epoch": 0.9570150006302786, "grad_norm": 37.82660675048828, "learning_rate": 4.315821316930523e-08, "logits/chosen": 1.1542412042617798, "logits/rejected": NaN, "logps/chosen": -601.80712890625, "logps/rejected": -359.8675842285156, "loss": 0.3086, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.376191258430481, "rewards/margins": 3.1437602043151855, "rewards/rejected": -4.519951820373535, "step": 9490 }, { "epoch": 0.9580234463632926, "grad_norm": 42.01772689819336, "learning_rate": 4.214984370273268e-08, "logits/chosen": 1.3851813077926636, "logits/rejected": NaN, "logps/chosen": -606.8939208984375, "logps/rejected": -320.97747802734375, "loss": 0.4157, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7216705083847046, "rewards/margins": 2.8276658058166504, "rewards/rejected": -4.5493364334106445, "step": 9500 }, { "epoch": 0.9590318920963066, "grad_norm": 1.8783015012741089, "learning_rate": 4.114147423616013e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -519.8948974609375, "logps/rejected": -331.6911926269531, "loss": 0.4659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.673006296157837, "rewards/margins": 2.4531986713409424, "rewards/rejected": -4.126204490661621, "step": 9510 }, { "epoch": 0.9600403378293205, "grad_norm": 179.98672485351562, "learning_rate": 4.013310476958757e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -537.9671630859375, "logps/rejected": -435.52569580078125, "loss": 0.3754, "rewards/accuracies": 0.75, "rewards/chosen": -1.423946499824524, "rewards/margins": 2.9711508750915527, "rewards/rejected": -4.395097255706787, "step": 9520 }, { "epoch": 0.9610487835623346, "grad_norm": 163.6973419189453, "learning_rate": 3.9124735303015025e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -465.26788330078125, "logps/rejected": -398.8098449707031, "loss": 0.4769, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8771053552627563, "rewards/margins": 1.9463990926742554, "rewards/rejected": -3.8235042095184326, "step": 9530 }, { "epoch": 0.9620572292953485, "grad_norm": 45.099483489990234, "learning_rate": 3.811636583644247e-08, "logits/chosen": 1.2462549209594727, "logits/rejected": NaN, "logps/chosen": -571.02392578125, "logps/rejected": -310.81219482421875, "loss": 0.441, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.74118971824646, "rewards/margins": 2.8847556114196777, "rewards/rejected": -4.625945091247559, "step": 9540 }, { "epoch": 0.9630656750283625, "grad_norm": 71.9881591796875, "learning_rate": 3.710799636986992e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -550.244140625, "logps/rejected": -395.53594970703125, "loss": 0.5325, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6058218479156494, "rewards/margins": 1.6595308780670166, "rewards/rejected": -3.265352725982666, "step": 9550 }, { "epoch": 0.9640741207613766, "grad_norm": 109.00980377197266, "learning_rate": 3.609962690329737e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -560.4908447265625, "logps/rejected": -400.94512939453125, "loss": 0.3425, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3154301643371582, "rewards/margins": 2.972480535507202, "rewards/rejected": -4.2879109382629395, "step": 9560 }, { "epoch": 0.9650825664943905, "grad_norm": 5.173654079437256, "learning_rate": 3.509125743672481e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -499.8777770996094, "logps/rejected": -354.48260498046875, "loss": 0.4539, "rewards/accuracies": 0.75, "rewards/chosen": -1.6715742349624634, "rewards/margins": 2.6732513904571533, "rewards/rejected": -4.344825744628906, "step": 9570 }, { "epoch": 0.9660910122274046, "grad_norm": 132.50454711914062, "learning_rate": 3.408288797015226e-08, "logits/chosen": 1.2860350608825684, "logits/rejected": 1.2981116771697998, "logps/chosen": -501.554443359375, "logps/rejected": -315.6057434082031, "loss": 0.4549, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5468028783798218, "rewards/margins": 2.4190561771392822, "rewards/rejected": -3.9658591747283936, "step": 9580 }, { "epoch": 0.9670994579604185, "grad_norm": 81.65623474121094, "learning_rate": 3.307451850357971e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -572.5663452148438, "logps/rejected": -403.93365478515625, "loss": 0.4868, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4932186603546143, "rewards/margins": 2.049819231033325, "rewards/rejected": -3.5430374145507812, "step": 9590 }, { "epoch": 0.9681079036934325, "grad_norm": 166.60043334960938, "learning_rate": 3.2066149037007156e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -552.8825073242188, "logps/rejected": -543.188232421875, "loss": 0.6899, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.844140648841858, "rewards/margins": 1.8505548238754272, "rewards/rejected": -3.6946959495544434, "step": 9600 }, { "epoch": 0.9691163494264465, "grad_norm": 82.18245697021484, "learning_rate": 3.105777957043461e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -626.7330322265625, "logps/rejected": -391.215576171875, "loss": 0.4885, "rewards/accuracies": 0.75, "rewards/chosen": -1.1417266130447388, "rewards/margins": 2.1733994483947754, "rewards/rejected": -3.3151259422302246, "step": 9610 }, { "epoch": 0.9701247951594605, "grad_norm": 245.6163787841797, "learning_rate": 3.0049410103862053e-08, "logits/chosen": 1.5011723041534424, "logits/rejected": NaN, "logps/chosen": -579.9866333007812, "logps/rejected": -348.2902526855469, "loss": 0.8227, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.174417495727539, "rewards/margins": 1.3842902183532715, "rewards/rejected": -3.5587074756622314, "step": 9620 }, { "epoch": 0.9711332408924744, "grad_norm": 170.7389678955078, "learning_rate": 2.9041040637289502e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -601.441162109375, "logps/rejected": -431.8787536621094, "loss": 0.6675, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4509103298187256, "rewards/margins": 1.961669921875, "rewards/rejected": -3.4125800132751465, "step": 9630 }, { "epoch": 0.9721416866254885, "grad_norm": 279.5130310058594, "learning_rate": 2.8032671170716947e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -565.6126708984375, "logps/rejected": -576.6492919921875, "loss": 0.5968, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.174140453338623, "rewards/margins": 2.0522522926330566, "rewards/rejected": -4.226393222808838, "step": 9640 }, { "epoch": 0.9731501323585025, "grad_norm": 22.778839111328125, "learning_rate": 2.70243017041444e-08, "logits/chosen": 1.1580431461334229, "logits/rejected": NaN, "logps/chosen": -495.92291259765625, "logps/rejected": -413.457275390625, "loss": 0.3915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.723453164100647, "rewards/margins": 2.914457082748413, "rewards/rejected": -4.637909889221191, "step": 9650 }, { "epoch": 0.9741585780915164, "grad_norm": 13.211109161376953, "learning_rate": 2.6015932237571845e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -637.6839599609375, "logps/rejected": -473.6417541503906, "loss": 0.3467, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.5298431515693665, "rewards/margins": 2.72489857673645, "rewards/rejected": -3.25474214553833, "step": 9660 }, { "epoch": 0.9751670238245305, "grad_norm": 145.5353546142578, "learning_rate": 2.500756277099929e-08, "logits/chosen": 1.4381129741668701, "logits/rejected": NaN, "logps/chosen": -594.4530029296875, "logps/rejected": -469.20098876953125, "loss": 0.5847, "rewards/accuracies": 0.75, "rewards/chosen": -1.2626738548278809, "rewards/margins": 3.4925262928009033, "rewards/rejected": -4.755200386047363, "step": 9670 }, { "epoch": 0.9761754695575444, "grad_norm": 18.425416946411133, "learning_rate": 2.3999193304426742e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -580.8128662109375, "logps/rejected": -448.91046142578125, "loss": 0.5748, "rewards/accuracies": 0.75, "rewards/chosen": -1.5873377323150635, "rewards/margins": 2.843841075897217, "rewards/rejected": -4.431178092956543, "step": 9680 }, { "epoch": 0.9771839152905585, "grad_norm": 127.254638671875, "learning_rate": 2.2990823837854188e-08, "logits/chosen": 1.490780234336853, "logits/rejected": NaN, "logps/chosen": -486.8974609375, "logps/rejected": -341.58343505859375, "loss": 0.3413, "rewards/accuracies": 0.875, "rewards/chosen": -1.556675672531128, "rewards/margins": 3.2373242378234863, "rewards/rejected": -4.793999671936035, "step": 9690 }, { "epoch": 0.9781923610235724, "grad_norm": 37.526947021484375, "learning_rate": 2.1982454371281636e-08, "logits/chosen": 1.342883825302124, "logits/rejected": NaN, "logps/chosen": -598.4722900390625, "logps/rejected": -428.3997497558594, "loss": 0.1668, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.5429970026016235, "rewards/margins": 4.550286769866943, "rewards/rejected": -5.093283653259277, "step": 9700 }, { "epoch": 0.9792008067565864, "grad_norm": 231.89108276367188, "learning_rate": 2.0974084904709085e-08, "logits/chosen": 1.5822843313217163, "logits/rejected": NaN, "logps/chosen": -693.489501953125, "logps/rejected": -318.67681884765625, "loss": 0.386, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6233460903167725, "rewards/margins": 2.776402473449707, "rewards/rejected": -4.399748802185059, "step": 9710 }, { "epoch": 0.9802092524896004, "grad_norm": 10.840592384338379, "learning_rate": 1.996571543813653e-08, "logits/chosen": 1.3159451484680176, "logits/rejected": NaN, "logps/chosen": -519.5774536132812, "logps/rejected": -282.61138916015625, "loss": 0.2179, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.277756690979004, "rewards/margins": 3.368131637573242, "rewards/rejected": -4.645888328552246, "step": 9720 }, { "epoch": 0.9812176982226144, "grad_norm": 140.56381225585938, "learning_rate": 1.8957345971563982e-08, "logits/chosen": 1.4060128927230835, "logits/rejected": NaN, "logps/chosen": -575.5593872070312, "logps/rejected": -340.30047607421875, "loss": 0.5248, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9507768750190735, "rewards/margins": 2.7250285148620605, "rewards/rejected": -3.6758055686950684, "step": 9730 }, { "epoch": 0.9822261439556284, "grad_norm": 42.33375930786133, "learning_rate": 1.7948976504991428e-08, "logits/chosen": NaN, "logits/rejected": 1.2268702983856201, "logps/chosen": -415.6500549316406, "logps/rejected": -397.98150634765625, "loss": 0.5575, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.126239538192749, "rewards/margins": 1.7755523920059204, "rewards/rejected": -3.90179181098938, "step": 9740 }, { "epoch": 0.9832345896886424, "grad_norm": 157.8643798828125, "learning_rate": 1.6940607038418876e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -485.4161071777344, "logps/rejected": -446.3720703125, "loss": 0.7111, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.598641872406006, "rewards/margins": 1.6341133117675781, "rewards/rejected": -4.232755661010742, "step": 9750 }, { "epoch": 0.9842430354216564, "grad_norm": 3.772545337677002, "learning_rate": 1.5932237571846325e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -483.40594482421875, "logps/rejected": -347.0514221191406, "loss": 0.4155, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7884260416030884, "rewards/margins": 2.480278730392456, "rewards/rejected": -4.268704891204834, "step": 9760 }, { "epoch": 0.9852514811546703, "grad_norm": 106.87601470947266, "learning_rate": 1.492386810527377e-08, "logits/chosen": 1.07418954372406, "logits/rejected": NaN, "logps/chosen": -557.2086181640625, "logps/rejected": -281.61260986328125, "loss": 0.587, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4391064643859863, "rewards/margins": 2.486748456954956, "rewards/rejected": -3.9258549213409424, "step": 9770 }, { "epoch": 0.9862599268876844, "grad_norm": 60.415958404541016, "learning_rate": 1.391549863870122e-08, "logits/chosen": 1.412265419960022, "logits/rejected": NaN, "logps/chosen": -550.6650390625, "logps/rejected": -355.59747314453125, "loss": 0.3923, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4035052061080933, "rewards/margins": 3.155042886734009, "rewards/rejected": -4.5585479736328125, "step": 9780 }, { "epoch": 0.9872683726206983, "grad_norm": 40.74187469482422, "learning_rate": 1.2907129172128668e-08, "logits/chosen": NaN, "logits/rejected": 1.4696996212005615, "logps/chosen": -463.9778747558594, "logps/rejected": -398.99224853515625, "loss": 0.7071, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7729146480560303, "rewards/margins": 2.3817389011383057, "rewards/rejected": -4.154653072357178, "step": 9790 }, { "epoch": 0.9882768183537124, "grad_norm": 111.10861206054688, "learning_rate": 1.1898759705556115e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -592.4940185546875, "logps/rejected": -527.9998168945312, "loss": 0.4823, "rewards/accuracies": 0.75, "rewards/chosen": -1.5539222955703735, "rewards/margins": 2.992663621902466, "rewards/rejected": -4.546585559844971, "step": 9800 }, { "epoch": 0.9892852640867263, "grad_norm": 104.14710235595703, "learning_rate": 1.0890390238983564e-08, "logits/chosen": 1.4900633096694946, "logits/rejected": 1.2835993766784668, "logps/chosen": -546.0692138671875, "logps/rejected": -451.728759765625, "loss": 0.5692, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8466380834579468, "rewards/margins": 2.4432857036590576, "rewards/rejected": -4.289923667907715, "step": 9810 }, { "epoch": 0.9902937098197403, "grad_norm": 10.7011079788208, "learning_rate": 9.88202077241101e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -575.5400390625, "logps/rejected": -322.0439147949219, "loss": 0.6372, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.7357696294784546, "rewards/margins": 2.520719051361084, "rewards/rejected": -3.256488800048828, "step": 9820 }, { "epoch": 0.9913021555527544, "grad_norm": 42.406986236572266, "learning_rate": 8.873651305838458e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -625.5560302734375, "logps/rejected": -417.21685791015625, "loss": 0.2795, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.9510596394538879, "rewards/margins": 3.1779727935791016, "rewards/rejected": -4.129032135009766, "step": 9830 }, { "epoch": 0.9923106012857683, "grad_norm": 70.81024169921875, "learning_rate": 7.865281839265906e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -560.6497802734375, "logps/rejected": -457.9280700683594, "loss": 0.7175, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.0936636924743652, "rewards/margins": 2.270786762237549, "rewards/rejected": -4.364450454711914, "step": 9840 }, { "epoch": 0.9933190470187823, "grad_norm": 29.3454647064209, "learning_rate": 6.856912372693355e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -504.94207763671875, "logps/rejected": -426.76898193359375, "loss": 0.4731, "rewards/accuracies": 0.75, "rewards/chosen": -1.2967321872711182, "rewards/margins": 2.9306976795196533, "rewards/rejected": -4.2274298667907715, "step": 9850 }, { "epoch": 0.9943274927517963, "grad_norm": 35.332862854003906, "learning_rate": 5.848542906120802e-09, "logits/chosen": 1.480236291885376, "logits/rejected": NaN, "logps/chosen": -646.1192626953125, "logps/rejected": -342.320068359375, "loss": 0.2451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3374769687652588, "rewards/margins": 3.7136483192443848, "rewards/rejected": -5.051125526428223, "step": 9860 }, { "epoch": 0.9953359384848103, "grad_norm": 3.738142967224121, "learning_rate": 4.840173439548251e-09, "logits/chosen": 1.607187271118164, "logits/rejected": NaN, "logps/chosen": -598.8643798828125, "logps/rejected": -329.76300048828125, "loss": 0.2793, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2444044351577759, "rewards/margins": 3.778430938720703, "rewards/rejected": -5.022835731506348, "step": 9870 }, { "epoch": 0.9963443842178242, "grad_norm": 26.042478561401367, "learning_rate": 3.831803972975698e-09, "logits/chosen": 1.4589027166366577, "logits/rejected": NaN, "logps/chosen": -628.6232299804688, "logps/rejected": -429.97296142578125, "loss": 0.3601, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5208218097686768, "rewards/margins": 2.972923994064331, "rewards/rejected": -4.493745803833008, "step": 9880 }, { "epoch": 0.9973528299508383, "grad_norm": 24.507606506347656, "learning_rate": 2.823434506403146e-09, "logits/chosen": 1.1324280500411987, "logits/rejected": NaN, "logps/chosen": -544.1085205078125, "logps/rejected": -327.521240234375, "loss": 0.4201, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6379578113555908, "rewards/margins": 2.1790614128112793, "rewards/rejected": -3.81701922416687, "step": 9890 }, { "epoch": 0.9983612756838522, "grad_norm": 205.07461547851562, "learning_rate": 1.8150650398305939e-09, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -588.0301513671875, "logps/rejected": -388.85479736328125, "loss": 0.4954, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6153032779693604, "rewards/margins": 2.8501744270324707, "rewards/rejected": -4.46547794342041, "step": 9900 }, { "epoch": 0.9993697214168663, "grad_norm": 322.6021728515625, "learning_rate": 8.066955732580417e-10, "logits/chosen": 1.2009689807891846, "logits/rejected": NaN, "logps/chosen": -446.2076110839844, "logps/rejected": -297.8315734863281, "loss": 0.8922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.7542126178741455, "rewards/margins": 1.6736247539520264, "rewards/rejected": -4.427837371826172, "step": 9910 } ], "logging_steps": 10, "max_steps": 9917, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }