diff --git "a/dpo_output/10k_students_10k_stack/checkpoint/checkpoint-17000/trainer_state.json" "b/dpo_output/10k_students_10k_stack/checkpoint/checkpoint-17000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/dpo_output/10k_students_10k_stack/checkpoint/checkpoint-17000/trainer_state.json" @@ -0,0 +1,2855 @@ +{ + "best_metric": 0.8348324298858643, + "best_model_checkpoint": "./output/dpo_output/10k_students_10k_stack/checkpoint/checkpoint-1000", + "epoch": 0.9444444444444444, + "eval_steps": 1000, + "global_step": 17000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.005555555555555556, + "grad_norm": 21.485193252563477, + "learning_rate": 4.973055555555556e-05, + "logits/chosen": -26.227602005004883, + "logits/rejected": -24.66718101501465, + "logps/chosen": -325.9598693847656, + "logps/rejected": -259.147216796875, + "loss": 0.6589, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -0.35466018319129944, + "rewards/margins": 0.1602381318807602, + "rewards/rejected": -0.5148983001708984, + "step": 100 + }, + { + "epoch": 0.011111111111111112, + "grad_norm": 34.235862731933594, + "learning_rate": 4.9452777777777784e-05, + "logits/chosen": -25.238574981689453, + "logits/rejected": -23.293319702148438, + "logps/chosen": -345.8126525878906, + "logps/rejected": -315.44549560546875, + "loss": 0.7702, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.4225605726242065, + "rewards/margins": 0.2760031223297119, + "rewards/rejected": -1.6985636949539185, + "step": 200 + }, + { + "epoch": 0.016666666666666666, + "grad_norm": 157.2738037109375, + "learning_rate": 4.917777777777778e-05, + "logits/chosen": -26.076765060424805, + "logits/rejected": -23.33395767211914, + "logps/chosen": -435.3138732910156, + "logps/rejected": -355.1646423339844, + "loss": 0.9623, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.961271286010742, + "rewards/margins": 0.42585060000419617, + "rewards/rejected": -3.3871219158172607, + "step": 300 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 18.932538986206055, + "learning_rate": 4.89e-05, + "logits/chosen": -23.54969024658203, + "logits/rejected": -20.78655242919922, + "logps/chosen": -359.6000061035156, + "logps/rejected": -308.255126953125, + "loss": 0.8535, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -3.577561855316162, + "rewards/margins": 0.4763610064983368, + "rewards/rejected": -4.053922653198242, + "step": 400 + }, + { + "epoch": 0.027777777777777776, + "grad_norm": 6.906754493713379, + "learning_rate": 4.862222222222222e-05, + "logits/chosen": -22.755563735961914, + "logits/rejected": -20.829153060913086, + "logps/chosen": -364.575927734375, + "logps/rejected": -342.2480163574219, + "loss": 0.9323, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.1168599128723145, + "rewards/margins": 0.8832033276557922, + "rewards/rejected": -7.000063419342041, + "step": 500 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 26.487075805664062, + "learning_rate": 4.835e-05, + "logits/chosen": -22.652997970581055, + "logits/rejected": -20.060657501220703, + "logps/chosen": -406.8963928222656, + "logps/rejected": -349.3740539550781, + "loss": 1.4299, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -7.09837532043457, + "rewards/margins": 0.5164350867271423, + "rewards/rejected": -7.614809989929199, + "step": 600 + }, + { + "epoch": 0.03888888888888889, + "grad_norm": 20.443788528442383, + "learning_rate": 4.807222222222222e-05, + "logits/chosen": -22.491497039794922, + "logits/rejected": -19.860458374023438, + "logps/chosen": -374.18438720703125, + "logps/rejected": -326.9835205078125, + "loss": 0.9971, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -6.852818489074707, + "rewards/margins": 1.3072794675827026, + "rewards/rejected": -8.1600980758667, + "step": 700 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 125.28089904785156, + "learning_rate": 4.779444444444445e-05, + "logits/chosen": -22.120317459106445, + "logits/rejected": -18.23455238342285, + "logps/chosen": -447.3978576660156, + "logps/rejected": -332.87811279296875, + "loss": 1.1692, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -8.441169738769531, + "rewards/margins": 1.4077394008636475, + "rewards/rejected": -9.848909378051758, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 12.869420051574707, + "learning_rate": 4.751666666666667e-05, + "logits/chosen": -20.9626407623291, + "logits/rejected": -17.587430953979492, + "logps/chosen": -432.1313171386719, + "logps/rejected": -356.2396240234375, + "loss": 0.8583, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -10.766374588012695, + "rewards/margins": 2.785109043121338, + "rewards/rejected": -13.551483154296875, + "step": 900 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 28.736312866210938, + "learning_rate": 4.7241666666666665e-05, + "logits/chosen": -19.083620071411133, + "logits/rejected": -17.32378387451172, + "logps/chosen": -409.73272705078125, + "logps/rejected": -387.8572692871094, + "loss": 1.2954, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -11.696721076965332, + "rewards/margins": 1.5358176231384277, + "rewards/rejected": -13.232539176940918, + "step": 1000 + }, + { + "epoch": 0.05555555555555555, + "eval_logits/chosen": -22.120441436767578, + "eval_logits/rejected": -19.691335678100586, + "eval_logps/chosen": -394.6796569824219, + "eval_logps/rejected": -373.52099609375, + "eval_loss": 0.8348324298858643, + "eval_rewards/accuracies": 0.6974999904632568, + "eval_rewards/chosen": -8.014745712280273, + "eval_rewards/margins": 1.990256905555725, + "eval_rewards/rejected": -10.005003929138184, + "eval_runtime": 514.2377, + "eval_samples_per_second": 3.889, + "eval_steps_per_second": 3.889, + "step": 1000 + }, + { + "epoch": 0.06111111111111111, + "grad_norm": 247.53744506835938, + "learning_rate": 4.696388888888889e-05, + "logits/chosen": -23.17668914794922, + "logits/rejected": -19.9500732421875, + "logps/chosen": -428.59490966796875, + "logps/rejected": -390.1414794921875, + "loss": 0.7751, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -9.060705184936523, + "rewards/margins": 2.4199957847595215, + "rewards/rejected": -11.480701446533203, + "step": 1100 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 203.20802307128906, + "learning_rate": 4.6686111111111116e-05, + "logits/chosen": -19.072528839111328, + "logits/rejected": -17.659456253051758, + "logps/chosen": -404.7214050292969, + "logps/rejected": -427.1907043457031, + "loss": 1.4619, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -13.172075271606445, + "rewards/margins": 2.379429578781128, + "rewards/rejected": -15.551506042480469, + "step": 1200 + }, + { + "epoch": 0.07222222222222222, + "grad_norm": 84.43495178222656, + "learning_rate": 4.6408333333333334e-05, + "logits/chosen": -19.611614227294922, + "logits/rejected": -17.71375274658203, + "logps/chosen": -409.61248779296875, + "logps/rejected": -437.9902038574219, + "loss": 1.1262, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.74755859375, + "rewards/margins": 3.3012917041778564, + "rewards/rejected": -17.048851013183594, + "step": 1300 + }, + { + "epoch": 0.07777777777777778, + "grad_norm": 136.84536743164062, + "learning_rate": 4.613055555555556e-05, + "logits/chosen": -21.674936294555664, + "logits/rejected": -19.507160186767578, + "logps/chosen": -458.37335205078125, + "logps/rejected": -462.635009765625, + "loss": 1.2063, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -11.58407974243164, + "rewards/margins": 2.0285184383392334, + "rewards/rejected": -13.612598419189453, + "step": 1400 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 30.185537338256836, + "learning_rate": 4.585277777777778e-05, + "logits/chosen": -20.78430938720703, + "logits/rejected": -19.84255027770996, + "logps/chosen": -429.7988586425781, + "logps/rejected": -424.0312805175781, + "loss": 1.2422, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.19260025024414, + "rewards/margins": 2.49784255027771, + "rewards/rejected": -13.690443992614746, + "step": 1500 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 94.36914825439453, + "learning_rate": 4.5575e-05, + "logits/chosen": -23.507675170898438, + "logits/rejected": -19.682266235351562, + "logps/chosen": -440.8300476074219, + "logps/rejected": -373.97869873046875, + "loss": 1.0793, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -9.692771911621094, + "rewards/margins": 2.620527744293213, + "rewards/rejected": -12.313299179077148, + "step": 1600 + }, + { + "epoch": 0.09444444444444444, + "grad_norm": 0.23974980413913727, + "learning_rate": 4.529722222222222e-05, + "logits/chosen": -24.138744354248047, + "logits/rejected": -22.071462631225586, + "logps/chosen": -484.0884094238281, + "logps/rejected": -449.6929931640625, + "loss": 1.3212, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.323857307434082, + "rewards/margins": 2.2430076599121094, + "rewards/rejected": -14.566866874694824, + "step": 1700 + }, + { + "epoch": 0.1, + "grad_norm": 24.225988388061523, + "learning_rate": 4.501944444444445e-05, + "logits/chosen": -23.07463836669922, + "logits/rejected": -21.728696823120117, + "logps/chosen": -415.1795654296875, + "logps/rejected": -389.4674072265625, + "loss": 1.2762, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.503925323486328, + "rewards/margins": 2.0540616512298584, + "rewards/rejected": -13.557987213134766, + "step": 1800 + }, + { + "epoch": 0.10555555555555556, + "grad_norm": 0.7433995008468628, + "learning_rate": 4.474166666666667e-05, + "logits/chosen": -23.470361709594727, + "logits/rejected": -20.10572052001953, + "logps/chosen": -436.0697326660156, + "logps/rejected": -394.37469482421875, + "loss": 1.1971, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -9.563211441040039, + "rewards/margins": 2.4659552574157715, + "rewards/rejected": -12.029166221618652, + "step": 1900 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 10.76259708404541, + "learning_rate": 4.446388888888889e-05, + "logits/chosen": -24.007524490356445, + "logits/rejected": -21.513368606567383, + "logps/chosen": -367.1390380859375, + "logps/rejected": -352.7116394042969, + "loss": 0.9626, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -6.886416435241699, + "rewards/margins": 2.0072519779205322, + "rewards/rejected": -8.893669128417969, + "step": 2000 + }, + { + "epoch": 0.1111111111111111, + "eval_logits/chosen": -22.168193817138672, + "eval_logits/rejected": -19.89411735534668, + "eval_logps/chosen": -430.50250244140625, + "eval_logps/rejected": -418.2765808105469, + "eval_loss": 1.1797771453857422, + "eval_rewards/accuracies": 0.6959999799728394, + "eval_rewards/chosen": -11.597028732299805, + "eval_rewards/margins": 2.883537530899048, + "eval_rewards/rejected": -14.480566024780273, + "eval_runtime": 514.2817, + "eval_samples_per_second": 3.889, + "eval_steps_per_second": 3.889, + "step": 2000 + }, + { + "epoch": 0.11666666666666667, + "grad_norm": 90.20030212402344, + "learning_rate": 4.418611111111111e-05, + "logits/chosen": -22.598526000976562, + "logits/rejected": -20.72931480407715, + "logps/chosen": -461.95587158203125, + "logps/rejected": -460.3869934082031, + "loss": 1.312, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.029842376708984, + "rewards/margins": 3.128079891204834, + "rewards/rejected": -14.157923698425293, + "step": 2100 + }, + { + "epoch": 0.12222222222222222, + "grad_norm": 290.6123962402344, + "learning_rate": 4.3908333333333334e-05, + "logits/chosen": -20.012657165527344, + "logits/rejected": -17.646896362304688, + "logps/chosen": -437.4259033203125, + "logps/rejected": -457.9421081542969, + "loss": 1.0725, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.908517837524414, + "rewards/margins": 4.963741302490234, + "rewards/rejected": -17.87225914001465, + "step": 2200 + }, + { + "epoch": 0.12777777777777777, + "grad_norm": 5.925068378448486, + "learning_rate": 4.363055555555556e-05, + "logits/chosen": -21.483291625976562, + "logits/rejected": -20.906360626220703, + "logps/chosen": -435.2004699707031, + "logps/rejected": -452.76641845703125, + "loss": 1.2038, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.339729309082031, + "rewards/margins": 2.3920578956604004, + "rewards/rejected": -13.731786727905273, + "step": 2300 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 182.24160766601562, + "learning_rate": 4.335277777777778e-05, + "logits/chosen": -22.853578567504883, + "logits/rejected": -21.049165725708008, + "logps/chosen": -473.3839111328125, + "logps/rejected": -426.16143798828125, + "loss": 0.9851, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -11.509615898132324, + "rewards/margins": 2.831500291824341, + "rewards/rejected": -14.341115951538086, + "step": 2400 + }, + { + "epoch": 0.1388888888888889, + "grad_norm": 121.64925384521484, + "learning_rate": 4.3075000000000003e-05, + "logits/chosen": -21.929183959960938, + "logits/rejected": -19.701156616210938, + "logps/chosen": -400.2357482910156, + "logps/rejected": -393.37750244140625, + "loss": 1.2416, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -12.905362129211426, + "rewards/margins": 1.9682769775390625, + "rewards/rejected": -14.873639106750488, + "step": 2500 + }, + { + "epoch": 0.14444444444444443, + "grad_norm": 0.09141811728477478, + "learning_rate": 4.279722222222222e-05, + "logits/chosen": -21.682767868041992, + "logits/rejected": -18.566747665405273, + "logps/chosen": -418.79376220703125, + "logps/rejected": -372.9311218261719, + "loss": 1.3261, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.817575454711914, + "rewards/margins": 2.6196460723876953, + "rewards/rejected": -14.437220573425293, + "step": 2600 + }, + { + "epoch": 0.15, + "grad_norm": 92.0936508178711, + "learning_rate": 4.251944444444445e-05, + "logits/chosen": -20.049480438232422, + "logits/rejected": -18.856403350830078, + "logps/chosen": -404.0262145996094, + "logps/rejected": -406.05914306640625, + "loss": 0.9823, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.436899185180664, + "rewards/margins": 2.77144718170166, + "rewards/rejected": -14.20834732055664, + "step": 2700 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.00573139451444149, + "learning_rate": 4.2241666666666666e-05, + "logits/chosen": -22.489606857299805, + "logits/rejected": -19.643596649169922, + "logps/chosen": -456.1054382324219, + "logps/rejected": -431.6153869628906, + "loss": 1.0018, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -10.16519546508789, + "rewards/margins": 3.6660971641540527, + "rewards/rejected": -13.831293106079102, + "step": 2800 + }, + { + "epoch": 0.16111111111111112, + "grad_norm": 69.13850402832031, + "learning_rate": 4.196388888888889e-05, + "logits/chosen": -20.94375228881836, + "logits/rejected": -18.140661239624023, + "logps/chosen": -490.3839111328125, + "logps/rejected": -438.868408203125, + "loss": 1.1621, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.176280975341797, + "rewards/margins": 2.5138561725616455, + "rewards/rejected": -15.690135955810547, + "step": 2900 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 92.93842315673828, + "learning_rate": 4.1686111111111116e-05, + "logits/chosen": -23.850454330444336, + "logits/rejected": -20.55642318725586, + "logps/chosen": -500.03302001953125, + "logps/rejected": -413.0674133300781, + "loss": 0.8367, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -9.567950248718262, + "rewards/margins": 3.257716655731201, + "rewards/rejected": -12.825667381286621, + "step": 3000 + }, + { + "epoch": 0.16666666666666666, + "eval_logits/chosen": -22.81545639038086, + "eval_logits/rejected": -20.421916961669922, + "eval_logps/chosen": -418.67730712890625, + "eval_logps/rejected": -404.2204284667969, + "eval_loss": 1.0444144010543823, + "eval_rewards/accuracies": 0.7014999985694885, + "eval_rewards/chosen": -10.414511680603027, + "eval_rewards/margins": 2.6604373455047607, + "eval_rewards/rejected": -13.074949264526367, + "eval_runtime": 514.0529, + "eval_samples_per_second": 3.891, + "eval_steps_per_second": 3.891, + "step": 3000 + }, + { + "epoch": 0.17222222222222222, + "grad_norm": 61.449012756347656, + "learning_rate": 4.1408333333333335e-05, + "logits/chosen": -22.061830520629883, + "logits/rejected": -19.879331588745117, + "logps/chosen": -452.66387939453125, + "logps/rejected": -476.47515869140625, + "loss": 1.0061, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -10.6780366897583, + "rewards/margins": 4.249261856079102, + "rewards/rejected": -14.927298545837402, + "step": 3100 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.09455399960279465, + "learning_rate": 4.113055555555555e-05, + "logits/chosen": -21.203563690185547, + "logits/rejected": -19.023717880249023, + "logps/chosen": -402.2676696777344, + "logps/rejected": -374.0508117675781, + "loss": 1.1172, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -9.928569793701172, + "rewards/margins": 3.0265896320343018, + "rewards/rejected": -12.955158233642578, + "step": 3200 + }, + { + "epoch": 0.18333333333333332, + "grad_norm": 0.8054964542388916, + "learning_rate": 4.085277777777778e-05, + "logits/chosen": -22.43466567993164, + "logits/rejected": -19.53727149963379, + "logps/chosen": -439.93804931640625, + "logps/rejected": -401.6429443359375, + "loss": 1.3647, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -10.733546257019043, + "rewards/margins": 2.850750207901001, + "rewards/rejected": -13.584297180175781, + "step": 3300 + }, + { + "epoch": 0.18888888888888888, + "grad_norm": 159.09869384765625, + "learning_rate": 4.0575000000000004e-05, + "logits/chosen": -22.517881393432617, + "logits/rejected": -19.877910614013672, + "logps/chosen": -473.3965759277344, + "logps/rejected": -460.45684814453125, + "loss": 0.8705, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.473438262939453, + "rewards/margins": 3.278029203414917, + "rewards/rejected": -15.751468658447266, + "step": 3400 + }, + { + "epoch": 0.19444444444444445, + "grad_norm": 84.26875305175781, + "learning_rate": 4.029722222222222e-05, + "logits/chosen": -22.69594955444336, + "logits/rejected": -20.513687133789062, + "logps/chosen": -470.288818359375, + "logps/rejected": -464.061767578125, + "loss": 1.2692, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -12.416376113891602, + "rewards/margins": 3.6612563133239746, + "rewards/rejected": -16.077632904052734, + "step": 3500 + }, + { + "epoch": 0.2, + "grad_norm": 199.74862670898438, + "learning_rate": 4.001944444444445e-05, + "logits/chosen": -21.172372817993164, + "logits/rejected": -19.417011260986328, + "logps/chosen": -498.2371826171875, + "logps/rejected": -449.1603088378906, + "loss": 1.3479, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -15.376605987548828, + "rewards/margins": 2.43000864982605, + "rewards/rejected": -17.806615829467773, + "step": 3600 + }, + { + "epoch": 0.20555555555555555, + "grad_norm": 0.41523414850234985, + "learning_rate": 3.9741666666666666e-05, + "logits/chosen": -23.427236557006836, + "logits/rejected": -19.214725494384766, + "logps/chosen": -537.3643188476562, + "logps/rejected": -419.5450134277344, + "loss": 0.9649, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.399796485900879, + "rewards/margins": 3.3221099376678467, + "rewards/rejected": -14.721905708312988, + "step": 3700 + }, + { + "epoch": 0.2111111111111111, + "grad_norm": 1.2027373313903809, + "learning_rate": 3.946388888888889e-05, + "logits/chosen": -21.525869369506836, + "logits/rejected": -20.017871856689453, + "logps/chosen": -413.3069152832031, + "logps/rejected": -451.5067138671875, + "loss": 1.1504, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -11.805826187133789, + "rewards/margins": 3.5292584896087646, + "rewards/rejected": -15.335083961486816, + "step": 3800 + }, + { + "epoch": 0.21666666666666667, + "grad_norm": 0.47572144865989685, + "learning_rate": 3.918611111111111e-05, + "logits/chosen": -22.006607055664062, + "logits/rejected": -20.585355758666992, + "logps/chosen": -441.50335693359375, + "logps/rejected": -481.7433166503906, + "loss": 1.0002, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.167389869689941, + "rewards/margins": 4.146308422088623, + "rewards/rejected": -17.313697814941406, + "step": 3900 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.549039900302887, + "learning_rate": 3.8908333333333335e-05, + "logits/chosen": -21.06528091430664, + "logits/rejected": -18.383689880371094, + "logps/chosen": -414.032958984375, + "logps/rejected": -396.2176208496094, + "loss": 1.0356, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.922795295715332, + "rewards/margins": 3.354052782058716, + "rewards/rejected": -17.27684783935547, + "step": 4000 + }, + { + "epoch": 0.2222222222222222, + "eval_logits/chosen": -20.979387283325195, + "eval_logits/rejected": -18.688735961914062, + "eval_logps/chosen": -442.41949462890625, + "eval_logps/rejected": -433.6531982421875, + "eval_loss": 1.1642365455627441, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": -12.78873348236084, + "eval_rewards/margins": 3.229485273361206, + "eval_rewards/rejected": -16.018218994140625, + "eval_runtime": 514.7749, + "eval_samples_per_second": 3.885, + "eval_steps_per_second": 3.885, + "step": 4000 + }, + { + "epoch": 0.22777777777777777, + "grad_norm": 1.2259142398834229, + "learning_rate": 3.863055555555556e-05, + "logits/chosen": -21.384546279907227, + "logits/rejected": -19.441072463989258, + "logps/chosen": -417.543212890625, + "logps/rejected": -407.3497619628906, + "loss": 1.5039, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -11.751730918884277, + "rewards/margins": 1.9737460613250732, + "rewards/rejected": -13.725478172302246, + "step": 4100 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 147.1872100830078, + "learning_rate": 3.835277777777778e-05, + "logits/chosen": -22.113927841186523, + "logits/rejected": -20.51749038696289, + "logps/chosen": -375.87469482421875, + "logps/rejected": -367.4930114746094, + "loss": 0.9857, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.018177032470703, + "rewards/margins": 1.861909031867981, + "rewards/rejected": -10.880086898803711, + "step": 4200 + }, + { + "epoch": 0.2388888888888889, + "grad_norm": 0.33266976475715637, + "learning_rate": 3.8075e-05, + "logits/chosen": -22.488109588623047, + "logits/rejected": -19.233457565307617, + "logps/chosen": -416.78009033203125, + "logps/rejected": -393.82794189453125, + "loss": 0.9777, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.60778522491455, + "rewards/margins": 2.578366279602051, + "rewards/rejected": -14.186152458190918, + "step": 4300 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 1.7657444477081299, + "learning_rate": 3.779722222222222e-05, + "logits/chosen": -22.703102111816406, + "logits/rejected": -20.914888381958008, + "logps/chosen": -428.1728820800781, + "logps/rejected": -417.9210205078125, + "loss": 1.2457, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -10.225010871887207, + "rewards/margins": 2.225151538848877, + "rewards/rejected": -12.450161933898926, + "step": 4400 + }, + { + "epoch": 0.25, + "grad_norm": 1.0219632713415194e-06, + "learning_rate": 3.751944444444445e-05, + "logits/chosen": -23.13308334350586, + "logits/rejected": -20.678983688354492, + "logps/chosen": -481.14019775390625, + "logps/rejected": -426.427490234375, + "loss": 0.9046, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -10.760095596313477, + "rewards/margins": 3.60892915725708, + "rewards/rejected": -14.369023323059082, + "step": 4500 + }, + { + "epoch": 0.25555555555555554, + "grad_norm": 83.02727508544922, + "learning_rate": 3.7241666666666666e-05, + "logits/chosen": -22.329181671142578, + "logits/rejected": -20.42967987060547, + "logps/chosen": -443.2716064453125, + "logps/rejected": -431.0432434082031, + "loss": 1.0157, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.298175811767578, + "rewards/margins": 3.1891872882843018, + "rewards/rejected": -15.4873628616333, + "step": 4600 + }, + { + "epoch": 0.2611111111111111, + "grad_norm": 165.80728149414062, + "learning_rate": 3.696388888888889e-05, + "logits/chosen": -20.576295852661133, + "logits/rejected": -18.578920364379883, + "logps/chosen": -452.2530822753906, + "logps/rejected": -449.0054016113281, + "loss": 1.1048, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -15.771139144897461, + "rewards/margins": 2.9585018157958984, + "rewards/rejected": -18.729639053344727, + "step": 4700 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 70.66239929199219, + "learning_rate": 3.668611111111112e-05, + "logits/chosen": -21.812227249145508, + "logits/rejected": -18.751893997192383, + "logps/chosen": -502.3072509765625, + "logps/rejected": -469.2129211425781, + "loss": 0.7681, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -14.37415885925293, + "rewards/margins": 3.6863880157470703, + "rewards/rejected": -18.060546875, + "step": 4800 + }, + { + "epoch": 0.2722222222222222, + "grad_norm": 36.76324462890625, + "learning_rate": 3.6408333333333335e-05, + "logits/chosen": -21.831098556518555, + "logits/rejected": -18.956510543823242, + "logps/chosen": -508.1009826660156, + "logps/rejected": -427.7332763671875, + "loss": 1.4531, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -15.493310928344727, + "rewards/margins": 3.0047502517700195, + "rewards/rejected": -18.49806022644043, + "step": 4900 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 36.31307601928711, + "learning_rate": 3.6130555555555554e-05, + "logits/chosen": -20.227272033691406, + "logits/rejected": -19.00299835205078, + "logps/chosen": -407.9676208496094, + "logps/rejected": -453.76727294921875, + "loss": 0.8077, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.182390213012695, + "rewards/margins": 3.1895272731781006, + "rewards/rejected": -15.371917724609375, + "step": 5000 + }, + { + "epoch": 0.2777777777777778, + "eval_logits/chosen": -21.659564971923828, + "eval_logits/rejected": -19.52533531188965, + "eval_logps/chosen": -473.2389831542969, + "eval_logps/rejected": -465.0683288574219, + "eval_loss": 1.2852035760879517, + "eval_rewards/accuracies": 0.6970000267028809, + "eval_rewards/chosen": -15.87067699432373, + "eval_rewards/margins": 3.289060592651367, + "eval_rewards/rejected": -19.159738540649414, + "eval_runtime": 513.6976, + "eval_samples_per_second": 3.893, + "eval_steps_per_second": 3.893, + "step": 5000 + }, + { + "epoch": 0.2833333333333333, + "grad_norm": 21.41518211364746, + "learning_rate": 3.585277777777778e-05, + "logits/chosen": -22.319299697875977, + "logits/rejected": -20.139793395996094, + "logps/chosen": -428.5429382324219, + "logps/rejected": -404.91339111328125, + "loss": 1.5601, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -13.043146133422852, + "rewards/margins": 2.018044948577881, + "rewards/rejected": -15.06119155883789, + "step": 5100 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 86.79508972167969, + "learning_rate": 3.5575000000000004e-05, + "logits/chosen": -22.575952529907227, + "logits/rejected": -20.234146118164062, + "logps/chosen": -395.1700744628906, + "logps/rejected": -343.6614074707031, + "loss": 1.2703, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -9.909366607666016, + "rewards/margins": 2.0385243892669678, + "rewards/rejected": -11.947890281677246, + "step": 5200 + }, + { + "epoch": 0.29444444444444445, + "grad_norm": 2.5327157974243164, + "learning_rate": 3.529722222222222e-05, + "logits/chosen": -21.576257705688477, + "logits/rejected": -20.435449600219727, + "logps/chosen": -401.197021484375, + "logps/rejected": -413.098388671875, + "loss": 1.1228, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.823896408081055, + "rewards/margins": 2.34714412689209, + "rewards/rejected": -15.171039581298828, + "step": 5300 + }, + { + "epoch": 0.3, + "grad_norm": 9.139660687651485e-05, + "learning_rate": 3.501944444444444e-05, + "logits/chosen": -22.42139434814453, + "logits/rejected": -19.57501792907715, + "logps/chosen": -399.0223693847656, + "logps/rejected": -353.8651428222656, + "loss": 1.2688, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -11.916400909423828, + "rewards/margins": 2.020895004272461, + "rewards/rejected": -13.937294960021973, + "step": 5400 + }, + { + "epoch": 0.3055555555555556, + "grad_norm": 1.9351762533187866, + "learning_rate": 3.4741666666666666e-05, + "logits/chosen": -23.159526824951172, + "logits/rejected": -21.7009220123291, + "logps/chosen": -398.2279052734375, + "logps/rejected": -388.0333251953125, + "loss": 1.1417, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.379648208618164, + "rewards/margins": 1.8808188438415527, + "rewards/rejected": -14.260467529296875, + "step": 5500 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 3.178453698637895e-05, + "learning_rate": 3.446388888888889e-05, + "logits/chosen": -23.597034454345703, + "logits/rejected": -21.123607635498047, + "logps/chosen": -443.39385986328125, + "logps/rejected": -416.39508056640625, + "loss": 1.1924, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.244176864624023, + "rewards/margins": 2.6304259300231934, + "rewards/rejected": -15.874602317810059, + "step": 5600 + }, + { + "epoch": 0.31666666666666665, + "grad_norm": 1.023967981338501, + "learning_rate": 3.418611111111111e-05, + "logits/chosen": -24.59587860107422, + "logits/rejected": -22.183242797851562, + "logps/chosen": -442.1808166503906, + "logps/rejected": -444.6952819824219, + "loss": 1.2523, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -13.608128547668457, + "rewards/margins": 2.371929407119751, + "rewards/rejected": -15.980057716369629, + "step": 5700 + }, + { + "epoch": 0.32222222222222224, + "grad_norm": 164.69281005859375, + "learning_rate": 3.391111111111111e-05, + "logits/chosen": -24.16000747680664, + "logits/rejected": -21.930662155151367, + "logps/chosen": -494.000244140625, + "logps/rejected": -455.0928649902344, + "loss": 1.4878, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.979549407958984, + "rewards/margins": 2.5671088695526123, + "rewards/rejected": -14.546658515930176, + "step": 5800 + }, + { + "epoch": 0.3277777777777778, + "grad_norm": 9.61162281036377, + "learning_rate": 3.3633333333333335e-05, + "logits/chosen": -23.32587432861328, + "logits/rejected": -19.92889976501465, + "logps/chosen": -483.7118835449219, + "logps/rejected": -393.5342102050781, + "loss": 1.1587, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.974082946777344, + "rewards/margins": 2.592017412185669, + "rewards/rejected": -14.56610107421875, + "step": 5900 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 5.826363121741451e-06, + "learning_rate": 3.3355555555555554e-05, + "logits/chosen": -21.392478942871094, + "logits/rejected": -20.116945266723633, + "logps/chosen": -472.2950439453125, + "logps/rejected": -532.1182250976562, + "loss": 0.9033, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -13.251187324523926, + "rewards/margins": 3.6873950958251953, + "rewards/rejected": -16.938583374023438, + "step": 6000 + }, + { + "epoch": 0.3333333333333333, + "eval_logits/chosen": -19.404247283935547, + "eval_logits/rejected": -17.24852180480957, + "eval_logps/chosen": -477.0966796875, + "eval_logps/rejected": -469.74505615234375, + "eval_loss": 1.159195065498352, + "eval_rewards/accuracies": 0.703499972820282, + "eval_rewards/chosen": -16.256450653076172, + "eval_rewards/margins": 3.370957136154175, + "eval_rewards/rejected": -19.62740707397461, + "eval_runtime": 514.4506, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 3.888, + "step": 6000 + }, + { + "epoch": 0.3388888888888889, + "grad_norm": 187.22665405273438, + "learning_rate": 3.307777777777778e-05, + "logits/chosen": -20.702913284301758, + "logits/rejected": -19.409847259521484, + "logps/chosen": -499.7383728027344, + "logps/rejected": -497.5546875, + "loss": 1.482, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -16.19487762451172, + "rewards/margins": 2.657153844833374, + "rewards/rejected": -18.85202980041504, + "step": 6100 + }, + { + "epoch": 0.34444444444444444, + "grad_norm": 8.905752182006836, + "learning_rate": 3.2800000000000004e-05, + "logits/chosen": -20.83306884765625, + "logits/rejected": -18.038827896118164, + "logps/chosen": -443.535888671875, + "logps/rejected": -386.2900085449219, + "loss": 1.3382, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -12.683348655700684, + "rewards/margins": 2.031609058380127, + "rewards/rejected": -14.714957237243652, + "step": 6200 + }, + { + "epoch": 0.35, + "grad_norm": 7.003231048583984, + "learning_rate": 3.252222222222222e-05, + "logits/chosen": -23.669322967529297, + "logits/rejected": -19.861604690551758, + "logps/chosen": -413.4633483886719, + "logps/rejected": -357.4058532714844, + "loss": 0.7981, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -9.068346977233887, + "rewards/margins": 2.977654457092285, + "rewards/rejected": -12.046002388000488, + "step": 6300 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.7310218214988708, + "learning_rate": 3.224444444444444e-05, + "logits/chosen": -22.32114028930664, + "logits/rejected": -20.614816665649414, + "logps/chosen": -444.9656982421875, + "logps/rejected": -448.8602294921875, + "loss": 0.9717, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -10.981616973876953, + "rewards/margins": 3.538257360458374, + "rewards/rejected": -14.519875526428223, + "step": 6400 + }, + { + "epoch": 0.3611111111111111, + "grad_norm": 205.68313598632812, + "learning_rate": 3.196666666666667e-05, + "logits/chosen": -19.612163543701172, + "logits/rejected": -18.399375915527344, + "logps/chosen": -460.20623779296875, + "logps/rejected": -466.8053894042969, + "loss": 1.5208, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -14.53918170928955, + "rewards/margins": 2.0252883434295654, + "rewards/rejected": -16.564472198486328, + "step": 6500 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 76.91722106933594, + "learning_rate": 3.168888888888889e-05, + "logits/chosen": -22.61004066467285, + "logits/rejected": -19.702980041503906, + "logps/chosen": -481.0522766113281, + "logps/rejected": -461.2919006347656, + "loss": 0.9532, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.666290283203125, + "rewards/margins": 3.198349714279175, + "rewards/rejected": -16.864639282226562, + "step": 6600 + }, + { + "epoch": 0.37222222222222223, + "grad_norm": 0.46531158685684204, + "learning_rate": 3.141111111111111e-05, + "logits/chosen": -22.23012924194336, + "logits/rejected": -20.40010643005371, + "logps/chosen": -419.5267639160156, + "logps/rejected": -405.9312438964844, + "loss": 0.5831, + "rewards/accuracies": 0.8100000023841858, + "rewards/chosen": -11.47601318359375, + "rewards/margins": 3.7709903717041016, + "rewards/rejected": -15.247004508972168, + "step": 6700 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.1342560350894928, + "learning_rate": 3.1133333333333336e-05, + "logits/chosen": -20.81796646118164, + "logits/rejected": -18.216876983642578, + "logps/chosen": -455.734130859375, + "logps/rejected": -420.1650390625, + "loss": 1.1916, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.779292106628418, + "rewards/margins": 2.576178789138794, + "rewards/rejected": -15.35547161102295, + "step": 6800 + }, + { + "epoch": 0.38333333333333336, + "grad_norm": 8.749398231506348, + "learning_rate": 3.085555555555556e-05, + "logits/chosen": -20.364534378051758, + "logits/rejected": -18.375808715820312, + "logps/chosen": -449.0058288574219, + "logps/rejected": -414.3318786621094, + "loss": 1.1997, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.63778305053711, + "rewards/margins": 2.045936346054077, + "rewards/rejected": -15.683719635009766, + "step": 6900 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 0.244241863489151, + "learning_rate": 3.057777777777778e-05, + "logits/chosen": -20.147541046142578, + "logits/rejected": -17.860939025878906, + "logps/chosen": -480.4094543457031, + "logps/rejected": -507.03570556640625, + "loss": 0.9384, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -14.279885292053223, + "rewards/margins": 3.4173572063446045, + "rewards/rejected": -17.697240829467773, + "step": 7000 + }, + { + "epoch": 0.3888888888888889, + "eval_logits/chosen": -20.12757682800293, + "eval_logits/rejected": -17.744281768798828, + "eval_logps/chosen": -464.8318176269531, + "eval_logps/rejected": -457.46710205078125, + "eval_loss": 1.0641870498657227, + "eval_rewards/accuracies": 0.7229999899864197, + "eval_rewards/chosen": -15.029963493347168, + "eval_rewards/margins": 3.3696537017822266, + "eval_rewards/rejected": -18.399616241455078, + "eval_runtime": 514.5541, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 7000 + }, + { + "epoch": 0.39444444444444443, + "grad_norm": 0.005536902695894241, + "learning_rate": 3.03e-05, + "logits/chosen": -20.647850036621094, + "logits/rejected": -18.133609771728516, + "logps/chosen": -415.6095275878906, + "logps/rejected": -402.4629821777344, + "loss": 0.8857, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.265934944152832, + "rewards/margins": 3.624260663986206, + "rewards/rejected": -15.8901948928833, + "step": 7100 + }, + { + "epoch": 0.4, + "grad_norm": 0.04572853818535805, + "learning_rate": 3.0022222222222223e-05, + "logits/chosen": -19.907888412475586, + "logits/rejected": -18.142955780029297, + "logps/chosen": -423.48309326171875, + "logps/rejected": -462.4736022949219, + "loss": 0.9713, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -15.313251495361328, + "rewards/margins": 2.8448657989501953, + "rewards/rejected": -18.158119201660156, + "step": 7200 + }, + { + "epoch": 0.40555555555555556, + "grad_norm": 0.6151033043861389, + "learning_rate": 2.974444444444445e-05, + "logits/chosen": -20.229476928710938, + "logits/rejected": -18.490764617919922, + "logps/chosen": -457.56182861328125, + "logps/rejected": -473.5592956542969, + "loss": 1.0747, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -14.09079360961914, + "rewards/margins": 2.365145683288574, + "rewards/rejected": -16.4559383392334, + "step": 7300 + }, + { + "epoch": 0.4111111111111111, + "grad_norm": 7.291659355163574, + "learning_rate": 2.946666666666667e-05, + "logits/chosen": -23.004566192626953, + "logits/rejected": -19.805797576904297, + "logps/chosen": -481.6417236328125, + "logps/rejected": -431.62652587890625, + "loss": 0.7681, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.548958778381348, + "rewards/margins": 3.8286855220794678, + "rewards/rejected": -16.37764549255371, + "step": 7400 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 83.0041732788086, + "learning_rate": 2.918888888888889e-05, + "logits/chosen": -19.5795841217041, + "logits/rejected": -17.497295379638672, + "logps/chosen": -500.30450439453125, + "logps/rejected": -476.33551025390625, + "loss": 0.9668, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -15.366935729980469, + "rewards/margins": 3.3807599544525146, + "rewards/rejected": -18.74769401550293, + "step": 7500 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 91.04598236083984, + "learning_rate": 2.891111111111111e-05, + "logits/chosen": -17.51012420654297, + "logits/rejected": -15.843911170959473, + "logps/chosen": -526.481689453125, + "logps/rejected": -524.4310302734375, + "loss": 1.4432, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -21.066740036010742, + "rewards/margins": 3.3124139308929443, + "rewards/rejected": -24.3791561126709, + "step": 7600 + }, + { + "epoch": 0.42777777777777776, + "grad_norm": 104.69700622558594, + "learning_rate": 2.8633333333333336e-05, + "logits/chosen": -20.341449737548828, + "logits/rejected": -18.151596069335938, + "logps/chosen": -461.58953857421875, + "logps/rejected": -475.7055358886719, + "loss": 1.3875, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -14.036093711853027, + "rewards/margins": 2.4607656002044678, + "rewards/rejected": -16.496858596801758, + "step": 7700 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.027652239426970482, + "learning_rate": 2.8358333333333336e-05, + "logits/chosen": -19.186847686767578, + "logits/rejected": -17.649900436401367, + "logps/chosen": -383.07452392578125, + "logps/rejected": -415.3429260253906, + "loss": 0.653, + "rewards/accuracies": 0.7799999713897705, + "rewards/chosen": -12.019598007202148, + "rewards/margins": 3.3070571422576904, + "rewards/rejected": -15.326654434204102, + "step": 7800 + }, + { + "epoch": 0.4388888888888889, + "grad_norm": 6.4372172355651855, + "learning_rate": 2.8086111111111114e-05, + "logits/chosen": -19.208694458007812, + "logits/rejected": -16.9169979095459, + "logps/chosen": -421.28155517578125, + "logps/rejected": -468.7637634277344, + "loss": 1.2184, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -14.001653671264648, + "rewards/margins": 3.6824240684509277, + "rewards/rejected": -17.684080123901367, + "step": 7900 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 12.756057739257812, + "learning_rate": 2.7808333333333336e-05, + "logits/chosen": -20.561195373535156, + "logits/rejected": -18.399860382080078, + "logps/chosen": -395.6372985839844, + "logps/rejected": -380.3861389160156, + "loss": 0.972, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -12.981096267700195, + "rewards/margins": 3.4479990005493164, + "rewards/rejected": -16.429096221923828, + "step": 8000 + }, + { + "epoch": 0.4444444444444444, + "eval_logits/chosen": -21.462675094604492, + "eval_logits/rejected": -18.91302490234375, + "eval_logps/chosen": -445.71258544921875, + "eval_logps/rejected": -438.1914367675781, + "eval_loss": 1.005431056022644, + "eval_rewards/accuracies": 0.7304999828338623, + "eval_rewards/chosen": -13.118040084838867, + "eval_rewards/margins": 3.35400390625, + "eval_rewards/rejected": -16.472042083740234, + "eval_runtime": 514.4453, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 3.888, + "step": 8000 + }, + { + "epoch": 0.45, + "grad_norm": 0.873455822467804, + "learning_rate": 2.7530555555555558e-05, + "logits/chosen": -20.926027297973633, + "logits/rejected": -18.914487838745117, + "logps/chosen": -449.6330871582031, + "logps/rejected": -401.6344909667969, + "loss": 1.4466, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -13.639978408813477, + "rewards/margins": 2.354579448699951, + "rewards/rejected": -15.994558334350586, + "step": 8100 + }, + { + "epoch": 0.45555555555555555, + "grad_norm": 0.09531093388795853, + "learning_rate": 2.7252777777777777e-05, + "logits/chosen": -23.3233699798584, + "logits/rejected": -20.481056213378906, + "logps/chosen": -498.3794860839844, + "logps/rejected": -424.1575927734375, + "loss": 1.2541, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -13.865898132324219, + "rewards/margins": 2.3438451290130615, + "rewards/rejected": -16.20974349975586, + "step": 8200 + }, + { + "epoch": 0.46111111111111114, + "grad_norm": 1.2285990715026855, + "learning_rate": 2.6975000000000002e-05, + "logits/chosen": -20.74081802368164, + "logits/rejected": -18.432857513427734, + "logps/chosen": -419.7821960449219, + "logps/rejected": -401.6585693359375, + "loss": 0.9419, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.170726776123047, + "rewards/margins": 2.6834239959716797, + "rewards/rejected": -14.854150772094727, + "step": 8300 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 105.08723449707031, + "learning_rate": 2.6697222222222224e-05, + "logits/chosen": -20.322032928466797, + "logits/rejected": -19.04830551147461, + "logps/chosen": -386.33343505859375, + "logps/rejected": -441.2618713378906, + "loss": 1.1772, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.48509407043457, + "rewards/margins": 2.949591636657715, + "rewards/rejected": -16.4346866607666, + "step": 8400 + }, + { + "epoch": 0.4722222222222222, + "grad_norm": 19.047433853149414, + "learning_rate": 2.6419444444444446e-05, + "logits/chosen": -20.47550392150879, + "logits/rejected": -17.77329444885254, + "logps/chosen": -463.8529968261719, + "logps/rejected": -431.220458984375, + "loss": 1.1826, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.552116394042969, + "rewards/margins": 2.663545846939087, + "rewards/rejected": -16.215662002563477, + "step": 8500 + }, + { + "epoch": 0.4777777777777778, + "grad_norm": 1.1426838142369888e-07, + "learning_rate": 2.614166666666667e-05, + "logits/chosen": -20.838682174682617, + "logits/rejected": -18.776742935180664, + "logps/chosen": -498.4126892089844, + "logps/rejected": -543.6761474609375, + "loss": 0.7934, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -16.565221786499023, + "rewards/margins": 5.299337387084961, + "rewards/rejected": -21.86455726623535, + "step": 8600 + }, + { + "epoch": 0.48333333333333334, + "grad_norm": 0.1120363399386406, + "learning_rate": 2.5863888888888886e-05, + "logits/chosen": -17.256568908691406, + "logits/rejected": -16.536701202392578, + "logps/chosen": -466.61212158203125, + "logps/rejected": -482.1832275390625, + "loss": 1.7192, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -18.734832763671875, + "rewards/margins": 1.7566769123077393, + "rewards/rejected": -20.49150848388672, + "step": 8700 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.0029417257755994797, + "learning_rate": 2.558611111111111e-05, + "logits/chosen": -19.67310905456543, + "logits/rejected": -16.213468551635742, + "logps/chosen": -428.3750305175781, + "logps/rejected": -393.8487548828125, + "loss": 0.9911, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -13.139904975891113, + "rewards/margins": 3.9154715538024902, + "rewards/rejected": -17.055377960205078, + "step": 8800 + }, + { + "epoch": 0.49444444444444446, + "grad_norm": 126.61809539794922, + "learning_rate": 2.5308333333333333e-05, + "logits/chosen": -23.180278778076172, + "logits/rejected": -19.511472702026367, + "logps/chosen": -489.6237487792969, + "logps/rejected": -454.31573486328125, + "loss": 0.9557, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.57674789428711, + "rewards/margins": 4.107083797454834, + "rewards/rejected": -16.6838321685791, + "step": 8900 + }, + { + "epoch": 0.5, + "grad_norm": 0.33939558267593384, + "learning_rate": 2.5030555555555558e-05, + "logits/chosen": -22.32333755493164, + "logits/rejected": -20.256454467773438, + "logps/chosen": -536.5589599609375, + "logps/rejected": -516.0322265625, + "loss": 1.2412, + "rewards/accuracies": 0.75, + "rewards/chosen": -13.354819297790527, + "rewards/margins": 2.919679880142212, + "rewards/rejected": -16.274499893188477, + "step": 9000 + }, + { + "epoch": 0.5, + "eval_logits/chosen": -20.591720581054688, + "eval_logits/rejected": -18.115394592285156, + "eval_logps/chosen": -434.9206237792969, + "eval_logps/rejected": -427.2961120605469, + "eval_loss": 0.953862190246582, + "eval_rewards/accuracies": 0.7310000061988831, + "eval_rewards/chosen": -12.03884220123291, + "eval_rewards/margins": 3.343675374984741, + "eval_rewards/rejected": -15.382518768310547, + "eval_runtime": 514.5622, + "eval_samples_per_second": 3.887, + "eval_steps_per_second": 3.887, + "step": 9000 + }, + { + "epoch": 0.5055555555555555, + "grad_norm": 142.49417114257812, + "learning_rate": 2.475277777777778e-05, + "logits/chosen": -20.888193130493164, + "logits/rejected": -17.153379440307617, + "logps/chosen": -424.7377624511719, + "logps/rejected": -385.7281494140625, + "loss": 0.9644, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -10.958181381225586, + "rewards/margins": 3.00290584564209, + "rewards/rejected": -13.96108627319336, + "step": 9100 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 8.631003379821777, + "learning_rate": 2.4475000000000002e-05, + "logits/chosen": -21.170795440673828, + "logits/rejected": -19.17572784423828, + "logps/chosen": -455.48712158203125, + "logps/rejected": -481.4903869628906, + "loss": 1.031, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.5238037109375, + "rewards/margins": 3.058394193649292, + "rewards/rejected": -13.582197189331055, + "step": 9200 + }, + { + "epoch": 0.5166666666666667, + "grad_norm": 49.392459869384766, + "learning_rate": 2.4197222222222224e-05, + "logits/chosen": -19.933317184448242, + "logits/rejected": -17.989290237426758, + "logps/chosen": -409.6031188964844, + "logps/rejected": -428.5143737792969, + "loss": 1.0103, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.640972137451172, + "rewards/margins": 3.1585609912872314, + "rewards/rejected": -15.799530982971191, + "step": 9300 + }, + { + "epoch": 0.5222222222222223, + "grad_norm": 0.04712487757205963, + "learning_rate": 2.3919444444444446e-05, + "logits/chosen": -19.352563858032227, + "logits/rejected": -16.899452209472656, + "logps/chosen": -433.61737060546875, + "logps/rejected": -420.65753173828125, + "loss": 0.9818, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.074993133544922, + "rewards/margins": 3.3845109939575195, + "rewards/rejected": -15.459502220153809, + "step": 9400 + }, + { + "epoch": 0.5277777777777778, + "grad_norm": 51.79705047607422, + "learning_rate": 2.3641666666666668e-05, + "logits/chosen": -20.3154296875, + "logits/rejected": -17.9931583404541, + "logps/chosen": -461.1064758300781, + "logps/rejected": -437.752685546875, + "loss": 0.8858, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.093022346496582, + "rewards/margins": 3.264617919921875, + "rewards/rejected": -16.357641220092773, + "step": 9500 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 119.83451843261719, + "learning_rate": 2.336388888888889e-05, + "logits/chosen": -18.93035316467285, + "logits/rejected": -18.193710327148438, + "logps/chosen": -452.3094482421875, + "logps/rejected": -444.74090576171875, + "loss": 1.0935, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -13.256900787353516, + "rewards/margins": 2.8288626670837402, + "rewards/rejected": -16.08576202392578, + "step": 9600 + }, + { + "epoch": 0.5388888888888889, + "grad_norm": 3.306342363357544, + "learning_rate": 2.308611111111111e-05, + "logits/chosen": -20.609481811523438, + "logits/rejected": -19.013137817382812, + "logps/chosen": -452.32977294921875, + "logps/rejected": -459.5257873535156, + "loss": 0.7868, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -14.13031005859375, + "rewards/margins": 2.8975772857666016, + "rewards/rejected": -17.02788734436035, + "step": 9700 + }, + { + "epoch": 0.5444444444444444, + "grad_norm": 2.263075351715088, + "learning_rate": 2.2808333333333333e-05, + "logits/chosen": -19.277490615844727, + "logits/rejected": -15.582589149475098, + "logps/chosen": -431.451904296875, + "logps/rejected": -414.69317626953125, + "loss": 0.7792, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.001684188842773, + "rewards/margins": 3.5575027465820312, + "rewards/rejected": -16.559186935424805, + "step": 9800 + }, + { + "epoch": 0.55, + "grad_norm": 0.0013317704433575273, + "learning_rate": 2.253055555555556e-05, + "logits/chosen": -20.729467391967773, + "logits/rejected": -17.83069610595703, + "logps/chosen": -446.4141540527344, + "logps/rejected": -436.2938537597656, + "loss": 0.9514, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -14.056103706359863, + "rewards/margins": 3.98051381111145, + "rewards/rejected": -18.036617279052734, + "step": 9900 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 35.789085388183594, + "learning_rate": 2.2252777777777777e-05, + "logits/chosen": -21.45740509033203, + "logits/rejected": -18.5902042388916, + "logps/chosen": -450.0965270996094, + "logps/rejected": -435.7716064453125, + "loss": 0.8216, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.103019714355469, + "rewards/margins": 3.6182754039764404, + "rewards/rejected": -17.721294403076172, + "step": 10000 + }, + { + "epoch": 0.5555555555555556, + "eval_logits/chosen": -21.712465286254883, + "eval_logits/rejected": -19.086681365966797, + "eval_logps/chosen": -441.17352294921875, + "eval_logps/rejected": -435.81170654296875, + "eval_loss": 1.0835009813308716, + "eval_rewards/accuracies": 0.7285000085830688, + "eval_rewards/chosen": -12.664133071899414, + "eval_rewards/margins": 3.5699427127838135, + "eval_rewards/rejected": -16.23407554626465, + "eval_runtime": 513.7391, + "eval_samples_per_second": 3.893, + "eval_steps_per_second": 3.893, + "step": 10000 + }, + { + "epoch": 0.5611111111111111, + "grad_norm": 0.7180230021476746, + "learning_rate": 2.1975000000000002e-05, + "logits/chosen": -21.479724884033203, + "logits/rejected": -18.234243392944336, + "logps/chosen": -443.31597900390625, + "logps/rejected": -431.96600341796875, + "loss": 0.819, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -12.655189514160156, + "rewards/margins": 4.10093879699707, + "rewards/rejected": -16.756128311157227, + "step": 10100 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 126.35246276855469, + "learning_rate": 2.1697222222222224e-05, + "logits/chosen": -19.195253372192383, + "logits/rejected": -17.009681701660156, + "logps/chosen": -448.04376220703125, + "logps/rejected": -401.873291015625, + "loss": 1.3503, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -14.247776985168457, + "rewards/margins": 3.0657379627227783, + "rewards/rejected": -17.313512802124023, + "step": 10200 + }, + { + "epoch": 0.5722222222222222, + "grad_norm": 109.78204345703125, + "learning_rate": 2.1419444444444446e-05, + "logits/chosen": -22.040660858154297, + "logits/rejected": -19.4868221282959, + "logps/chosen": -497.30859375, + "logps/rejected": -412.1036071777344, + "loss": 1.3664, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -12.726430892944336, + "rewards/margins": 2.247918128967285, + "rewards/rejected": -14.974349021911621, + "step": 10300 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 131.8098602294922, + "learning_rate": 2.1141666666666668e-05, + "logits/chosen": -19.425859451293945, + "logits/rejected": -16.64602279663086, + "logps/chosen": -422.4449157714844, + "logps/rejected": -385.5190734863281, + "loss": 1.0666, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.786916732788086, + "rewards/margins": 3.4485232830047607, + "rewards/rejected": -16.235441207885742, + "step": 10400 + }, + { + "epoch": 0.5833333333333334, + "grad_norm": 39.41535186767578, + "learning_rate": 2.086388888888889e-05, + "logits/chosen": -20.2507381439209, + "logits/rejected": -18.010120391845703, + "logps/chosen": -542.4505004882812, + "logps/rejected": -533.647705078125, + "loss": 0.8223, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -17.083816528320312, + "rewards/margins": 3.801875591278076, + "rewards/rejected": -20.885690689086914, + "step": 10500 + }, + { + "epoch": 0.5888888888888889, + "grad_norm": 19.924392700195312, + "learning_rate": 2.058611111111111e-05, + "logits/chosen": -19.612703323364258, + "logits/rejected": -18.6157169342041, + "logps/chosen": -445.16156005859375, + "logps/rejected": -499.56976318359375, + "loss": 0.9663, + "rewards/accuracies": 0.75, + "rewards/chosen": -16.382972717285156, + "rewards/margins": 3.505514144897461, + "rewards/rejected": -19.88848876953125, + "step": 10600 + }, + { + "epoch": 0.5944444444444444, + "grad_norm": 4.810013294219971, + "learning_rate": 2.0308333333333333e-05, + "logits/chosen": -17.834911346435547, + "logits/rejected": -15.12381362915039, + "logps/chosen": -537.6931762695312, + "logps/rejected": -514.4136962890625, + "loss": 0.8739, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -18.53960418701172, + "rewards/margins": 3.9858081340789795, + "rewards/rejected": -22.525409698486328, + "step": 10700 + }, + { + "epoch": 0.6, + "grad_norm": 19.66077995300293, + "learning_rate": 2.0030555555555555e-05, + "logits/chosen": -16.975112915039062, + "logits/rejected": -15.271533966064453, + "logps/chosen": -485.5791320800781, + "logps/rejected": -470.1282043457031, + "loss": 0.9378, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -16.441003799438477, + "rewards/margins": 4.130311489105225, + "rewards/rejected": -20.571313858032227, + "step": 10800 + }, + { + "epoch": 0.6055555555555555, + "grad_norm": 137.1862335205078, + "learning_rate": 1.9752777777777777e-05, + "logits/chosen": -19.495790481567383, + "logits/rejected": -17.684574127197266, + "logps/chosen": -489.1669921875, + "logps/rejected": -474.1072998046875, + "loss": 1.2387, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.691755294799805, + "rewards/margins": 3.6350252628326416, + "rewards/rejected": -19.326780319213867, + "step": 10900 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 0.000310105417156592, + "learning_rate": 1.9475000000000002e-05, + "logits/chosen": -19.06981658935547, + "logits/rejected": -16.941869735717773, + "logps/chosen": -476.37713623046875, + "logps/rejected": -477.0956726074219, + "loss": 0.8211, + "rewards/accuracies": 0.7799999713897705, + "rewards/chosen": -13.586738586425781, + "rewards/margins": 3.9357149600982666, + "rewards/rejected": -17.52245330810547, + "step": 11000 + }, + { + "epoch": 0.6111111111111112, + "eval_logits/chosen": -18.58342170715332, + "eval_logits/rejected": -16.301347732543945, + "eval_logps/chosen": -445.8861999511719, + "eval_logps/rejected": -440.4415588378906, + "eval_loss": 1.0195616483688354, + "eval_rewards/accuracies": 0.734499990940094, + "eval_rewards/chosen": -13.135401725769043, + "eval_rewards/margins": 3.561659812927246, + "eval_rewards/rejected": -16.697063446044922, + "eval_runtime": 514.0886, + "eval_samples_per_second": 3.89, + "eval_steps_per_second": 3.89, + "step": 11000 + }, + { + "epoch": 0.6166666666666667, + "grad_norm": 53.936344146728516, + "learning_rate": 1.919722222222222e-05, + "logits/chosen": -20.594526290893555, + "logits/rejected": -17.588470458984375, + "logps/chosen": -481.7535095214844, + "logps/rejected": -439.7695617675781, + "loss": 0.6712, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -11.763232231140137, + "rewards/margins": 3.9691224098205566, + "rewards/rejected": -15.732356071472168, + "step": 11100 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 205.2020721435547, + "learning_rate": 1.8919444444444446e-05, + "logits/chosen": -19.583057403564453, + "logits/rejected": -17.61818504333496, + "logps/chosen": -460.3359069824219, + "logps/rejected": -442.7764587402344, + "loss": 0.9559, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -13.048367500305176, + "rewards/margins": 3.1950597763061523, + "rewards/rejected": -16.24342918395996, + "step": 11200 + }, + { + "epoch": 0.6277777777777778, + "grad_norm": 192.8442840576172, + "learning_rate": 1.8641666666666668e-05, + "logits/chosen": -20.468460083007812, + "logits/rejected": -17.665485382080078, + "logps/chosen": -530.1047973632812, + "logps/rejected": -502.96307373046875, + "loss": 1.2576, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -15.131207466125488, + "rewards/margins": 4.251911640167236, + "rewards/rejected": -19.38311767578125, + "step": 11300 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.004229276906698942, + "learning_rate": 1.836388888888889e-05, + "logits/chosen": -21.695653915405273, + "logits/rejected": -19.568984985351562, + "logps/chosen": -441.556884765625, + "logps/rejected": -431.2348937988281, + "loss": 1.3119, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -11.982224464416504, + "rewards/margins": 2.4050285816192627, + "rewards/rejected": -14.387251853942871, + "step": 11400 + }, + { + "epoch": 0.6388888888888888, + "grad_norm": 25.97652244567871, + "learning_rate": 1.8086111111111112e-05, + "logits/chosen": -21.555927276611328, + "logits/rejected": -18.617895126342773, + "logps/chosen": -408.0957336425781, + "logps/rejected": -403.4908142089844, + "loss": 1.1708, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.917741775512695, + "rewards/margins": 3.201524019241333, + "rewards/rejected": -15.119266510009766, + "step": 11500 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.0003936285793315619, + "learning_rate": 1.7808333333333334e-05, + "logits/chosen": -19.159265518188477, + "logits/rejected": -16.5881404876709, + "logps/chosen": -422.9766540527344, + "logps/rejected": -356.32781982421875, + "loss": 1.0118, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -12.874427795410156, + "rewards/margins": 2.578512191772461, + "rewards/rejected": -15.452940940856934, + "step": 11600 + }, + { + "epoch": 0.65, + "grad_norm": 1.3970117568969727, + "learning_rate": 1.7530555555555556e-05, + "logits/chosen": -20.206884384155273, + "logits/rejected": -17.95098876953125, + "logps/chosen": -457.98773193359375, + "logps/rejected": -438.3656921386719, + "loss": 1.1103, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -13.596541404724121, + "rewards/margins": 3.0302700996398926, + "rewards/rejected": -16.626811981201172, + "step": 11700 + }, + { + "epoch": 0.6555555555555556, + "grad_norm": 0.0008353956509381533, + "learning_rate": 1.7252777777777777e-05, + "logits/chosen": -19.167177200317383, + "logits/rejected": -17.282243728637695, + "logps/chosen": -465.3480529785156, + "logps/rejected": -428.1108093261719, + "loss": 0.9969, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -14.03890609741211, + "rewards/margins": 3.1509668827056885, + "rewards/rejected": -17.18987274169922, + "step": 11800 + }, + { + "epoch": 0.6611111111111111, + "grad_norm": 7.814167022705078, + "learning_rate": 1.6975000000000003e-05, + "logits/chosen": -20.246158599853516, + "logits/rejected": -17.652912139892578, + "logps/chosen": -439.302734375, + "logps/rejected": -391.4903869628906, + "loss": 0.5739, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -11.366064071655273, + "rewards/margins": 3.457024574279785, + "rewards/rejected": -14.823086738586426, + "step": 11900 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 181.47531127929688, + "learning_rate": 1.669722222222222e-05, + "logits/chosen": -21.140504837036133, + "logits/rejected": -17.57772445678711, + "logps/chosen": -499.9806213378906, + "logps/rejected": -414.9298400878906, + "loss": 0.83, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -12.902310371398926, + "rewards/margins": 3.5421319007873535, + "rewards/rejected": -16.44444465637207, + "step": 12000 + }, + { + "epoch": 0.6666666666666666, + "eval_logits/chosen": -21.231950759887695, + "eval_logits/rejected": -18.7503719329834, + "eval_logps/chosen": -421.833740234375, + "eval_logps/rejected": -413.7301025390625, + "eval_loss": 0.9428097605705261, + "eval_rewards/accuracies": 0.7360000014305115, + "eval_rewards/chosen": -10.730157852172852, + "eval_rewards/margins": 3.295757293701172, + "eval_rewards/rejected": -14.025914192199707, + "eval_runtime": 514.3943, + "eval_samples_per_second": 3.888, + "eval_steps_per_second": 3.888, + "step": 12000 + }, + { + "epoch": 0.6722222222222223, + "grad_norm": 5.458330154418945, + "learning_rate": 1.6419444444444446e-05, + "logits/chosen": -20.68967056274414, + "logits/rejected": -17.7625789642334, + "logps/chosen": -477.7763977050781, + "logps/rejected": -451.9176940917969, + "loss": 0.5351, + "rewards/accuracies": 0.8600000143051147, + "rewards/chosen": -11.71245002746582, + "rewards/margins": 4.78226375579834, + "rewards/rejected": -16.494712829589844, + "step": 12100 + }, + { + "epoch": 0.6777777777777778, + "grad_norm": 0.8240537643432617, + "learning_rate": 1.6141666666666668e-05, + "logits/chosen": -21.084909439086914, + "logits/rejected": -18.897369384765625, + "logps/chosen": -402.53204345703125, + "logps/rejected": -375.0473327636719, + "loss": 1.4215, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -10.851410865783691, + "rewards/margins": 2.256866693496704, + "rewards/rejected": -13.1082763671875, + "step": 12200 + }, + { + "epoch": 0.6833333333333333, + "grad_norm": 0.034935299307107925, + "learning_rate": 1.586388888888889e-05, + "logits/chosen": -20.94774627685547, + "logits/rejected": -19.851974487304688, + "logps/chosen": -443.2183837890625, + "logps/rejected": -426.5263977050781, + "loss": 1.0076, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.41849136352539, + "rewards/margins": 2.4822630882263184, + "rewards/rejected": -13.900753021240234, + "step": 12300 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.2003341168165207, + "learning_rate": 1.5586111111111112e-05, + "logits/chosen": -22.319114685058594, + "logits/rejected": -19.663768768310547, + "logps/chosen": -482.7795715332031, + "logps/rejected": -444.809326171875, + "loss": 1.1242, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -10.44569206237793, + "rewards/margins": 3.099884271621704, + "rewards/rejected": -13.545577049255371, + "step": 12400 + }, + { + "epoch": 0.6944444444444444, + "grad_norm": 0.048337243497371674, + "learning_rate": 1.5308333333333334e-05, + "logits/chosen": -21.67813491821289, + "logits/rejected": -19.13730812072754, + "logps/chosen": -431.6572265625, + "logps/rejected": -405.55499267578125, + "loss": 0.7676, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -10.356959342956543, + "rewards/margins": 3.7439322471618652, + "rewards/rejected": -14.10089111328125, + "step": 12500 + }, + { + "epoch": 0.7, + "grad_norm": 87.86064147949219, + "learning_rate": 1.5030555555555556e-05, + "logits/chosen": -20.569547653198242, + "logits/rejected": -17.331565856933594, + "logps/chosen": -425.2514343261719, + "logps/rejected": -377.5094909667969, + "loss": 0.8172, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.045492172241211, + "rewards/margins": 3.5272183418273926, + "rewards/rejected": -14.572710037231445, + "step": 12600 + }, + { + "epoch": 0.7055555555555556, + "grad_norm": 1.1034353519789875e-05, + "learning_rate": 1.475277777777778e-05, + "logits/chosen": -20.142818450927734, + "logits/rejected": -18.60303497314453, + "logps/chosen": -470.3570556640625, + "logps/rejected": -485.6142578125, + "loss": 1.0245, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.037267684936523, + "rewards/margins": 3.5026137828826904, + "rewards/rejected": -16.539880752563477, + "step": 12700 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 6.376559734344482, + "learning_rate": 1.4475e-05, + "logits/chosen": -20.782583236694336, + "logits/rejected": -17.673080444335938, + "logps/chosen": -461.5556945800781, + "logps/rejected": -433.3602294921875, + "loss": 0.8861, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -12.721044540405273, + "rewards/margins": 3.551687717437744, + "rewards/rejected": -16.272733688354492, + "step": 12800 + }, + { + "epoch": 0.7166666666666667, + "grad_norm": 50.99307632446289, + "learning_rate": 1.4197222222222223e-05, + "logits/chosen": -21.353221893310547, + "logits/rejected": -19.589374542236328, + "logps/chosen": -402.0509033203125, + "logps/rejected": -413.9472351074219, + "loss": 0.975, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.283716201782227, + "rewards/margins": 3.2868738174438477, + "rewards/rejected": -14.570590019226074, + "step": 12900 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 169.88388061523438, + "learning_rate": 1.3922222222222223e-05, + "logits/chosen": -19.80286407470703, + "logits/rejected": -18.1820125579834, + "logps/chosen": -466.3910827636719, + "logps/rejected": -481.96844482421875, + "loss": 1.2182, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -14.61322021484375, + "rewards/margins": 3.726336717605591, + "rewards/rejected": -18.339557647705078, + "step": 13000 + }, + { + "epoch": 0.7222222222222222, + "eval_logits/chosen": -20.12559700012207, + "eval_logits/rejected": -17.831607818603516, + "eval_logps/chosen": -461.2136535644531, + "eval_logps/rejected": -457.02996826171875, + "eval_loss": 1.0792973041534424, + "eval_rewards/accuracies": 0.734000027179718, + "eval_rewards/chosen": -14.668147087097168, + "eval_rewards/margins": 3.6877522468566895, + "eval_rewards/rejected": -18.355897903442383, + "eval_runtime": 515.0682, + "eval_samples_per_second": 3.883, + "eval_steps_per_second": 3.883, + "step": 13000 + }, + { + "epoch": 0.7277777777777777, + "grad_norm": 78.8185806274414, + "learning_rate": 1.3644444444444445e-05, + "logits/chosen": -19.425329208374023, + "logits/rejected": -17.312776565551758, + "logps/chosen": -401.5394287109375, + "logps/rejected": -456.3423767089844, + "loss": 0.953, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -12.77175521850586, + "rewards/margins": 4.353198051452637, + "rewards/rejected": -17.124954223632812, + "step": 13100 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.13797318935394287, + "learning_rate": 1.3366666666666667e-05, + "logits/chosen": -21.982481002807617, + "logits/rejected": -19.057777404785156, + "logps/chosen": -467.70135498046875, + "logps/rejected": -460.4678955078125, + "loss": 0.6991, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -11.2850341796875, + "rewards/margins": 3.9855663776397705, + "rewards/rejected": -15.270599365234375, + "step": 13200 + }, + { + "epoch": 0.7388888888888889, + "grad_norm": 44.618019104003906, + "learning_rate": 1.3088888888888889e-05, + "logits/chosen": -21.93603515625, + "logits/rejected": -19.611677169799805, + "logps/chosen": -437.025634765625, + "logps/rejected": -460.77703857421875, + "loss": 0.9846, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -10.351505279541016, + "rewards/margins": 2.9802308082580566, + "rewards/rejected": -13.331737518310547, + "step": 13300 + }, + { + "epoch": 0.7444444444444445, + "grad_norm": 99.39557647705078, + "learning_rate": 1.2811111111111112e-05, + "logits/chosen": -20.339609146118164, + "logits/rejected": -18.70067596435547, + "logps/chosen": -419.6831970214844, + "logps/rejected": -411.19781494140625, + "loss": 1.1346, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -10.9735107421875, + "rewards/margins": 2.7434375286102295, + "rewards/rejected": -13.716948509216309, + "step": 13400 + }, + { + "epoch": 0.75, + "grad_norm": 0.06221761181950569, + "learning_rate": 1.2533333333333332e-05, + "logits/chosen": -19.273221969604492, + "logits/rejected": -16.66728401184082, + "logps/chosen": -471.1360168457031, + "logps/rejected": -412.35906982421875, + "loss": 0.8601, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -11.734038352966309, + "rewards/margins": 3.970507860183716, + "rewards/rejected": -15.704545974731445, + "step": 13500 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 26.5037841796875, + "learning_rate": 1.2255555555555556e-05, + "logits/chosen": -18.12467384338379, + "logits/rejected": -16.774181365966797, + "logps/chosen": -415.8067321777344, + "logps/rejected": -426.3531188964844, + "loss": 0.8413, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.49328899383545, + "rewards/margins": 3.7444074153900146, + "rewards/rejected": -16.237695693969727, + "step": 13600 + }, + { + "epoch": 0.7611111111111111, + "grad_norm": 0.0017695535207167268, + "learning_rate": 1.1977777777777778e-05, + "logits/chosen": -18.577011108398438, + "logits/rejected": -17.081361770629883, + "logps/chosen": -445.9018859863281, + "logps/rejected": -403.2713317871094, + "loss": 0.9961, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -13.018659591674805, + "rewards/margins": 3.142739772796631, + "rewards/rejected": -16.161399841308594, + "step": 13700 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 120.24114227294922, + "learning_rate": 1.1700000000000001e-05, + "logits/chosen": -20.942720413208008, + "logits/rejected": -17.959300994873047, + "logps/chosen": -476.0978088378906, + "logps/rejected": -438.0683288574219, + "loss": 0.7017, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -11.276473045349121, + "rewards/margins": 4.5048699378967285, + "rewards/rejected": -15.781344413757324, + "step": 13800 + }, + { + "epoch": 0.7722222222222223, + "grad_norm": 91.04949188232422, + "learning_rate": 1.1422222222222223e-05, + "logits/chosen": -21.5107421875, + "logits/rejected": -18.65487289428711, + "logps/chosen": -467.3624267578125, + "logps/rejected": -446.7812194824219, + "loss": 0.7725, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -12.59874153137207, + "rewards/margins": 3.995955228805542, + "rewards/rejected": -16.594697952270508, + "step": 13900 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.2415697710821405e-05, + "learning_rate": 1.1144444444444445e-05, + "logits/chosen": -20.914100646972656, + "logits/rejected": -17.760295867919922, + "logps/chosen": -458.08660888671875, + "logps/rejected": -403.4181213378906, + "loss": 1.0529, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.659071922302246, + "rewards/margins": 2.8984909057617188, + "rewards/rejected": -14.557561874389648, + "step": 14000 + }, + { + "epoch": 0.7777777777777778, + "eval_logits/chosen": -20.84697151184082, + "eval_logits/rejected": -18.357351303100586, + "eval_logps/chosen": -427.6429443359375, + "eval_logps/rejected": -422.7457580566406, + "eval_loss": 0.9920390844345093, + "eval_rewards/accuracies": 0.7384999990463257, + "eval_rewards/chosen": -11.311074256896973, + "eval_rewards/margins": 3.616405725479126, + "eval_rewards/rejected": -14.92747974395752, + "eval_runtime": 513.1399, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 3.898, + "step": 14000 + }, + { + "epoch": 0.7833333333333333, + "grad_norm": 154.7070770263672, + "learning_rate": 1.0866666666666667e-05, + "logits/chosen": -21.29863739013672, + "logits/rejected": -18.247251510620117, + "logps/chosen": -502.478759765625, + "logps/rejected": -462.8417663574219, + "loss": 1.026, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -11.932686805725098, + "rewards/margins": 3.9626734256744385, + "rewards/rejected": -15.895360946655273, + "step": 14100 + }, + { + "epoch": 0.7888888888888889, + "grad_norm": 0.22017847001552582, + "learning_rate": 1.058888888888889e-05, + "logits/chosen": -19.6921329498291, + "logits/rejected": -18.3222713470459, + "logps/chosen": -467.2056579589844, + "logps/rejected": -492.9352722167969, + "loss": 1.1251, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -13.240918159484863, + "rewards/margins": 3.780466318130493, + "rewards/rejected": -17.021385192871094, + "step": 14200 + }, + { + "epoch": 0.7944444444444444, + "grad_norm": 0.0003805610176641494, + "learning_rate": 1.031111111111111e-05, + "logits/chosen": -18.71148681640625, + "logits/rejected": -16.97392463684082, + "logps/chosen": -426.7343444824219, + "logps/rejected": -424.07501220703125, + "loss": 1.3862, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -14.3801908493042, + "rewards/margins": 3.363192558288574, + "rewards/rejected": -17.743385314941406, + "step": 14300 + }, + { + "epoch": 0.8, + "grad_norm": 0.04610005021095276, + "learning_rate": 1.0033333333333333e-05, + "logits/chosen": -19.151874542236328, + "logits/rejected": -16.206754684448242, + "logps/chosen": -455.8854675292969, + "logps/rejected": -439.97491455078125, + "loss": 1.5306, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -14.66149616241455, + "rewards/margins": 3.2972664833068848, + "rewards/rejected": -17.958763122558594, + "step": 14400 + }, + { + "epoch": 0.8055555555555556, + "grad_norm": 0.20745764672756195, + "learning_rate": 9.755555555555556e-06, + "logits/chosen": -20.90310287475586, + "logits/rejected": -18.054218292236328, + "logps/chosen": -509.6355895996094, + "logps/rejected": -482.0080871582031, + "loss": 1.0155, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.729355812072754, + "rewards/margins": 4.072160243988037, + "rewards/rejected": -17.801515579223633, + "step": 14500 + }, + { + "epoch": 0.8111111111111111, + "grad_norm": 200.8791961669922, + "learning_rate": 9.477777777777778e-06, + "logits/chosen": -20.012248992919922, + "logits/rejected": -19.31513023376465, + "logps/chosen": -445.0365295410156, + "logps/rejected": -478.9870300292969, + "loss": 1.0974, + "rewards/accuracies": 0.6800000071525574, + "rewards/chosen": -13.788928985595703, + "rewards/margins": 3.1558427810668945, + "rewards/rejected": -16.94477081298828, + "step": 14600 + }, + { + "epoch": 0.8166666666666667, + "grad_norm": 106.82402801513672, + "learning_rate": 9.2e-06, + "logits/chosen": -19.246234893798828, + "logits/rejected": -18.08530044555664, + "logps/chosen": -440.52911376953125, + "logps/rejected": -484.75726318359375, + "loss": 1.2672, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -13.885964393615723, + "rewards/margins": 4.022563457489014, + "rewards/rejected": -17.908527374267578, + "step": 14700 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 144.63417053222656, + "learning_rate": 8.922222222222222e-06, + "logits/chosen": -19.080158233642578, + "logits/rejected": -16.771650314331055, + "logps/chosen": -449.21014404296875, + "logps/rejected": -454.19415283203125, + "loss": 0.7222, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -14.682646751403809, + "rewards/margins": 4.202045440673828, + "rewards/rejected": -18.884693145751953, + "step": 14800 + }, + { + "epoch": 0.8277777777777777, + "grad_norm": 116.8870620727539, + "learning_rate": 8.644444444444445e-06, + "logits/chosen": -17.994277954101562, + "logits/rejected": -15.897448539733887, + "logps/chosen": -495.1518859863281, + "logps/rejected": -474.31561279296875, + "loss": 0.9327, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -15.393341064453125, + "rewards/margins": 4.086789608001709, + "rewards/rejected": -19.480131149291992, + "step": 14900 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.07545996457338333, + "learning_rate": 8.366666666666667e-06, + "logits/chosen": -19.65306854248047, + "logits/rejected": -17.02388572692871, + "logps/chosen": -522.7219848632812, + "logps/rejected": -519.5859985351562, + "loss": 0.7585, + "rewards/accuracies": 0.7900000214576721, + "rewards/chosen": -14.443299293518066, + "rewards/margins": 5.269186496734619, + "rewards/rejected": -19.712488174438477, + "step": 15000 + }, + { + "epoch": 0.8333333333333334, + "eval_logits/chosen": -18.504989624023438, + "eval_logits/rejected": -16.171443939208984, + "eval_logps/chosen": -471.3619079589844, + "eval_logps/rejected": -468.74993896484375, + "eval_loss": 1.0822033882141113, + "eval_rewards/accuracies": 0.7275000214576721, + "eval_rewards/chosen": -15.68297004699707, + "eval_rewards/margins": 3.84492826461792, + "eval_rewards/rejected": -19.527896881103516, + "eval_runtime": 513.1545, + "eval_samples_per_second": 3.897, + "eval_steps_per_second": 3.897, + "step": 15000 + }, + { + "epoch": 0.8388888888888889, + "grad_norm": 62.221466064453125, + "learning_rate": 8.08888888888889e-06, + "logits/chosen": -18.367977142333984, + "logits/rejected": -16.585079193115234, + "logps/chosen": -464.9667053222656, + "logps/rejected": -481.6781921386719, + "loss": 1.3595, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -15.07307243347168, + "rewards/margins": 2.712407112121582, + "rewards/rejected": -17.785480499267578, + "step": 15100 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 161.0553741455078, + "learning_rate": 7.811111111111113e-06, + "logits/chosen": -19.680564880371094, + "logits/rejected": -18.090286254882812, + "logps/chosen": -478.542236328125, + "logps/rejected": -479.900634765625, + "loss": 0.952, + "rewards/accuracies": 0.75, + "rewards/chosen": -14.784024238586426, + "rewards/margins": 3.371377944946289, + "rewards/rejected": -18.1554012298584, + "step": 15200 + }, + { + "epoch": 0.85, + "grad_norm": 115.31925201416016, + "learning_rate": 7.533333333333334e-06, + "logits/chosen": -19.634929656982422, + "logits/rejected": -16.501371383666992, + "logps/chosen": -481.2236022949219, + "logps/rejected": -434.7230224609375, + "loss": 1.0336, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -14.362470626831055, + "rewards/margins": 4.348694324493408, + "rewards/rejected": -18.711166381835938, + "step": 15300 + }, + { + "epoch": 0.8555555555555555, + "grad_norm": 6.737954616546631, + "learning_rate": 7.255555555555556e-06, + "logits/chosen": -19.425790786743164, + "logits/rejected": -17.327377319335938, + "logps/chosen": -509.2445373535156, + "logps/rejected": -517.9005737304688, + "loss": 1.0515, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -14.808392524719238, + "rewards/margins": 4.024547576904297, + "rewards/rejected": -18.83293914794922, + "step": 15400 + }, + { + "epoch": 0.8611111111111112, + "grad_norm": 132.69430541992188, + "learning_rate": 6.9777777777777775e-06, + "logits/chosen": -19.59749984741211, + "logits/rejected": -16.880125045776367, + "logps/chosen": -462.0685119628906, + "logps/rejected": -423.8826599121094, + "loss": 0.8662, + "rewards/accuracies": 0.7699999809265137, + "rewards/chosen": -14.050195693969727, + "rewards/margins": 4.8742594718933105, + "rewards/rejected": -18.924455642700195, + "step": 15500 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 160.39437866210938, + "learning_rate": 6.700000000000001e-06, + "logits/chosen": -18.782743453979492, + "logits/rejected": -16.216781616210938, + "logps/chosen": -481.4902038574219, + "logps/rejected": -454.8882141113281, + "loss": 1.0277, + "rewards/accuracies": 0.7300000190734863, + "rewards/chosen": -14.456459045410156, + "rewards/margins": 3.125344753265381, + "rewards/rejected": -17.581804275512695, + "step": 15600 + }, + { + "epoch": 0.8722222222222222, + "grad_norm": 0.4665509760379791, + "learning_rate": 6.422222222222223e-06, + "logits/chosen": -20.162229537963867, + "logits/rejected": -17.23107147216797, + "logps/chosen": -477.02581787109375, + "logps/rejected": -444.90985107421875, + "loss": 1.1166, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -12.746391296386719, + "rewards/margins": 3.2495129108428955, + "rewards/rejected": -15.995903015136719, + "step": 15700 + }, + { + "epoch": 0.8777777777777778, + "grad_norm": 47.69727325439453, + "learning_rate": 6.144444444444445e-06, + "logits/chosen": -19.23339080810547, + "logits/rejected": -17.244380950927734, + "logps/chosen": -425.1358337402344, + "logps/rejected": -441.84429931640625, + "loss": 0.8033, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.218267440795898, + "rewards/margins": 4.156614303588867, + "rewards/rejected": -16.3748836517334, + "step": 15800 + }, + { + "epoch": 0.8833333333333333, + "grad_norm": 129.9224853515625, + "learning_rate": 5.866666666666667e-06, + "logits/chosen": -18.405790328979492, + "logits/rejected": -15.977246284484863, + "logps/chosen": -441.53460693359375, + "logps/rejected": -413.96343994140625, + "loss": 1.1833, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.511604309082031, + "rewards/margins": 2.9923038482666016, + "rewards/rejected": -15.50390911102295, + "step": 15900 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 5.841636657714844, + "learning_rate": 5.588888888888889e-06, + "logits/chosen": -19.392654418945312, + "logits/rejected": -17.360950469970703, + "logps/chosen": -451.1518859863281, + "logps/rejected": -456.2735900878906, + "loss": 1.0311, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.161535263061523, + "rewards/margins": 3.8600049018859863, + "rewards/rejected": -16.02153968811035, + "step": 16000 + }, + { + "epoch": 0.8888888888888888, + "eval_logits/chosen": -19.68054962158203, + "eval_logits/rejected": -17.208925247192383, + "eval_logps/chosen": -436.6055908203125, + "eval_logps/rejected": -431.5029602050781, + "eval_loss": 0.9351751208305359, + "eval_rewards/accuracies": 0.7429999709129333, + "eval_rewards/chosen": -12.207340240478516, + "eval_rewards/margins": 3.59586238861084, + "eval_rewards/rejected": -15.803203582763672, + "eval_runtime": 513.0948, + "eval_samples_per_second": 3.898, + "eval_steps_per_second": 3.898, + "step": 16000 + }, + { + "epoch": 0.8944444444444445, + "grad_norm": 80.59151458740234, + "learning_rate": 5.311111111111111e-06, + "logits/chosen": -20.11895179748535, + "logits/rejected": -17.307449340820312, + "logps/chosen": -378.7174987792969, + "logps/rejected": -400.519287109375, + "loss": 0.6783, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -11.845584869384766, + "rewards/margins": 4.167884349822998, + "rewards/rejected": -16.013469696044922, + "step": 16100 + }, + { + "epoch": 0.9, + "grad_norm": 104.4501724243164, + "learning_rate": 5.033333333333334e-06, + "logits/chosen": -19.619365692138672, + "logits/rejected": -17.79317283630371, + "logps/chosen": -482.5891418457031, + "logps/rejected": -465.8780822753906, + "loss": 1.0629, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -13.3302640914917, + "rewards/margins": 2.675769090652466, + "rewards/rejected": -16.00603485107422, + "step": 16200 + }, + { + "epoch": 0.9055555555555556, + "grad_norm": 20.666343688964844, + "learning_rate": 4.755555555555556e-06, + "logits/chosen": -19.390151977539062, + "logits/rejected": -15.959493637084961, + "logps/chosen": -469.0502014160156, + "logps/rejected": -415.9452819824219, + "loss": 0.9635, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.088029861450195, + "rewards/margins": 3.508117914199829, + "rewards/rejected": -15.596147537231445, + "step": 16300 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.030232036486268044, + "learning_rate": 4.477777777777778e-06, + "logits/chosen": -18.614093780517578, + "logits/rejected": -18.804079055786133, + "logps/chosen": -431.15155029296875, + "logps/rejected": -498.12359619140625, + "loss": 1.0989, + "rewards/accuracies": 0.7200000286102295, + "rewards/chosen": -13.811687469482422, + "rewards/margins": 2.9138965606689453, + "rewards/rejected": -16.725584030151367, + "step": 16400 + }, + { + "epoch": 0.9166666666666666, + "grad_norm": 67.77301788330078, + "learning_rate": 4.2000000000000004e-06, + "logits/chosen": -19.063812255859375, + "logits/rejected": -16.122283935546875, + "logps/chosen": -440.4454345703125, + "logps/rejected": -404.6186218261719, + "loss": 0.7615, + "rewards/accuracies": 0.7599999904632568, + "rewards/chosen": -12.809778213500977, + "rewards/margins": 4.089334011077881, + "rewards/rejected": -16.899110794067383, + "step": 16500 + }, + { + "epoch": 0.9222222222222223, + "grad_norm": 43.06341552734375, + "learning_rate": 3.922222222222222e-06, + "logits/chosen": -19.97921371459961, + "logits/rejected": -15.873488426208496, + "logps/chosen": -479.7019958496094, + "logps/rejected": -390.3157043457031, + "loss": 1.0142, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -12.450453758239746, + "rewards/margins": 2.622169256210327, + "rewards/rejected": -15.072624206542969, + "step": 16600 + }, + { + "epoch": 0.9277777777777778, + "grad_norm": 20.9642333984375, + "learning_rate": 3.6444444444444446e-06, + "logits/chosen": -19.42534637451172, + "logits/rejected": -18.331153869628906, + "logps/chosen": -439.32354736328125, + "logps/rejected": -463.80328369140625, + "loss": 0.9979, + "rewards/accuracies": 0.75, + "rewards/chosen": -12.121214866638184, + "rewards/margins": 3.0654172897338867, + "rewards/rejected": -15.186631202697754, + "step": 16700 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 3.745821237564087, + "learning_rate": 3.3666666666666665e-06, + "logits/chosen": -19.767005920410156, + "logits/rejected": -18.188875198364258, + "logps/chosen": -438.71429443359375, + "logps/rejected": -488.7001647949219, + "loss": 0.8824, + "rewards/accuracies": 0.6899999976158142, + "rewards/chosen": -11.744170188903809, + "rewards/margins": 3.699794292449951, + "rewards/rejected": -15.443962097167969, + "step": 16800 + }, + { + "epoch": 0.9388888888888889, + "grad_norm": 54.07151794433594, + "learning_rate": 3.088888888888889e-06, + "logits/chosen": -20.854005813598633, + "logits/rejected": -17.623567581176758, + "logps/chosen": -520.1600952148438, + "logps/rejected": -449.6727294921875, + "loss": 1.0675, + "rewards/accuracies": 0.7099999785423279, + "rewards/chosen": -12.64111614227295, + "rewards/margins": 2.9364287853240967, + "rewards/rejected": -15.577545166015625, + "step": 16900 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 1.7077102661132812, + "learning_rate": 2.811111111111111e-06, + "logits/chosen": -18.75611114501953, + "logits/rejected": -17.589269638061523, + "logps/chosen": -387.24456787109375, + "logps/rejected": -420.2656555175781, + "loss": 0.6916, + "rewards/accuracies": 0.7400000095367432, + "rewards/chosen": -12.266399383544922, + "rewards/margins": 3.452897548675537, + "rewards/rejected": -15.719295501708984, + "step": 17000 + }, + { + "epoch": 0.9444444444444444, + "eval_logits/chosen": -19.807449340820312, + "eval_logits/rejected": -17.341161727905273, + "eval_logps/chosen": -436.4026184082031, + "eval_logps/rejected": -430.0996398925781, + "eval_loss": 0.9044573307037354, + "eval_rewards/accuracies": 0.746999979019165, + "eval_rewards/chosen": -12.187041282653809, + "eval_rewards/margins": 3.4758214950561523, + "eval_rewards/rejected": -15.662864685058594, + "eval_runtime": 513.5551, + "eval_samples_per_second": 3.894, + "eval_steps_per_second": 3.894, + "step": 17000 + } + ], + "logging_steps": 100, + "max_steps": 18000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}