{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 1.6346618384122849, "epoch": 0.050314465408805034, "grad_norm": 3.734375, "learning_rate": 5e-05, "logits/chosen": -0.7411658949470903, "logits/rejected": -0.20507352810005436, "logps/chosen": -152.59497356414795, "logps/rejected": -195.90787506103516, "loss": 0.6931471824645996, "mean_token_accuracy": 0.562163695693016, "num_tokens": 6930.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "entropy": 1.4584085196256638, "epoch": 0.10062893081761007, "grad_norm": 3.40625, "learning_rate": 4.9166666666666665e-05, "logits/chosen": -0.9251237438184291, "logits/rejected": -0.42817166651451355, "logps/chosen": -153.27350616455078, "logps/rejected": -180.06759071350098, "loss": 0.5006560683250427, "mean_token_accuracy": 0.583746887743473, "num_tokens": 12841.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.23006458766758442, "rewards/margins": 0.4357452392578125, "rewards/rejected": -0.20568065904080868, "step": 2 }, { "entropy": 1.4456398040056229, "epoch": 0.1509433962264151, "grad_norm": 2.609375, "learning_rate": 4.8333333333333334e-05, "logits/chosen": -1.0732471831941308, "logits/rejected": -0.5970107323243525, "logps/chosen": -146.46685123443604, "logps/rejected": -187.66933250427246, "loss": 0.3808254599571228, "mean_token_accuracy": 0.6039908826351166, "num_tokens": 18816.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34212866611778736, "rewards/margins": 0.7833537347614765, "rewards/rejected": -0.4412250593304634, "step": 3 }, { "entropy": 1.5161051750183105, "epoch": 0.20125786163522014, "grad_norm": 2.15625, "learning_rate": 4.75e-05, "logits/chosen": -0.9412079692678976, "logits/rejected": -0.4002159728435217, "logps/chosen": -118.29372596740723, "logps/rejected": -182.18238067626953, "loss": 0.34213775396347046, "mean_token_accuracy": 0.5912635922431946, "num_tokens": 25522.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3642648714594543, "rewards/margins": 0.954810731112957, "rewards/rejected": -0.5905458554625511, "step": 4 }, { "entropy": 1.4330662935972214, "epoch": 0.25157232704402516, "grad_norm": 1.7421875, "learning_rate": 4.666666666666667e-05, "logits/chosen": -1.2388932583587922, "logits/rejected": -0.6178736694127079, "logps/chosen": -118.57859134674072, "logps/rejected": -193.721284866333, "loss": 0.21044230461120605, "mean_token_accuracy": 0.5765212289988995, "num_tokens": 32576.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.45518894493579865, "rewards/margins": 1.520921140909195, "rewards/rejected": -1.065732218325138, "step": 5 }, { "entropy": 1.4573922604322433, "epoch": 0.3018867924528302, "grad_norm": 1.484375, "learning_rate": 4.5833333333333334e-05, "logits/chosen": -1.2737800530058234, "logits/rejected": -0.5034645169102513, "logps/chosen": -120.18647003173828, "logps/rejected": -203.72421646118164, "loss": 0.17699970304965973, "mean_token_accuracy": 0.5985999256372452, "num_tokens": 39703.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.41289406828582287, "rewards/margins": 1.7254902124404907, "rewards/rejected": -1.3125961422920227, "step": 6 }, { "entropy": 1.2998351603746414, "epoch": 0.3522012578616352, "grad_norm": 1.546875, "learning_rate": 4.5e-05, "logits/chosen": -1.3130657002820167, "logits/rejected": -0.6094413558435448, "logps/chosen": -108.3181095123291, "logps/rejected": -194.44207382202148, "loss": 0.18618769943714142, "mean_token_accuracy": 0.6349928826093674, "num_tokens": 47481.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.26029996760189533, "rewards/margins": 1.7353856414556503, "rewards/rejected": -1.4750856757164001, "step": 7 }, { "entropy": 1.354924201965332, "epoch": 0.4025157232704403, "grad_norm": 1.015625, "learning_rate": 4.4166666666666665e-05, "logits/chosen": -1.437566920334179, "logits/rejected": -1.0028015186250743, "logps/chosen": -137.25361728668213, "logps/rejected": -203.34245681762695, "loss": 0.1016073077917099, "mean_token_accuracy": 0.6037986874580383, "num_tokens": 54551.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6320773344486952, "rewards/margins": 2.5685721784830093, "rewards/rejected": -1.9364948570728302, "step": 8 }, { "entropy": 1.3007782101631165, "epoch": 0.4528301886792453, "grad_norm": 0.6796875, "learning_rate": 4.3333333333333334e-05, "logits/chosen": -1.4337730887270794, "logits/rejected": -0.9420388615307833, "logps/chosen": -133.88286685943604, "logps/rejected": -215.5273036956787, "loss": 0.06624128669500351, "mean_token_accuracy": 0.6150126904249191, "num_tokens": 61082.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7177156624384224, "rewards/margins": 3.2935406416654587, "rewards/rejected": -2.5758249759674072, "step": 9 }, { "entropy": 1.3091522455215454, "epoch": 0.5031446540880503, "grad_norm": 0.515625, "learning_rate": 4.25e-05, "logits/chosen": -1.7237460889408442, "logits/rejected": -1.1708205992219858, "logps/chosen": -114.79162216186523, "logps/rejected": -216.33137321472168, "loss": 0.03747009485960007, "mean_token_accuracy": 0.5977272689342499, "num_tokens": 68063.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7877620831131935, "rewards/margins": 4.083773195743561, "rewards/rejected": -3.296011045575142, "step": 10 }, { "entropy": 1.2404007613658905, "epoch": 0.5534591194968553, "grad_norm": 0.259765625, "learning_rate": 4.166666666666667e-05, "logits/chosen": -1.655923360923379, "logits/rejected": -1.1929389227396723, "logps/chosen": -137.04386043548584, "logps/rejected": -223.17731285095215, "loss": 0.019634969532489777, "mean_token_accuracy": 0.615639328956604, "num_tokens": 73558.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1441535080084577, "rewards/margins": 4.826931297779083, "rewards/rejected": -3.6827778220176697, "step": 11 }, { "entropy": 1.2675090283155441, "epoch": 0.6037735849056604, "grad_norm": 0.16796875, "learning_rate": 4.0833333333333334e-05, "logits/chosen": -1.738670325722853, "logits/rejected": -1.2716987398570991, "logps/chosen": -133.57194137573242, "logps/rejected": -223.28443908691406, "loss": 0.013121644034981728, "mean_token_accuracy": 0.6074652224779129, "num_tokens": 79421.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0685334280133247, "rewards/margins": 5.048604816198349, "rewards/rejected": -3.980071395635605, "step": 12 }, { "entropy": 1.221432313323021, "epoch": 0.6540880503144654, "grad_norm": 0.466796875, "learning_rate": 4e-05, "logits/chosen": -1.8638311019017777, "logits/rejected": -1.3540506586835501, "logps/chosen": -130.8041524887085, "logps/rejected": -240.23817443847656, "loss": 0.029251903295516968, "mean_token_accuracy": 0.6208610609173775, "num_tokens": 86682.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4837151520187035, "rewards/margins": 5.231908708810806, "rewards/rejected": -4.748193636536598, "step": 13 }, { "entropy": 1.179955169558525, "epoch": 0.7044025157232704, "grad_norm": 0.263671875, "learning_rate": 3.9166666666666665e-05, "logits/chosen": -1.714330251919895, "logits/rejected": -1.4322710140173394, "logps/chosen": -134.82413864135742, "logps/rejected": -221.86817169189453, "loss": 0.019698552787303925, "mean_token_accuracy": 0.6337382346391678, "num_tokens": 93035.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.23185482621192932, "rewards/margins": 5.084479838609695, "rewards/rejected": -4.852624952793121, "step": 14 }, { "entropy": 1.1727432310581207, "epoch": 0.7547169811320755, "grad_norm": 0.046142578125, "learning_rate": 3.8333333333333334e-05, "logits/chosen": -1.907136892939249, "logits/rejected": -1.514815267146274, "logps/chosen": -132.96124839782715, "logps/rejected": -249.50230979919434, "loss": 0.002145277801901102, "mean_token_accuracy": 0.6479089632630348, "num_tokens": 98572.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1812188103795052, "rewards/margins": 7.1862099170684814, "rewards/rejected": -6.004990994930267, "step": 15 }, { "entropy": 1.2092190980911255, "epoch": 0.8050314465408805, "grad_norm": 0.203125, "learning_rate": 3.7500000000000003e-05, "logits/chosen": -1.8553556408587935, "logits/rejected": -1.4260795074385058, "logps/chosen": -139.758376121521, "logps/rejected": -248.12162590026855, "loss": 0.015183546580374241, "mean_token_accuracy": 0.6128971949219704, "num_tokens": 106014.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.30653896857984364, "rewards/margins": 6.243123233318329, "rewards/rejected": -5.9365842044353485, "step": 16 }, { "entropy": 1.2008470296859741, "epoch": 0.8553459119496856, "grad_norm": 0.134765625, "learning_rate": 3.6666666666666666e-05, "logits/chosen": -1.9781900924423292, "logits/rejected": -1.3513694447951352, "logps/chosen": -127.7047929763794, "logps/rejected": -251.46310424804688, "loss": 0.005312850698828697, "mean_token_accuracy": 0.6195648014545441, "num_tokens": 112297.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.948453530203551, "rewards/margins": 7.2391074895858765, "rewards/rejected": -6.290653884410858, "step": 17 }, { "entropy": 1.1938088834285736, "epoch": 0.9056603773584906, "grad_norm": 0.0283203125, "learning_rate": 3.5833333333333335e-05, "logits/chosen": -2.0970170230360243, "logits/rejected": -1.6893045219197895, "logps/chosen": -128.71086406707764, "logps/rejected": -248.6879997253418, "loss": 0.0016415835125371814, "mean_token_accuracy": 0.6066833287477493, "num_tokens": 118259.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6370860114693642, "rewards/margins": 7.347302317619324, "rewards/rejected": -6.710216283798218, "step": 18 }, { "entropy": 1.1347751468420029, "epoch": 0.9559748427672956, "grad_norm": 0.021240234375, "learning_rate": 3.5e-05, "logits/chosen": -2.174784405399738, "logits/rejected": -1.8171304190186441, "logps/chosen": -137.3084011077881, "logps/rejected": -237.20912742614746, "loss": 0.0013000022154301405, "mean_token_accuracy": 0.6105500273406506, "num_tokens": 123618.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1855077669024467, "rewards/margins": 7.6381800174713135, "rewards/rejected": -6.452672183513641, "step": 19 }, { "entropy": 1.2229801756995065, "epoch": 1.0, "grad_norm": 0.034912109375, "learning_rate": 3.4166666666666666e-05, "logits/chosen": -1.9402861332224905, "logits/rejected": -1.478508916635409, "logps/chosen": -140.62159075055803, "logps/rejected": -258.58056640625, "loss": 0.0018078959546983242, "mean_token_accuracy": 0.6002635615212577, "num_tokens": 130000.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.18390404965196336, "rewards/margins": 7.131651401519775, "rewards/rejected": -7.31555564062936, "step": 20 }, { "entropy": 1.1338201016187668, "epoch": 1.050314465408805, "grad_norm": 0.00341796875, "learning_rate": 3.3333333333333335e-05, "logits/chosen": -2.2381289981693855, "logits/rejected": -1.7685196120349025, "logps/chosen": -132.71670532226562, "logps/rejected": -270.32165908813477, "loss": 0.00017723625933285803, "mean_token_accuracy": 0.640245221555233, "num_tokens": 135799.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0271762115880847, "rewards/margins": 9.069988369941711, "rewards/rejected": -8.04281210899353, "step": 21 }, { "entropy": 1.184071958065033, "epoch": 1.10062893081761, "grad_norm": 0.06201171875, "learning_rate": 3.2500000000000004e-05, "logits/chosen": -2.2030959691219643, "logits/rejected": -1.549622772314418, "logps/chosen": -137.2278184890747, "logps/rejected": -278.6941947937012, "loss": 0.002708145882934332, "mean_token_accuracy": 0.5984649807214737, "num_tokens": 142686.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.10265790205448866, "rewards/margins": 8.331138014793396, "rewards/rejected": -8.228479981422424, "step": 22 }, { "entropy": 1.1163400262594223, "epoch": 1.150943396226415, "grad_norm": 0.010498046875, "learning_rate": 3.1666666666666666e-05, "logits/chosen": -2.2692944342560915, "logits/rejected": -1.9815728330371412, "logps/chosen": -121.65661811828613, "logps/rejected": -251.65609550476074, "loss": 0.0006153068970888853, "mean_token_accuracy": 0.5803752839565277, "num_tokens": 148888.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6473507843911648, "rewards/margins": 8.257275819778442, "rewards/rejected": -7.609925091266632, "step": 23 }, { "entropy": 1.2022811472415924, "epoch": 1.20125786163522, "grad_norm": 0.0196533203125, "learning_rate": 3.0833333333333335e-05, "logits/chosen": -2.0684997205168574, "logits/rejected": -1.5187979793505884, "logps/chosen": -142.0221881866455, "logps/rejected": -288.0162467956543, "loss": 0.0010658408282324672, "mean_token_accuracy": 0.6041584983468056, "num_tokens": 155066.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3129555657505989, "rewards/margins": 9.287788093090057, "rewards/rejected": -8.974832653999329, "step": 24 }, { "entropy": 1.0715540498495102, "epoch": 1.251572327044025, "grad_norm": 0.0152587890625, "learning_rate": 3e-05, "logits/chosen": -2.253688589577229, "logits/rejected": -1.709912522981535, "logps/chosen": -124.24794864654541, "logps/rejected": -268.31462478637695, "loss": 0.0007783680921420455, "mean_token_accuracy": 0.6487728357315063, "num_tokens": 161896.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5254653260344639, "rewards/margins": 8.636412143707275, "rewards/rejected": -8.110946834087372, "step": 25 }, { "entropy": 1.1292345225811005, "epoch": 1.3018867924528301, "grad_norm": 0.01611328125, "learning_rate": 2.916666666666667e-05, "logits/chosen": -2.03905456801847, "logits/rejected": -1.653255032290945, "logps/chosen": -121.39779376983643, "logps/rejected": -255.01618576049805, "loss": 0.0008846853161230683, "mean_token_accuracy": 0.622931219637394, "num_tokens": 168755.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.2767142332158983, "rewards/margins": 8.121429800987244, "rewards/rejected": -7.844715654850006, "step": 26 }, { "entropy": 1.107276238501072, "epoch": 1.3522012578616351, "grad_norm": 0.0076904296875, "learning_rate": 2.8333333333333335e-05, "logits/chosen": -2.286326659906524, "logits/rejected": -1.9377510490705079, "logps/chosen": -144.30573749542236, "logps/rejected": -272.37299728393555, "loss": 0.00033011281630024314, "mean_token_accuracy": 0.619342528283596, "num_tokens": 175381.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9377570524811745, "rewards/margins": 9.561910688877106, "rewards/rejected": -8.6241534948349, "step": 27 }, { "entropy": 1.157460778951645, "epoch": 1.4025157232704402, "grad_norm": 0.0047607421875, "learning_rate": 2.7500000000000004e-05, "logits/chosen": -2.1326494507814266, "logits/rejected": -1.8268033175884986, "logps/chosen": -125.0985517501831, "logps/rejected": -274.5887870788574, "loss": 0.0002390609442954883, "mean_token_accuracy": 0.620079979300499, "num_tokens": 181771.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8465396109968424, "rewards/margins": 9.534583747386932, "rewards/rejected": -8.688044130802155, "step": 28 }, { "entropy": 1.1201048269867897, "epoch": 1.4528301886792452, "grad_norm": 0.0093994140625, "learning_rate": 2.6666666666666667e-05, "logits/chosen": -2.311015681209517, "logits/rejected": -2.0115759556684885, "logps/chosen": -140.10973072052002, "logps/rejected": -260.4351615905762, "loss": 0.0005316605675034225, "mean_token_accuracy": 0.6181787773966789, "num_tokens": 188644.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.1467662937939167, "rewards/margins": 8.486627459526062, "rewards/rejected": -8.633393704891205, "step": 29 }, { "entropy": 1.1074048355221748, "epoch": 1.5031446540880502, "grad_norm": 0.024658203125, "learning_rate": 2.5833333333333336e-05, "logits/chosen": -2.2242952052635343, "logits/rejected": -1.9084164743872978, "logps/chosen": -120.53641414642334, "logps/rejected": -248.61384201049805, "loss": 0.0015231292927637696, "mean_token_accuracy": 0.6136712729930878, "num_tokens": 196391.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.0413979523582384, "rewards/margins": 8.107804000377655, "rewards/rejected": -8.149202108383179, "step": 30 }, { "entropy": 1.0835556164383888, "epoch": 1.5534591194968552, "grad_norm": 0.0308837890625, "learning_rate": 2.5e-05, "logits/chosen": -2.2358070307390454, "logits/rejected": -1.8015257262782527, "logps/chosen": -160.49627590179443, "logps/rejected": -286.36713790893555, "loss": 0.0012202183715999126, "mean_token_accuracy": 0.6140667796134949, "num_tokens": 203518.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.12220276962034404, "rewards/margins": 8.978903114795685, "rewards/rejected": -9.10110592842102, "step": 31 }, { "entropy": 1.1327729299664497, "epoch": 1.6037735849056602, "grad_norm": 0.0078125, "learning_rate": 2.4166666666666667e-05, "logits/chosen": -2.222914372779498, "logits/rejected": -1.8115026450741711, "logps/chosen": -143.74180603027344, "logps/rejected": -296.8906936645508, "loss": 0.0003141126944683492, "mean_token_accuracy": 0.6098875515162945, "num_tokens": 209850.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1798817589879036, "rewards/margins": 10.448372840881348, "rewards/rejected": -10.268491089344025, "step": 32 }, { "entropy": 1.125333271920681, "epoch": 1.6540880503144653, "grad_norm": 0.017578125, "learning_rate": 2.3333333333333336e-05, "logits/chosen": -2.4135272067915143, "logits/rejected": -1.7676647261719525, "logps/chosen": -125.86479663848877, "logps/rejected": -266.06410026550293, "loss": 0.0005237428122200072, "mean_token_accuracy": 0.6229566335678101, "num_tokens": 215637.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7084813113324344, "rewards/margins": 9.689094841480255, "rewards/rejected": -8.980613589286804, "step": 33 }, { "entropy": 1.058401882648468, "epoch": 1.7044025157232703, "grad_norm": 0.007354736328125, "learning_rate": 2.25e-05, "logits/chosen": -2.46378613488542, "logits/rejected": -2.1071431291236653, "logps/chosen": -125.6760368347168, "logps/rejected": -258.0559844970703, "loss": 0.00038854233571328223, "mean_token_accuracy": 0.6303330659866333, "num_tokens": 221051.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3429398629814386, "rewards/margins": 10.128098785877228, "rewards/rejected": -8.785158812999725, "step": 34 }, { "entropy": 1.1321996748447418, "epoch": 1.7547169811320755, "grad_norm": 0.0030670166015625, "learning_rate": 2.1666666666666667e-05, "logits/chosen": -2.3747636222967063, "logits/rejected": -2.115993388323722, "logps/chosen": -131.67159175872803, "logps/rejected": -276.36437797546387, "loss": 0.00012030061770929024, "mean_token_accuracy": 0.6251056790351868, "num_tokens": 226426.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8489564098417759, "rewards/margins": 10.621768474578857, "rewards/rejected": -9.772812008857727, "step": 35 }, { "entropy": 1.116398274898529, "epoch": 1.8050314465408805, "grad_norm": 0.0181884765625, "learning_rate": 2.0833333333333336e-05, "logits/chosen": -2.4568723077785557, "logits/rejected": -1.9738618373268513, "logps/chosen": -117.32646560668945, "logps/rejected": -272.7527599334717, "loss": 0.0008142158621922135, "mean_token_accuracy": 0.6250453069806099, "num_tokens": 233419.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3417433723807335, "rewards/margins": 9.766010344028473, "rewards/rejected": -9.424266993999481, "step": 36 }, { "entropy": 1.0965562090277672, "epoch": 1.8553459119496856, "grad_norm": 0.0027313232421875, "learning_rate": 2e-05, "logits/chosen": -2.3551532120785383, "logits/rejected": -1.9812547363037738, "logps/chosen": -144.9142713546753, "logps/rejected": -296.3636951446533, "loss": 0.00013589797890745103, "mean_token_accuracy": 0.5917238146066666, "num_tokens": 240673.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.10223913192749023, "rewards/margins": 10.127011775970459, "rewards/rejected": -10.22925090789795, "step": 37 }, { "entropy": 1.088953472673893, "epoch": 1.9056603773584906, "grad_norm": 0.0032806396484375, "learning_rate": 1.9166666666666667e-05, "logits/chosen": -2.326897647347544, "logits/rejected": -2.0923411183004155, "logps/chosen": -141.62347412109375, "logps/rejected": -285.8790645599365, "loss": 0.0001322527095908299, "mean_token_accuracy": 0.6075941771268845, "num_tokens": 245974.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7096749469637871, "rewards/margins": 10.769983649253845, "rewards/rejected": -10.060308814048767, "step": 38 }, { "entropy": 1.1708777844905853, "epoch": 1.9559748427672956, "grad_norm": 0.003997802734375, "learning_rate": 1.8333333333333333e-05, "logits/chosen": -2.1918049696005726, "logits/rejected": -1.7380488443097255, "logps/chosen": -127.21480464935303, "logps/rejected": -273.00823402404785, "loss": 0.00020453613251447678, "mean_token_accuracy": 0.6081509068608284, "num_tokens": 253184.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.09920912818051875, "rewards/margins": 9.214752078056335, "rewards/rejected": -9.313961207866669, "step": 39 }, { "entropy": 1.1617241672107153, "epoch": 2.0, "grad_norm": 0.0191650390625, "learning_rate": 1.75e-05, "logits/chosen": -2.2749118295696342, "logits/rejected": -1.7584887991939784, "logps/chosen": -154.0042724609375, "logps/rejected": -292.14568001883373, "loss": 0.0009170390549115837, "mean_token_accuracy": 0.5644707764898028, "num_tokens": 260000.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.44687261964593616, "rewards/margins": 9.13522618157523, "rewards/rejected": -9.582098756517683, "step": 40 }, { "entropy": 1.1219489574432373, "epoch": 2.050314465408805, "grad_norm": 0.00396728515625, "learning_rate": 1.6666666666666667e-05, "logits/chosen": -2.3933433594531426, "logits/rejected": -1.8782996338947386, "logps/chosen": -121.1051197052002, "logps/rejected": -273.1605224609375, "loss": 0.00018580301548354328, "mean_token_accuracy": 0.6062684953212738, "num_tokens": 266060.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5098631188739091, "rewards/margins": 9.81256103515625, "rewards/rejected": -9.302697837352753, "step": 41 }, { "entropy": 1.142582356929779, "epoch": 2.10062893081761, "grad_norm": 0.00506591796875, "learning_rate": 1.5833333333333333e-05, "logits/chosen": -2.3932424547453, "logits/rejected": -2.034844354918984, "logps/chosen": -132.19951725006104, "logps/rejected": -273.3875484466553, "loss": 0.00027762039098888636, "mean_token_accuracy": 0.6254889070987701, "num_tokens": 272769.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.027547355741262436, "rewards/margins": 9.588396728038788, "rewards/rejected": -9.560849368572235, "step": 42 }, { "entropy": 1.0831276252865791, "epoch": 2.150943396226415, "grad_norm": 0.0201416015625, "learning_rate": 1.5e-05, "logits/chosen": -2.3213323581687466, "logits/rejected": -1.9188541617869985, "logps/chosen": -140.96678638458252, "logps/rejected": -298.01341819763184, "loss": 0.0006403317674994469, "mean_token_accuracy": 0.6212100088596344, "num_tokens": 279603.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.13022289611399174, "rewards/margins": 10.482895195484161, "rewards/rejected": -10.61311811208725, "step": 43 }, { "entropy": 1.1165131032466888, "epoch": 2.20125786163522, "grad_norm": 0.004119873046875, "learning_rate": 1.4166666666666668e-05, "logits/chosen": -2.325539811977018, "logits/rejected": -1.819132333499079, "logps/chosen": -137.1863489151001, "logps/rejected": -280.5503120422363, "loss": 0.00018718022329267114, "mean_token_accuracy": 0.6205575913190842, "num_tokens": 286436.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.0954840648919344, "rewards/margins": 9.663193047046661, "rewards/rejected": -9.567708969116211, "step": 44 }, { "entropy": 1.0798147842288017, "epoch": 2.251572327044025, "grad_norm": 0.006195068359375, "learning_rate": 1.3333333333333333e-05, "logits/chosen": -2.462720523269614, "logits/rejected": -1.9150426029399608, "logps/chosen": -137.90959072113037, "logps/rejected": -299.62617683410645, "loss": 0.00032327763619832695, "mean_token_accuracy": 0.6055086851119995, "num_tokens": 294223.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.0803464986383915, "rewards/margins": 10.386663496494293, "rewards/rejected": -10.467009961605072, "step": 45 }, { "entropy": 1.0406965985894203, "epoch": 2.30188679245283, "grad_norm": 0.00360107421875, "learning_rate": 1.25e-05, "logits/chosen": -2.351859813790029, "logits/rejected": -2.079019230872164, "logps/chosen": -114.20672798156738, "logps/rejected": -270.77587890625, "loss": 0.0001596831134520471, "mean_token_accuracy": 0.6743896827101707, "num_tokens": 299634.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2796257368754596, "rewards/margins": 10.65468156337738, "rewards/rejected": -9.375055730342865, "step": 46 }, { "entropy": 1.1319249421358109, "epoch": 2.352201257861635, "grad_norm": 0.00323486328125, "learning_rate": 1.1666666666666668e-05, "logits/chosen": -2.3348495432588066, "logits/rejected": -1.8859447778318736, "logps/chosen": -162.34913635253906, "logps/rejected": -305.2253665924072, "loss": 0.00012244780373293906, "mean_token_accuracy": 0.5992361158132553, "num_tokens": 305920.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.0735319098457694, "rewards/margins": 10.709958910942078, "rewards/rejected": -10.783490896224976, "step": 47 }, { "entropy": 1.1012349873781204, "epoch": 2.40251572327044, "grad_norm": 0.00555419921875, "learning_rate": 1.0833333333333334e-05, "logits/chosen": -2.376434497997167, "logits/rejected": -1.9649504169742804, "logps/chosen": -134.29817581176758, "logps/rejected": -269.4762592315674, "loss": 0.00029120981344021857, "mean_token_accuracy": 0.615195669233799, "num_tokens": 313346.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.04450349509716034, "rewards/margins": 8.847583532333374, "rewards/rejected": -8.803080201148987, "step": 48 }, { "entropy": 1.0822330936789513, "epoch": 2.452830188679245, "grad_norm": 0.004486083984375, "learning_rate": 1e-05, "logits/chosen": -2.4181823111443506, "logits/rejected": -1.9432117626083543, "logps/chosen": -139.22668647766113, "logps/rejected": -286.98141860961914, "loss": 0.00024342790129594505, "mean_token_accuracy": 0.5807452276349068, "num_tokens": 320662.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.20274513261392713, "rewards/margins": 9.553876638412476, "rewards/rejected": -9.756621778011322, "step": 49 }, { "entropy": 1.1255912110209465, "epoch": 2.50314465408805, "grad_norm": 0.01043701171875, "learning_rate": 9.166666666666666e-06, "logits/chosen": -2.390428515839434, "logits/rejected": -2.070867890248319, "logps/chosen": -152.57851219177246, "logps/rejected": -278.1038990020752, "loss": 0.0005284082726575434, "mean_token_accuracy": 0.5914618484675884, "num_tokens": 326611.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.24946272000670433, "rewards/margins": 9.3264040350914, "rewards/rejected": -9.076941430568695, "step": 50 }, { "entropy": 1.10995664447546, "epoch": 2.5534591194968552, "grad_norm": 0.005950927734375, "learning_rate": 8.333333333333334e-06, "logits/chosen": -2.3087954809493545, "logits/rejected": -1.9792274410316972, "logps/chosen": -116.59206485748291, "logps/rejected": -267.238338470459, "loss": 0.00025631688185967505, "mean_token_accuracy": 0.625831313431263, "num_tokens": 332814.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8696157680824399, "rewards/margins": 10.124218106269836, "rewards/rejected": -9.254602372646332, "step": 51 }, { "entropy": 1.117017239332199, "epoch": 2.6037735849056602, "grad_norm": 0.00152587890625, "learning_rate": 7.5e-06, "logits/chosen": -2.4137662915261817, "logits/rejected": -1.9523699949115683, "logps/chosen": -120.80679893493652, "logps/rejected": -280.64561653137207, "loss": 7.924844976514578e-05, "mean_token_accuracy": 0.6106409505009651, "num_tokens": 339547.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.790530975908041, "rewards/margins": 10.681683659553528, "rewards/rejected": -9.891152799129486, "step": 52 }, { "entropy": 1.1054791137576103, "epoch": 2.6540880503144653, "grad_norm": 0.0164794921875, "learning_rate": 6.666666666666667e-06, "logits/chosen": -2.273507778205582, "logits/rejected": -1.9109036313937824, "logps/chosen": -115.94961833953857, "logps/rejected": -269.0294952392578, "loss": 0.0007658082176931202, "mean_token_accuracy": 0.5990233793854713, "num_tokens": 346647.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.25752640701830387, "rewards/margins": 9.45040088891983, "rewards/rejected": -9.192874610424042, "step": 53 }, { "entropy": 1.103651024401188, "epoch": 2.7044025157232703, "grad_norm": 0.002593994140625, "learning_rate": 5.833333333333334e-06, "logits/chosen": -2.333249501998797, "logits/rejected": -1.8058473440003249, "logps/chosen": -152.4753885269165, "logps/rejected": -301.55738639831543, "loss": 0.00010592853504931554, "mean_token_accuracy": 0.6078185737133026, "num_tokens": 353155.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3159999940544367, "rewards/margins": 10.630276560783386, "rewards/rejected": -10.314276397228241, "step": 54 }, { "entropy": 1.160094790160656, "epoch": 2.7547169811320753, "grad_norm": 0.0015716552734375, "learning_rate": 5e-06, "logits/chosen": -2.3308505618468462, "logits/rejected": -1.8965502590056609, "logps/chosen": -141.07257843017578, "logps/rejected": -296.07042503356934, "loss": 6.342934648273513e-05, "mean_token_accuracy": 0.6171156838536263, "num_tokens": 358538.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7141566202044487, "rewards/margins": 10.787114977836609, "rewards/rejected": -10.072958290576935, "step": 55 }, { "entropy": 1.1058026999235153, "epoch": 2.8050314465408803, "grad_norm": 0.030517578125, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.322696784138965, "logits/rejected": -1.894479697226797, "logps/chosen": -121.85036373138428, "logps/rejected": -254.1634178161621, "loss": 0.00172812445089221, "mean_token_accuracy": 0.6225817948579788, "num_tokens": 364730.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.10639754496514797, "rewards/margins": 8.579113006591797, "rewards/rejected": -8.472715437412262, "step": 56 }, { "entropy": 1.102943792939186, "epoch": 2.8553459119496853, "grad_norm": 0.00872802734375, "learning_rate": 3.3333333333333333e-06, "logits/chosen": -2.173708624672683, "logits/rejected": -1.970646926930764, "logps/chosen": -142.89206504821777, "logps/rejected": -268.407958984375, "loss": 0.00041806488297879696, "mean_token_accuracy": 0.5970962047576904, "num_tokens": 371209.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.27814904414117336, "rewards/margins": 9.40092819929123, "rewards/rejected": -9.122779071331024, "step": 57 }, { "entropy": 1.1477283239364624, "epoch": 2.9056603773584904, "grad_norm": 0.00469970703125, "learning_rate": 2.5e-06, "logits/chosen": -2.293317013131777, "logits/rejected": -1.8812048808544868, "logps/chosen": -137.3569221496582, "logps/rejected": -291.4598960876465, "loss": 0.00018517834541853517, "mean_token_accuracy": 0.6110436543822289, "num_tokens": 378077.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.33568347059190273, "rewards/margins": 10.355388283729553, "rewards/rejected": -10.01970487833023, "step": 58 }, { "entropy": 1.0955762341618538, "epoch": 2.9559748427672954, "grad_norm": 0.00193023681640625, "learning_rate": 1.6666666666666667e-06, "logits/chosen": -2.3105184309249727, "logits/rejected": -2.0821060341980577, "logps/chosen": -141.80660915374756, "logps/rejected": -279.77105140686035, "loss": 8.936067024478689e-05, "mean_token_accuracy": 0.6105019077658653, "num_tokens": 384115.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5417802967131138, "rewards/margins": 10.488943040370941, "rewards/rejected": -9.947162747383118, "step": 59 }, { "entropy": 1.0900011318070548, "epoch": 3.0, "grad_norm": 0.024169921875, "learning_rate": 8.333333333333333e-07, "logits/chosen": -2.332351866206725, "logits/rejected": -1.8031474708962778, "logps/chosen": -138.6800994873047, "logps/rejected": -265.439217703683, "loss": 0.0013465541414916515, "mean_token_accuracy": 0.6026440688541957, "num_tokens": 390000.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.2070258174623762, "rewards/margins": 8.572459425245013, "rewards/rejected": -8.77948522567749, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 60, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3189182437748736.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }