Tenacious-Qwen-DPO-Stable / trainer_state.json
meseretbolled's picture
Upload folder using huggingface_hub
8c7a140 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 60,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"entropy": 1.6346618384122849,
"epoch": 0.050314465408805034,
"grad_norm": 3.734375,
"learning_rate": 5e-05,
"logits/chosen": -0.7411658949470903,
"logits/rejected": -0.20507352810005436,
"logps/chosen": -152.59497356414795,
"logps/rejected": -195.90787506103516,
"loss": 0.6931471824645996,
"mean_token_accuracy": 0.562163695693016,
"num_tokens": 6930.0,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"entropy": 1.4584085196256638,
"epoch": 0.10062893081761007,
"grad_norm": 3.40625,
"learning_rate": 4.9166666666666665e-05,
"logits/chosen": -0.9251237438184291,
"logits/rejected": -0.42817166651451355,
"logps/chosen": -153.27350616455078,
"logps/rejected": -180.06759071350098,
"loss": 0.5006560683250427,
"mean_token_accuracy": 0.583746887743473,
"num_tokens": 12841.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.23006458766758442,
"rewards/margins": 0.4357452392578125,
"rewards/rejected": -0.20568065904080868,
"step": 2
},
{
"entropy": 1.4456398040056229,
"epoch": 0.1509433962264151,
"grad_norm": 2.609375,
"learning_rate": 4.8333333333333334e-05,
"logits/chosen": -1.0732471831941308,
"logits/rejected": -0.5970107323243525,
"logps/chosen": -146.46685123443604,
"logps/rejected": -187.66933250427246,
"loss": 0.3808254599571228,
"mean_token_accuracy": 0.6039908826351166,
"num_tokens": 18816.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.34212866611778736,
"rewards/margins": 0.7833537347614765,
"rewards/rejected": -0.4412250593304634,
"step": 3
},
{
"entropy": 1.5161051750183105,
"epoch": 0.20125786163522014,
"grad_norm": 2.15625,
"learning_rate": 4.75e-05,
"logits/chosen": -0.9412079692678976,
"logits/rejected": -0.4002159728435217,
"logps/chosen": -118.29372596740723,
"logps/rejected": -182.18238067626953,
"loss": 0.34213775396347046,
"mean_token_accuracy": 0.5912635922431946,
"num_tokens": 25522.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3642648714594543,
"rewards/margins": 0.954810731112957,
"rewards/rejected": -0.5905458554625511,
"step": 4
},
{
"entropy": 1.4330662935972214,
"epoch": 0.25157232704402516,
"grad_norm": 1.7421875,
"learning_rate": 4.666666666666667e-05,
"logits/chosen": -1.2388932583587922,
"logits/rejected": -0.6178736694127079,
"logps/chosen": -118.57859134674072,
"logps/rejected": -193.721284866333,
"loss": 0.21044230461120605,
"mean_token_accuracy": 0.5765212289988995,
"num_tokens": 32576.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.45518894493579865,
"rewards/margins": 1.520921140909195,
"rewards/rejected": -1.065732218325138,
"step": 5
},
{
"entropy": 1.4573922604322433,
"epoch": 0.3018867924528302,
"grad_norm": 1.484375,
"learning_rate": 4.5833333333333334e-05,
"logits/chosen": -1.2737800530058234,
"logits/rejected": -0.5034645169102513,
"logps/chosen": -120.18647003173828,
"logps/rejected": -203.72421646118164,
"loss": 0.17699970304965973,
"mean_token_accuracy": 0.5985999256372452,
"num_tokens": 39703.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.41289406828582287,
"rewards/margins": 1.7254902124404907,
"rewards/rejected": -1.3125961422920227,
"step": 6
},
{
"entropy": 1.2998351603746414,
"epoch": 0.3522012578616352,
"grad_norm": 1.546875,
"learning_rate": 4.5e-05,
"logits/chosen": -1.3130657002820167,
"logits/rejected": -0.6094413558435448,
"logps/chosen": -108.3181095123291,
"logps/rejected": -194.44207382202148,
"loss": 0.18618769943714142,
"mean_token_accuracy": 0.6349928826093674,
"num_tokens": 47481.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.26029996760189533,
"rewards/margins": 1.7353856414556503,
"rewards/rejected": -1.4750856757164001,
"step": 7
},
{
"entropy": 1.354924201965332,
"epoch": 0.4025157232704403,
"grad_norm": 1.015625,
"learning_rate": 4.4166666666666665e-05,
"logits/chosen": -1.437566920334179,
"logits/rejected": -1.0028015186250743,
"logps/chosen": -137.25361728668213,
"logps/rejected": -203.34245681762695,
"loss": 0.1016073077917099,
"mean_token_accuracy": 0.6037986874580383,
"num_tokens": 54551.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6320773344486952,
"rewards/margins": 2.5685721784830093,
"rewards/rejected": -1.9364948570728302,
"step": 8
},
{
"entropy": 1.3007782101631165,
"epoch": 0.4528301886792453,
"grad_norm": 0.6796875,
"learning_rate": 4.3333333333333334e-05,
"logits/chosen": -1.4337730887270794,
"logits/rejected": -0.9420388615307833,
"logps/chosen": -133.88286685943604,
"logps/rejected": -215.5273036956787,
"loss": 0.06624128669500351,
"mean_token_accuracy": 0.6150126904249191,
"num_tokens": 61082.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7177156624384224,
"rewards/margins": 3.2935406416654587,
"rewards/rejected": -2.5758249759674072,
"step": 9
},
{
"entropy": 1.3091522455215454,
"epoch": 0.5031446540880503,
"grad_norm": 0.515625,
"learning_rate": 4.25e-05,
"logits/chosen": -1.7237460889408442,
"logits/rejected": -1.1708205992219858,
"logps/chosen": -114.79162216186523,
"logps/rejected": -216.33137321472168,
"loss": 0.03747009485960007,
"mean_token_accuracy": 0.5977272689342499,
"num_tokens": 68063.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7877620831131935,
"rewards/margins": 4.083773195743561,
"rewards/rejected": -3.296011045575142,
"step": 10
},
{
"entropy": 1.2404007613658905,
"epoch": 0.5534591194968553,
"grad_norm": 0.259765625,
"learning_rate": 4.166666666666667e-05,
"logits/chosen": -1.655923360923379,
"logits/rejected": -1.1929389227396723,
"logps/chosen": -137.04386043548584,
"logps/rejected": -223.17731285095215,
"loss": 0.019634969532489777,
"mean_token_accuracy": 0.615639328956604,
"num_tokens": 73558.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1441535080084577,
"rewards/margins": 4.826931297779083,
"rewards/rejected": -3.6827778220176697,
"step": 11
},
{
"entropy": 1.2675090283155441,
"epoch": 0.6037735849056604,
"grad_norm": 0.16796875,
"learning_rate": 4.0833333333333334e-05,
"logits/chosen": -1.738670325722853,
"logits/rejected": -1.2716987398570991,
"logps/chosen": -133.57194137573242,
"logps/rejected": -223.28443908691406,
"loss": 0.013121644034981728,
"mean_token_accuracy": 0.6074652224779129,
"num_tokens": 79421.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0685334280133247,
"rewards/margins": 5.048604816198349,
"rewards/rejected": -3.980071395635605,
"step": 12
},
{
"entropy": 1.221432313323021,
"epoch": 0.6540880503144654,
"grad_norm": 0.466796875,
"learning_rate": 4e-05,
"logits/chosen": -1.8638311019017777,
"logits/rejected": -1.3540506586835501,
"logps/chosen": -130.8041524887085,
"logps/rejected": -240.23817443847656,
"loss": 0.029251903295516968,
"mean_token_accuracy": 0.6208610609173775,
"num_tokens": 86682.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.4837151520187035,
"rewards/margins": 5.231908708810806,
"rewards/rejected": -4.748193636536598,
"step": 13
},
{
"entropy": 1.179955169558525,
"epoch": 0.7044025157232704,
"grad_norm": 0.263671875,
"learning_rate": 3.9166666666666665e-05,
"logits/chosen": -1.714330251919895,
"logits/rejected": -1.4322710140173394,
"logps/chosen": -134.82413864135742,
"logps/rejected": -221.86817169189453,
"loss": 0.019698552787303925,
"mean_token_accuracy": 0.6337382346391678,
"num_tokens": 93035.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.23185482621192932,
"rewards/margins": 5.084479838609695,
"rewards/rejected": -4.852624952793121,
"step": 14
},
{
"entropy": 1.1727432310581207,
"epoch": 0.7547169811320755,
"grad_norm": 0.046142578125,
"learning_rate": 3.8333333333333334e-05,
"logits/chosen": -1.907136892939249,
"logits/rejected": -1.514815267146274,
"logps/chosen": -132.96124839782715,
"logps/rejected": -249.50230979919434,
"loss": 0.002145277801901102,
"mean_token_accuracy": 0.6479089632630348,
"num_tokens": 98572.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1812188103795052,
"rewards/margins": 7.1862099170684814,
"rewards/rejected": -6.004990994930267,
"step": 15
},
{
"entropy": 1.2092190980911255,
"epoch": 0.8050314465408805,
"grad_norm": 0.203125,
"learning_rate": 3.7500000000000003e-05,
"logits/chosen": -1.8553556408587935,
"logits/rejected": -1.4260795074385058,
"logps/chosen": -139.758376121521,
"logps/rejected": -248.12162590026855,
"loss": 0.015183546580374241,
"mean_token_accuracy": 0.6128971949219704,
"num_tokens": 106014.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.30653896857984364,
"rewards/margins": 6.243123233318329,
"rewards/rejected": -5.9365842044353485,
"step": 16
},
{
"entropy": 1.2008470296859741,
"epoch": 0.8553459119496856,
"grad_norm": 0.134765625,
"learning_rate": 3.6666666666666666e-05,
"logits/chosen": -1.9781900924423292,
"logits/rejected": -1.3513694447951352,
"logps/chosen": -127.7047929763794,
"logps/rejected": -251.46310424804688,
"loss": 0.005312850698828697,
"mean_token_accuracy": 0.6195648014545441,
"num_tokens": 112297.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.948453530203551,
"rewards/margins": 7.2391074895858765,
"rewards/rejected": -6.290653884410858,
"step": 17
},
{
"entropy": 1.1938088834285736,
"epoch": 0.9056603773584906,
"grad_norm": 0.0283203125,
"learning_rate": 3.5833333333333335e-05,
"logits/chosen": -2.0970170230360243,
"logits/rejected": -1.6893045219197895,
"logps/chosen": -128.71086406707764,
"logps/rejected": -248.6879997253418,
"loss": 0.0016415835125371814,
"mean_token_accuracy": 0.6066833287477493,
"num_tokens": 118259.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6370860114693642,
"rewards/margins": 7.347302317619324,
"rewards/rejected": -6.710216283798218,
"step": 18
},
{
"entropy": 1.1347751468420029,
"epoch": 0.9559748427672956,
"grad_norm": 0.021240234375,
"learning_rate": 3.5e-05,
"logits/chosen": -2.174784405399738,
"logits/rejected": -1.8171304190186441,
"logps/chosen": -137.3084011077881,
"logps/rejected": -237.20912742614746,
"loss": 0.0013000022154301405,
"mean_token_accuracy": 0.6105500273406506,
"num_tokens": 123618.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.1855077669024467,
"rewards/margins": 7.6381800174713135,
"rewards/rejected": -6.452672183513641,
"step": 19
},
{
"entropy": 1.2229801756995065,
"epoch": 1.0,
"grad_norm": 0.034912109375,
"learning_rate": 3.4166666666666666e-05,
"logits/chosen": -1.9402861332224905,
"logits/rejected": -1.478508916635409,
"logps/chosen": -140.62159075055803,
"logps/rejected": -258.58056640625,
"loss": 0.0018078959546983242,
"mean_token_accuracy": 0.6002635615212577,
"num_tokens": 130000.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.18390404965196336,
"rewards/margins": 7.131651401519775,
"rewards/rejected": -7.31555564062936,
"step": 20
},
{
"entropy": 1.1338201016187668,
"epoch": 1.050314465408805,
"grad_norm": 0.00341796875,
"learning_rate": 3.3333333333333335e-05,
"logits/chosen": -2.2381289981693855,
"logits/rejected": -1.7685196120349025,
"logps/chosen": -132.71670532226562,
"logps/rejected": -270.32165908813477,
"loss": 0.00017723625933285803,
"mean_token_accuracy": 0.640245221555233,
"num_tokens": 135799.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.0271762115880847,
"rewards/margins": 9.069988369941711,
"rewards/rejected": -8.04281210899353,
"step": 21
},
{
"entropy": 1.184071958065033,
"epoch": 1.10062893081761,
"grad_norm": 0.06201171875,
"learning_rate": 3.2500000000000004e-05,
"logits/chosen": -2.2030959691219643,
"logits/rejected": -1.549622772314418,
"logps/chosen": -137.2278184890747,
"logps/rejected": -278.6941947937012,
"loss": 0.002708145882934332,
"mean_token_accuracy": 0.5984649807214737,
"num_tokens": 142686.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10265790205448866,
"rewards/margins": 8.331138014793396,
"rewards/rejected": -8.228479981422424,
"step": 22
},
{
"entropy": 1.1163400262594223,
"epoch": 1.150943396226415,
"grad_norm": 0.010498046875,
"learning_rate": 3.1666666666666666e-05,
"logits/chosen": -2.2692944342560915,
"logits/rejected": -1.9815728330371412,
"logps/chosen": -121.65661811828613,
"logps/rejected": -251.65609550476074,
"loss": 0.0006153068970888853,
"mean_token_accuracy": 0.5803752839565277,
"num_tokens": 148888.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.6473507843911648,
"rewards/margins": 8.257275819778442,
"rewards/rejected": -7.609925091266632,
"step": 23
},
{
"entropy": 1.2022811472415924,
"epoch": 1.20125786163522,
"grad_norm": 0.0196533203125,
"learning_rate": 3.0833333333333335e-05,
"logits/chosen": -2.0684997205168574,
"logits/rejected": -1.5187979793505884,
"logps/chosen": -142.0221881866455,
"logps/rejected": -288.0162467956543,
"loss": 0.0010658408282324672,
"mean_token_accuracy": 0.6041584983468056,
"num_tokens": 155066.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3129555657505989,
"rewards/margins": 9.287788093090057,
"rewards/rejected": -8.974832653999329,
"step": 24
},
{
"entropy": 1.0715540498495102,
"epoch": 1.251572327044025,
"grad_norm": 0.0152587890625,
"learning_rate": 3e-05,
"logits/chosen": -2.253688589577229,
"logits/rejected": -1.709912522981535,
"logps/chosen": -124.24794864654541,
"logps/rejected": -268.31462478637695,
"loss": 0.0007783680921420455,
"mean_token_accuracy": 0.6487728357315063,
"num_tokens": 161896.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5254653260344639,
"rewards/margins": 8.636412143707275,
"rewards/rejected": -8.110946834087372,
"step": 25
},
{
"entropy": 1.1292345225811005,
"epoch": 1.3018867924528301,
"grad_norm": 0.01611328125,
"learning_rate": 2.916666666666667e-05,
"logits/chosen": -2.03905456801847,
"logits/rejected": -1.653255032290945,
"logps/chosen": -121.39779376983643,
"logps/rejected": -255.01618576049805,
"loss": 0.0008846853161230683,
"mean_token_accuracy": 0.622931219637394,
"num_tokens": 168755.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.2767142332158983,
"rewards/margins": 8.121429800987244,
"rewards/rejected": -7.844715654850006,
"step": 26
},
{
"entropy": 1.107276238501072,
"epoch": 1.3522012578616351,
"grad_norm": 0.0076904296875,
"learning_rate": 2.8333333333333335e-05,
"logits/chosen": -2.286326659906524,
"logits/rejected": -1.9377510490705079,
"logps/chosen": -144.30573749542236,
"logps/rejected": -272.37299728393555,
"loss": 0.00033011281630024314,
"mean_token_accuracy": 0.619342528283596,
"num_tokens": 175381.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.9377570524811745,
"rewards/margins": 9.561910688877106,
"rewards/rejected": -8.6241534948349,
"step": 27
},
{
"entropy": 1.157460778951645,
"epoch": 1.4025157232704402,
"grad_norm": 0.0047607421875,
"learning_rate": 2.7500000000000004e-05,
"logits/chosen": -2.1326494507814266,
"logits/rejected": -1.8268033175884986,
"logps/chosen": -125.0985517501831,
"logps/rejected": -274.5887870788574,
"loss": 0.0002390609442954883,
"mean_token_accuracy": 0.620079979300499,
"num_tokens": 181771.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8465396109968424,
"rewards/margins": 9.534583747386932,
"rewards/rejected": -8.688044130802155,
"step": 28
},
{
"entropy": 1.1201048269867897,
"epoch": 1.4528301886792452,
"grad_norm": 0.0093994140625,
"learning_rate": 2.6666666666666667e-05,
"logits/chosen": -2.311015681209517,
"logits/rejected": -2.0115759556684885,
"logps/chosen": -140.10973072052002,
"logps/rejected": -260.4351615905762,
"loss": 0.0005316605675034225,
"mean_token_accuracy": 0.6181787773966789,
"num_tokens": 188644.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.1467662937939167,
"rewards/margins": 8.486627459526062,
"rewards/rejected": -8.633393704891205,
"step": 29
},
{
"entropy": 1.1074048355221748,
"epoch": 1.5031446540880502,
"grad_norm": 0.024658203125,
"learning_rate": 2.5833333333333336e-05,
"logits/chosen": -2.2242952052635343,
"logits/rejected": -1.9084164743872978,
"logps/chosen": -120.53641414642334,
"logps/rejected": -248.61384201049805,
"loss": 0.0015231292927637696,
"mean_token_accuracy": 0.6136712729930878,
"num_tokens": 196391.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0413979523582384,
"rewards/margins": 8.107804000377655,
"rewards/rejected": -8.149202108383179,
"step": 30
},
{
"entropy": 1.0835556164383888,
"epoch": 1.5534591194968552,
"grad_norm": 0.0308837890625,
"learning_rate": 2.5e-05,
"logits/chosen": -2.2358070307390454,
"logits/rejected": -1.8015257262782527,
"logps/chosen": -160.49627590179443,
"logps/rejected": -286.36713790893555,
"loss": 0.0012202183715999126,
"mean_token_accuracy": 0.6140667796134949,
"num_tokens": 203518.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.12220276962034404,
"rewards/margins": 8.978903114795685,
"rewards/rejected": -9.10110592842102,
"step": 31
},
{
"entropy": 1.1327729299664497,
"epoch": 1.6037735849056602,
"grad_norm": 0.0078125,
"learning_rate": 2.4166666666666667e-05,
"logits/chosen": -2.222914372779498,
"logits/rejected": -1.8115026450741711,
"logps/chosen": -143.74180603027344,
"logps/rejected": -296.8906936645508,
"loss": 0.0003141126944683492,
"mean_token_accuracy": 0.6098875515162945,
"num_tokens": 209850.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1798817589879036,
"rewards/margins": 10.448372840881348,
"rewards/rejected": -10.268491089344025,
"step": 32
},
{
"entropy": 1.125333271920681,
"epoch": 1.6540880503144653,
"grad_norm": 0.017578125,
"learning_rate": 2.3333333333333336e-05,
"logits/chosen": -2.4135272067915143,
"logits/rejected": -1.7676647261719525,
"logps/chosen": -125.86479663848877,
"logps/rejected": -266.06410026550293,
"loss": 0.0005237428122200072,
"mean_token_accuracy": 0.6229566335678101,
"num_tokens": 215637.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7084813113324344,
"rewards/margins": 9.689094841480255,
"rewards/rejected": -8.980613589286804,
"step": 33
},
{
"entropy": 1.058401882648468,
"epoch": 1.7044025157232703,
"grad_norm": 0.007354736328125,
"learning_rate": 2.25e-05,
"logits/chosen": -2.46378613488542,
"logits/rejected": -2.1071431291236653,
"logps/chosen": -125.6760368347168,
"logps/rejected": -258.0559844970703,
"loss": 0.00038854233571328223,
"mean_token_accuracy": 0.6303330659866333,
"num_tokens": 221051.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.3429398629814386,
"rewards/margins": 10.128098785877228,
"rewards/rejected": -8.785158812999725,
"step": 34
},
{
"entropy": 1.1321996748447418,
"epoch": 1.7547169811320755,
"grad_norm": 0.0030670166015625,
"learning_rate": 2.1666666666666667e-05,
"logits/chosen": -2.3747636222967063,
"logits/rejected": -2.115993388323722,
"logps/chosen": -131.67159175872803,
"logps/rejected": -276.36437797546387,
"loss": 0.00012030061770929024,
"mean_token_accuracy": 0.6251056790351868,
"num_tokens": 226426.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8489564098417759,
"rewards/margins": 10.621768474578857,
"rewards/rejected": -9.772812008857727,
"step": 35
},
{
"entropy": 1.116398274898529,
"epoch": 1.8050314465408805,
"grad_norm": 0.0181884765625,
"learning_rate": 2.0833333333333336e-05,
"logits/chosen": -2.4568723077785557,
"logits/rejected": -1.9738618373268513,
"logps/chosen": -117.32646560668945,
"logps/rejected": -272.7527599334717,
"loss": 0.0008142158621922135,
"mean_token_accuracy": 0.6250453069806099,
"num_tokens": 233419.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3417433723807335,
"rewards/margins": 9.766010344028473,
"rewards/rejected": -9.424266993999481,
"step": 36
},
{
"entropy": 1.0965562090277672,
"epoch": 1.8553459119496856,
"grad_norm": 0.0027313232421875,
"learning_rate": 2e-05,
"logits/chosen": -2.3551532120785383,
"logits/rejected": -1.9812547363037738,
"logps/chosen": -144.9142713546753,
"logps/rejected": -296.3636951446533,
"loss": 0.00013589797890745103,
"mean_token_accuracy": 0.5917238146066666,
"num_tokens": 240673.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.10223913192749023,
"rewards/margins": 10.127011775970459,
"rewards/rejected": -10.22925090789795,
"step": 37
},
{
"entropy": 1.088953472673893,
"epoch": 1.9056603773584906,
"grad_norm": 0.0032806396484375,
"learning_rate": 1.9166666666666667e-05,
"logits/chosen": -2.326897647347544,
"logits/rejected": -2.0923411183004155,
"logps/chosen": -141.62347412109375,
"logps/rejected": -285.8790645599365,
"loss": 0.0001322527095908299,
"mean_token_accuracy": 0.6075941771268845,
"num_tokens": 245974.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7096749469637871,
"rewards/margins": 10.769983649253845,
"rewards/rejected": -10.060308814048767,
"step": 38
},
{
"entropy": 1.1708777844905853,
"epoch": 1.9559748427672956,
"grad_norm": 0.003997802734375,
"learning_rate": 1.8333333333333333e-05,
"logits/chosen": -2.1918049696005726,
"logits/rejected": -1.7380488443097255,
"logps/chosen": -127.21480464935303,
"logps/rejected": -273.00823402404785,
"loss": 0.00020453613251447678,
"mean_token_accuracy": 0.6081509068608284,
"num_tokens": 253184.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.09920912818051875,
"rewards/margins": 9.214752078056335,
"rewards/rejected": -9.313961207866669,
"step": 39
},
{
"entropy": 1.1617241672107153,
"epoch": 2.0,
"grad_norm": 0.0191650390625,
"learning_rate": 1.75e-05,
"logits/chosen": -2.2749118295696342,
"logits/rejected": -1.7584887991939784,
"logps/chosen": -154.0042724609375,
"logps/rejected": -292.14568001883373,
"loss": 0.0009170390549115837,
"mean_token_accuracy": 0.5644707764898028,
"num_tokens": 260000.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.44687261964593616,
"rewards/margins": 9.13522618157523,
"rewards/rejected": -9.582098756517683,
"step": 40
},
{
"entropy": 1.1219489574432373,
"epoch": 2.050314465408805,
"grad_norm": 0.00396728515625,
"learning_rate": 1.6666666666666667e-05,
"logits/chosen": -2.3933433594531426,
"logits/rejected": -1.8782996338947386,
"logps/chosen": -121.1051197052002,
"logps/rejected": -273.1605224609375,
"loss": 0.00018580301548354328,
"mean_token_accuracy": 0.6062684953212738,
"num_tokens": 266060.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5098631188739091,
"rewards/margins": 9.81256103515625,
"rewards/rejected": -9.302697837352753,
"step": 41
},
{
"entropy": 1.142582356929779,
"epoch": 2.10062893081761,
"grad_norm": 0.00506591796875,
"learning_rate": 1.5833333333333333e-05,
"logits/chosen": -2.3932424547453,
"logits/rejected": -2.034844354918984,
"logps/chosen": -132.19951725006104,
"logps/rejected": -273.3875484466553,
"loss": 0.00027762039098888636,
"mean_token_accuracy": 0.6254889070987701,
"num_tokens": 272769.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.027547355741262436,
"rewards/margins": 9.588396728038788,
"rewards/rejected": -9.560849368572235,
"step": 42
},
{
"entropy": 1.0831276252865791,
"epoch": 2.150943396226415,
"grad_norm": 0.0201416015625,
"learning_rate": 1.5e-05,
"logits/chosen": -2.3213323581687466,
"logits/rejected": -1.9188541617869985,
"logps/chosen": -140.96678638458252,
"logps/rejected": -298.01341819763184,
"loss": 0.0006403317674994469,
"mean_token_accuracy": 0.6212100088596344,
"num_tokens": 279603.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.13022289611399174,
"rewards/margins": 10.482895195484161,
"rewards/rejected": -10.61311811208725,
"step": 43
},
{
"entropy": 1.1165131032466888,
"epoch": 2.20125786163522,
"grad_norm": 0.004119873046875,
"learning_rate": 1.4166666666666668e-05,
"logits/chosen": -2.325539811977018,
"logits/rejected": -1.819132333499079,
"logps/chosen": -137.1863489151001,
"logps/rejected": -280.5503120422363,
"loss": 0.00018718022329267114,
"mean_token_accuracy": 0.6205575913190842,
"num_tokens": 286436.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.0954840648919344,
"rewards/margins": 9.663193047046661,
"rewards/rejected": -9.567708969116211,
"step": 44
},
{
"entropy": 1.0798147842288017,
"epoch": 2.251572327044025,
"grad_norm": 0.006195068359375,
"learning_rate": 1.3333333333333333e-05,
"logits/chosen": -2.462720523269614,
"logits/rejected": -1.9150426029399608,
"logps/chosen": -137.90959072113037,
"logps/rejected": -299.62617683410645,
"loss": 0.00032327763619832695,
"mean_token_accuracy": 0.6055086851119995,
"num_tokens": 294223.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0803464986383915,
"rewards/margins": 10.386663496494293,
"rewards/rejected": -10.467009961605072,
"step": 45
},
{
"entropy": 1.0406965985894203,
"epoch": 2.30188679245283,
"grad_norm": 0.00360107421875,
"learning_rate": 1.25e-05,
"logits/chosen": -2.351859813790029,
"logits/rejected": -2.079019230872164,
"logps/chosen": -114.20672798156738,
"logps/rejected": -270.77587890625,
"loss": 0.0001596831134520471,
"mean_token_accuracy": 0.6743896827101707,
"num_tokens": 299634.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 1.2796257368754596,
"rewards/margins": 10.65468156337738,
"rewards/rejected": -9.375055730342865,
"step": 46
},
{
"entropy": 1.1319249421358109,
"epoch": 2.352201257861635,
"grad_norm": 0.00323486328125,
"learning_rate": 1.1666666666666668e-05,
"logits/chosen": -2.3348495432588066,
"logits/rejected": -1.8859447778318736,
"logps/chosen": -162.34913635253906,
"logps/rejected": -305.2253665924072,
"loss": 0.00012244780373293906,
"mean_token_accuracy": 0.5992361158132553,
"num_tokens": 305920.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.0735319098457694,
"rewards/margins": 10.709958910942078,
"rewards/rejected": -10.783490896224976,
"step": 47
},
{
"entropy": 1.1012349873781204,
"epoch": 2.40251572327044,
"grad_norm": 0.00555419921875,
"learning_rate": 1.0833333333333334e-05,
"logits/chosen": -2.376434497997167,
"logits/rejected": -1.9649504169742804,
"logps/chosen": -134.29817581176758,
"logps/rejected": -269.4762592315674,
"loss": 0.00029120981344021857,
"mean_token_accuracy": 0.615195669233799,
"num_tokens": 313346.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.04450349509716034,
"rewards/margins": 8.847583532333374,
"rewards/rejected": -8.803080201148987,
"step": 48
},
{
"entropy": 1.0822330936789513,
"epoch": 2.452830188679245,
"grad_norm": 0.004486083984375,
"learning_rate": 1e-05,
"logits/chosen": -2.4181823111443506,
"logits/rejected": -1.9432117626083543,
"logps/chosen": -139.22668647766113,
"logps/rejected": -286.98141860961914,
"loss": 0.00024342790129594505,
"mean_token_accuracy": 0.5807452276349068,
"num_tokens": 320662.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.20274513261392713,
"rewards/margins": 9.553876638412476,
"rewards/rejected": -9.756621778011322,
"step": 49
},
{
"entropy": 1.1255912110209465,
"epoch": 2.50314465408805,
"grad_norm": 0.01043701171875,
"learning_rate": 9.166666666666666e-06,
"logits/chosen": -2.390428515839434,
"logits/rejected": -2.070867890248319,
"logps/chosen": -152.57851219177246,
"logps/rejected": -278.1038990020752,
"loss": 0.0005284082726575434,
"mean_token_accuracy": 0.5914618484675884,
"num_tokens": 326611.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24946272000670433,
"rewards/margins": 9.3264040350914,
"rewards/rejected": -9.076941430568695,
"step": 50
},
{
"entropy": 1.10995664447546,
"epoch": 2.5534591194968552,
"grad_norm": 0.005950927734375,
"learning_rate": 8.333333333333334e-06,
"logits/chosen": -2.3087954809493545,
"logits/rejected": -1.9792274410316972,
"logps/chosen": -116.59206485748291,
"logps/rejected": -267.238338470459,
"loss": 0.00025631688185967505,
"mean_token_accuracy": 0.625831313431263,
"num_tokens": 332814.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.8696157680824399,
"rewards/margins": 10.124218106269836,
"rewards/rejected": -9.254602372646332,
"step": 51
},
{
"entropy": 1.117017239332199,
"epoch": 2.6037735849056602,
"grad_norm": 0.00152587890625,
"learning_rate": 7.5e-06,
"logits/chosen": -2.4137662915261817,
"logits/rejected": -1.9523699949115683,
"logps/chosen": -120.80679893493652,
"logps/rejected": -280.64561653137207,
"loss": 7.924844976514578e-05,
"mean_token_accuracy": 0.6106409505009651,
"num_tokens": 339547.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.790530975908041,
"rewards/margins": 10.681683659553528,
"rewards/rejected": -9.891152799129486,
"step": 52
},
{
"entropy": 1.1054791137576103,
"epoch": 2.6540880503144653,
"grad_norm": 0.0164794921875,
"learning_rate": 6.666666666666667e-06,
"logits/chosen": -2.273507778205582,
"logits/rejected": -1.9109036313937824,
"logps/chosen": -115.94961833953857,
"logps/rejected": -269.0294952392578,
"loss": 0.0007658082176931202,
"mean_token_accuracy": 0.5990233793854713,
"num_tokens": 346647.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.25752640701830387,
"rewards/margins": 9.45040088891983,
"rewards/rejected": -9.192874610424042,
"step": 53
},
{
"entropy": 1.103651024401188,
"epoch": 2.7044025157232703,
"grad_norm": 0.002593994140625,
"learning_rate": 5.833333333333334e-06,
"logits/chosen": -2.333249501998797,
"logits/rejected": -1.8058473440003249,
"logps/chosen": -152.4753885269165,
"logps/rejected": -301.55738639831543,
"loss": 0.00010592853504931554,
"mean_token_accuracy": 0.6078185737133026,
"num_tokens": 353155.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.3159999940544367,
"rewards/margins": 10.630276560783386,
"rewards/rejected": -10.314276397228241,
"step": 54
},
{
"entropy": 1.160094790160656,
"epoch": 2.7547169811320753,
"grad_norm": 0.0015716552734375,
"learning_rate": 5e-06,
"logits/chosen": -2.3308505618468462,
"logits/rejected": -1.8965502590056609,
"logps/chosen": -141.07257843017578,
"logps/rejected": -296.07042503356934,
"loss": 6.342934648273513e-05,
"mean_token_accuracy": 0.6171156838536263,
"num_tokens": 358538.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.7141566202044487,
"rewards/margins": 10.787114977836609,
"rewards/rejected": -10.072958290576935,
"step": 55
},
{
"entropy": 1.1058026999235153,
"epoch": 2.8050314465408803,
"grad_norm": 0.030517578125,
"learning_rate": 4.166666666666667e-06,
"logits/chosen": -2.322696784138965,
"logits/rejected": -1.894479697226797,
"logps/chosen": -121.85036373138428,
"logps/rejected": -254.1634178161621,
"loss": 0.00172812445089221,
"mean_token_accuracy": 0.6225817948579788,
"num_tokens": 364730.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.10639754496514797,
"rewards/margins": 8.579113006591797,
"rewards/rejected": -8.472715437412262,
"step": 56
},
{
"entropy": 1.102943792939186,
"epoch": 2.8553459119496853,
"grad_norm": 0.00872802734375,
"learning_rate": 3.3333333333333333e-06,
"logits/chosen": -2.173708624672683,
"logits/rejected": -1.970646926930764,
"logps/chosen": -142.89206504821777,
"logps/rejected": -268.407958984375,
"loss": 0.00041806488297879696,
"mean_token_accuracy": 0.5970962047576904,
"num_tokens": 371209.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.27814904414117336,
"rewards/margins": 9.40092819929123,
"rewards/rejected": -9.122779071331024,
"step": 57
},
{
"entropy": 1.1477283239364624,
"epoch": 2.9056603773584904,
"grad_norm": 0.00469970703125,
"learning_rate": 2.5e-06,
"logits/chosen": -2.293317013131777,
"logits/rejected": -1.8812048808544868,
"logps/chosen": -137.3569221496582,
"logps/rejected": -291.4598960876465,
"loss": 0.00018517834541853517,
"mean_token_accuracy": 0.6110436543822289,
"num_tokens": 378077.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.33568347059190273,
"rewards/margins": 10.355388283729553,
"rewards/rejected": -10.01970487833023,
"step": 58
},
{
"entropy": 1.0955762341618538,
"epoch": 2.9559748427672954,
"grad_norm": 0.00193023681640625,
"learning_rate": 1.6666666666666667e-06,
"logits/chosen": -2.3105184309249727,
"logits/rejected": -2.0821060341980577,
"logps/chosen": -141.80660915374756,
"logps/rejected": -279.77105140686035,
"loss": 8.936067024478689e-05,
"mean_token_accuracy": 0.6105019077658653,
"num_tokens": 384115.0,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.5417802967131138,
"rewards/margins": 10.488943040370941,
"rewards/rejected": -9.947162747383118,
"step": 59
},
{
"entropy": 1.0900011318070548,
"epoch": 3.0,
"grad_norm": 0.024169921875,
"learning_rate": 8.333333333333333e-07,
"logits/chosen": -2.332351866206725,
"logits/rejected": -1.8031474708962778,
"logps/chosen": -138.6800994873047,
"logps/rejected": -265.439217703683,
"loss": 0.0013465541414916515,
"mean_token_accuracy": 0.6026440688541957,
"num_tokens": 390000.0,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.2070258174623762,
"rewards/margins": 8.572459425245013,
"rewards/rejected": -8.77948522567749,
"step": 60
}
],
"logging_steps": 1,
"max_steps": 60,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 60,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3189182437748736.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}