testing_new / trainer_state.json
sAviOr287's picture
Add files using upload-large-folder tool
a154f8c verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.3885003885003885,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 748.7643005371094,
"epoch": 0.009712509712509712,
"grad_norm": 0.3736153689307914,
"learning_rate": 1e-06,
"loss": -0.0068,
"reward": 3.822531852722168,
"reward_std": 0.476869178712368,
"rewards/agent_reward_func_MC": 0.8567936837673187,
"rewards/correctness_reward_func": 1.8019047832489015,
"rewards/correctness_reward_func_eval": 0.8142857432365418,
"rewards/format_reward_func": 0.3495476073026657,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 836.8619177246094,
"epoch": 0.019425019425019424,
"grad_norm": 0.3188121982595845,
"learning_rate": 1e-06,
"loss": -0.0107,
"reward": 3.90938099861145,
"reward_std": 0.38552937254309655,
"rewards/agent_reward_func_MC": 0.8773333775997162,
"rewards/correctness_reward_func": 1.8704762172698974,
"rewards/correctness_reward_func_eval": 0.812380975484848,
"rewards/format_reward_func": 0.3491904670000076,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 973.8400158691406,
"epoch": 0.029137529137529136,
"grad_norm": 0.27198200236481873,
"learning_rate": 1e-06,
"loss": -0.0031,
"reward": 3.8425635862350465,
"reward_std": 0.4346236677467823,
"rewards/agent_reward_func_MC": 0.864666701555252,
"rewards/correctness_reward_func": 1.8247619271278381,
"rewards/correctness_reward_func_eval": 0.8034920811653137,
"rewards/format_reward_func": 0.34964284658432004,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 1018.5909753417968,
"epoch": 0.03885003885003885,
"grad_norm": 0.28510359189330664,
"learning_rate": 1e-06,
"loss": 0.011,
"reward": 3.9731746339797973,
"reward_std": 0.2720495498180389,
"rewards/agent_reward_func_MC": 0.8882857489585877,
"rewards/correctness_reward_func": 1.8980952572822571,
"rewards/correctness_reward_func_eval": 0.8369841420650482,
"rewards/format_reward_func": 0.34980951368808744,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 1140.068126220703,
"epoch": 0.04856254856254856,
"grad_norm": 0.23329425011386498,
"learning_rate": 1e-06,
"loss": 0.0032,
"reward": 3.988293719291687,
"reward_std": 0.30129268489778044,
"rewards/agent_reward_func_MC": 0.8880000543594361,
"rewards/correctness_reward_func": 1.896190493106842,
"rewards/correctness_reward_func_eval": 0.8541270065307617,
"rewards/format_reward_func": 0.3499761837720871,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 1093.8681176757811,
"epoch": 0.05827505827505827,
"grad_norm": 0.29946322491146965,
"learning_rate": 1e-06,
"loss": -0.0109,
"reward": 3.977658863067627,
"reward_std": 0.32108820773661134,
"rewards/agent_reward_func_MC": 0.869904808998108,
"rewards/correctness_reward_func": 1.8838095450401307,
"rewards/correctness_reward_func_eval": 0.8739682829380035,
"rewards/format_reward_func": 0.34997618436813355,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 1075.1785949707032,
"epoch": 0.06798756798756798,
"grad_norm": 0.3040633702971847,
"learning_rate": 1e-06,
"loss": 0.0037,
"reward": 4.008127021789551,
"reward_std": 0.3045533967390657,
"rewards/agent_reward_func_MC": 0.8873333740234375,
"rewards/correctness_reward_func": 1.892380964756012,
"rewards/correctness_reward_func_eval": 0.8784127116203309,
"rewards/format_reward_func": 0.3499999940395355,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 1051.7819262695311,
"epoch": 0.0777000777000777,
"grad_norm": 0.2654552473631526,
"learning_rate": 1e-06,
"loss": 0.0047,
"reward": 3.9801508474349974,
"reward_std": 0.3097702523320913,
"rewards/agent_reward_func_MC": 0.8759365463256836,
"rewards/correctness_reward_func": 1.8752381134033203,
"rewards/correctness_reward_func_eval": 0.8790476399660111,
"rewards/format_reward_func": 0.34992856383323667,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 920.3571643066406,
"epoch": 0.08741258741258741,
"grad_norm": 0.30787656246569944,
"learning_rate": 1e-06,
"loss": 0.0022,
"reward": 4.030627017021179,
"reward_std": 0.257388199865818,
"rewards/agent_reward_func_MC": 0.9038730561733246,
"rewards/correctness_reward_func": 1.8942857313156127,
"rewards/correctness_reward_func_eval": 0.8825397026538849,
"rewards/format_reward_func": 0.34992856383323667,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 950.1019201660156,
"epoch": 0.09712509712509712,
"grad_norm": 0.278978790381264,
"learning_rate": 1e-06,
"loss": 0.0049,
"reward": 4.0613808870315555,
"reward_std": 0.2030999060533941,
"rewards/agent_reward_func_MC": 0.9173016262054443,
"rewards/correctness_reward_func": 1.8819047713279724,
"rewards/correctness_reward_func_eval": 0.9122222352027893,
"rewards/format_reward_func": 0.34995237350463865,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 988.4190625,
"epoch": 0.10683760683760683,
"grad_norm": 0.28501385265891443,
"learning_rate": 1e-06,
"loss": 0.0041,
"reward": 4.06360315322876,
"reward_std": 0.28966286245733497,
"rewards/agent_reward_func_MC": 0.9020952671766281,
"rewards/correctness_reward_func": 1.896190493106842,
"rewards/correctness_reward_func_eval": 0.9155555760860443,
"rewards/format_reward_func": 0.34976189851760864,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 988.201923828125,
"epoch": 0.11655011655011654,
"grad_norm": 0.35841673382474903,
"learning_rate": 1e-06,
"loss": 0.0082,
"reward": 4.126190505027771,
"reward_std": 0.24293919634073974,
"rewards/agent_reward_func_MC": 0.918984169960022,
"rewards/correctness_reward_func": 1.9352381134033203,
"rewards/correctness_reward_func_eval": 0.9220635092258453,
"rewards/format_reward_func": 0.3499047553539276,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 964.7343017578125,
"epoch": 0.12626262626262627,
"grad_norm": 0.3255852780092938,
"learning_rate": 1e-06,
"loss": 0.003,
"reward": 4.03707145690918,
"reward_std": 0.2535955292731524,
"rewards/agent_reward_func_MC": 0.9001587682962418,
"rewards/correctness_reward_func": 1.8800000143051148,
"rewards/correctness_reward_func_eval": 0.9069841504096985,
"rewards/format_reward_func": 0.34992856442928316,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 1023.8019348144531,
"epoch": 0.13597513597513597,
"grad_norm": 0.258258613758542,
"learning_rate": 1e-06,
"loss": 0.0021,
"reward": 4.010547647476196,
"reward_std": 0.30033121041953564,
"rewards/agent_reward_func_MC": 0.8870794075727463,
"rewards/correctness_reward_func": 1.867619068622589,
"rewards/correctness_reward_func_eval": 0.9058730411529541,
"rewards/format_reward_func": 0.34997618436813355,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 1049.6647875976562,
"epoch": 0.1456876456876457,
"grad_norm": 0.2868301098892232,
"learning_rate": 1e-06,
"loss": 0.0083,
"reward": 4.163015942573548,
"reward_std": 0.19761438958346844,
"rewards/agent_reward_func_MC": 0.936666705608368,
"rewards/correctness_reward_func": 1.9514285826683044,
"rewards/correctness_reward_func_eval": 0.9249206626415253,
"rewards/format_reward_func": 0.3499999940395355,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 1131.1047875976562,
"epoch": 0.1554001554001554,
"grad_norm": 0.22388105570487274,
"learning_rate": 1e-06,
"loss": 0.007,
"reward": 3.9448730325698853,
"reward_std": 0.26593201816082,
"rewards/agent_reward_func_MC": 0.8765079736709595,
"rewards/correctness_reward_func": 1.839047634601593,
"rewards/correctness_reward_func_eval": 0.879365097284317,
"rewards/format_reward_func": 0.34995237410068514,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 1176.7590686035155,
"epoch": 0.16511266511266512,
"grad_norm": 0.2821630374549152,
"learning_rate": 1e-06,
"loss": 0.0177,
"reward": 3.8298174810409544,
"reward_std": 0.25791051633656026,
"rewards/agent_reward_func_MC": 0.8357143165171146,
"rewards/correctness_reward_func": 1.7523809648305178,
"rewards/correctness_reward_func_eval": 0.8917460405826568,
"rewards/format_reward_func": 0.3499761837720871,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 1018.4166821289062,
"epoch": 0.17482517482517482,
"grad_norm": 0.29273821007360346,
"learning_rate": 1e-06,
"loss": 0.0077,
"reward": 4.071373038291931,
"reward_std": 0.18497788973152637,
"rewards/agent_reward_func_MC": 0.9020317900180816,
"rewards/correctness_reward_func": 1.8876190567016602,
"rewards/correctness_reward_func_eval": 0.9317460560798645,
"rewards/format_reward_func": 0.3499761837720871,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 1005.7043090820313,
"epoch": 0.18453768453768454,
"grad_norm": 0.24921090330226142,
"learning_rate": 1e-06,
"loss": 0.0056,
"reward": 4.00266664981842,
"reward_std": 0.22064166717231273,
"rewards/agent_reward_func_MC": 0.8949206686019897,
"rewards/correctness_reward_func": 1.849523823261261,
"rewards/correctness_reward_func_eval": 0.9084127140045166,
"rewards/format_reward_func": 0.3498095166683197,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 943.940966796875,
"epoch": 0.19425019425019424,
"grad_norm": 0.255215749280062,
"learning_rate": 1e-06,
"loss": -0.0096,
"reward": 3.9010079383850096,
"reward_std": 0.26742793841287493,
"rewards/agent_reward_func_MC": 0.8706984454393387,
"rewards/correctness_reward_func": 1.796190493106842,
"rewards/correctness_reward_func_eval": 0.8842857277393341,
"rewards/format_reward_func": 0.3498333257436752,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 1131.998592529297,
"epoch": 0.20396270396270397,
"grad_norm": 0.24196861235641634,
"learning_rate": 1e-06,
"loss": 0.0061,
"reward": 3.9897142422199248,
"reward_std": 0.2609097701497376,
"rewards/agent_reward_func_MC": 0.8956508328020573,
"rewards/correctness_reward_func": 1.8285714424401522,
"rewards/correctness_reward_func_eval": 0.9158730319142342,
"rewards/format_reward_func": 0.34961903989315035,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 1269.4943090820314,
"epoch": 0.21367521367521367,
"grad_norm": 0.22055070432080545,
"learning_rate": 1e-06,
"loss": 0.0059,
"reward": 3.955634880065918,
"reward_std": 0.2273477977141738,
"rewards/agent_reward_func_MC": 0.8830476495623588,
"rewards/correctness_reward_func": 1.8095238244533538,
"rewards/correctness_reward_func_eval": 0.9134920847415924,
"rewards/format_reward_func": 0.3495714205503464,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 1250.7200256347655,
"epoch": 0.2233877233877234,
"grad_norm": 0.28987017976023693,
"learning_rate": 1e-06,
"loss": -0.0021,
"reward": 4.0416190052032475,
"reward_std": 0.20220871651545166,
"rewards/agent_reward_func_MC": 0.8979682916402817,
"rewards/correctness_reward_func": 1.8542857229709626,
"rewards/correctness_reward_func_eval": 0.9393650984764099,
"rewards/format_reward_func": 0.3499999940395355,
"step": 115
},
{
"clip_ratio": 0.0,
"completion_length": 1257.4157287597657,
"epoch": 0.2331002331002331,
"grad_norm": 0.19555680054474148,
"learning_rate": 1e-06,
"loss": 0.008,
"reward": 3.912722191810608,
"reward_std": 0.22784368658438325,
"rewards/agent_reward_func_MC": 0.869650827050209,
"rewards/correctness_reward_func": 1.796190493106842,
"rewards/correctness_reward_func_eval": 0.897142875790596,
"rewards/format_reward_func": 0.3497380870580673,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 1272.7885961914062,
"epoch": 0.24281274281274282,
"grad_norm": 0.18255389452293974,
"learning_rate": 1e-06,
"loss": -0.0106,
"reward": 3.8947618293762205,
"reward_std": 0.20762850038707256,
"rewards/agent_reward_func_MC": 0.8573016184568405,
"rewards/correctness_reward_func": 1.773333351612091,
"rewards/correctness_reward_func_eval": 0.9141269952058793,
"rewards/format_reward_func": 0.3499999940395355,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 1440.9728833007812,
"epoch": 0.25252525252525254,
"grad_norm": 0.2547576435452318,
"learning_rate": 1e-06,
"loss": 0.0119,
"reward": 3.864793620109558,
"reward_std": 0.29515512300655244,
"rewards/agent_reward_func_MC": 0.8445079725980759,
"rewards/correctness_reward_func": 1.7628571581840515,
"rewards/correctness_reward_func_eval": 0.9076190626621247,
"rewards/format_reward_func": 0.34980951607227323,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 1441.5447827148437,
"epoch": 0.26223776223776224,
"grad_norm": 0.24563201564074505,
"learning_rate": 1e-06,
"loss": 0.0135,
"reward": 3.883611145019531,
"reward_std": 0.2823502243310213,
"rewards/agent_reward_func_MC": 0.8565397095680237,
"rewards/correctness_reward_func": 1.7819047725200654,
"rewards/correctness_reward_func_eval": 0.8952381134033203,
"rewards/format_reward_func": 0.34992856562137603,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 1483.5781201171876,
"epoch": 0.27195027195027194,
"grad_norm": 0.22800872368606584,
"learning_rate": 1e-06,
"loss": 0.018,
"reward": 3.8423254334926606,
"reward_std": 0.2927846448868513,
"rewards/agent_reward_func_MC": 0.836825436502695,
"rewards/correctness_reward_func": 1.755238108932972,
"rewards/correctness_reward_func_eval": 0.900476205945015,
"rewards/format_reward_func": 0.34978570640087125,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 1435.1005004882813,
"epoch": 0.28166278166278164,
"grad_norm": 0.1789329116748729,
"learning_rate": 1e-06,
"loss": 0.0029,
"reward": 3.8026666712760924,
"reward_std": 0.2736345401033759,
"rewards/agent_reward_func_MC": 0.8252063795924187,
"rewards/correctness_reward_func": 1.7352381092309952,
"rewards/correctness_reward_func_eval": 0.8922222399711609,
"rewards/format_reward_func": 0.3499999940395355,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 1238.4776416015625,
"epoch": 0.2913752913752914,
"grad_norm": 0.25524578153777566,
"learning_rate": 1e-06,
"loss": 0.0066,
"reward": 3.9930158853530884,
"reward_std": 0.2553951171413064,
"rewards/agent_reward_func_MC": 0.876190505027771,
"rewards/correctness_reward_func": 1.8361904859542846,
"rewards/correctness_reward_func_eval": 0.9306349349021912,
"rewards/format_reward_func": 0.3499999940395355,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 1195.0243041992187,
"epoch": 0.3010878010878011,
"grad_norm": 0.23520669203586708,
"learning_rate": 1e-06,
"loss": 0.0052,
"reward": 4.016698341369629,
"reward_std": 0.281817892305553,
"rewards/agent_reward_func_MC": 0.9011428928375245,
"rewards/correctness_reward_func": 1.8390476369857789,
"rewards/correctness_reward_func_eval": 0.9265079569816589,
"rewards/format_reward_func": 0.3499999940395355,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 1184.551446533203,
"epoch": 0.3108003108003108,
"grad_norm": 0.23613211319734428,
"learning_rate": 1e-06,
"loss": 0.0083,
"reward": 3.7817697978019713,
"reward_std": 0.32593878942541776,
"rewards/agent_reward_func_MC": 0.8337143290042878,
"rewards/correctness_reward_func": 1.715238115787506,
"rewards/correctness_reward_func_eval": 0.8831746190786361,
"rewards/format_reward_func": 0.34964285016059876,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 1244.0347839355468,
"epoch": 0.32051282051282054,
"grad_norm": 0.24529652266955448,
"learning_rate": 1e-06,
"loss": 0.0014,
"reward": 3.7896666431427004,
"reward_std": 0.3328119495511055,
"rewards/agent_reward_func_MC": 0.8448571795225144,
"rewards/correctness_reward_func": 1.718095259666443,
"rewards/correctness_reward_func_eval": 0.8771428710222244,
"rewards/format_reward_func": 0.34957142114639284,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 1447.985263671875,
"epoch": 0.33022533022533024,
"grad_norm": 0.2065211585384318,
"learning_rate": 1e-06,
"loss": 0.0103,
"reward": 3.7185713863372802,
"reward_std": 0.3374773776344955,
"rewards/agent_reward_func_MC": 0.8139047813415528,
"rewards/correctness_reward_func": 1.686666680574417,
"rewards/correctness_reward_func_eval": 0.8680952602624893,
"rewards/format_reward_func": 0.3499047553539276,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 1459.0500244140626,
"epoch": 0.33993783993783994,
"grad_norm": 0.21734053290147362,
"learning_rate": 1e-06,
"loss": 0.0124,
"reward": 3.979817385673523,
"reward_std": 0.31312828628346323,
"rewards/agent_reward_func_MC": 0.8849206674098968,
"rewards/correctness_reward_func": 1.8247619199752807,
"rewards/correctness_reward_func_eval": 0.9206349372863769,
"rewards/format_reward_func": 0.34949999272823334,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 1425.19859375,
"epoch": 0.34965034965034963,
"grad_norm": 0.2907016194013063,
"learning_rate": 1e-06,
"loss": 0.0084,
"reward": 3.65442857503891,
"reward_std": 0.3651884417142719,
"rewards/agent_reward_func_MC": 0.785936538875103,
"rewards/correctness_reward_func": 1.647619072496891,
"rewards/correctness_reward_func_eval": 0.8711111295223236,
"rewards/format_reward_func": 0.3497618967294693,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 1544.9428930664062,
"epoch": 0.3593628593628594,
"grad_norm": 0.20955445164203745,
"learning_rate": 1e-06,
"loss": 0.0074,
"reward": 3.2816984033584595,
"reward_std": 0.4594585011713207,
"rewards/agent_reward_func_MC": 0.6978095433861017,
"rewards/correctness_reward_func": 1.4428571613132954,
"rewards/correctness_reward_func_eval": 0.7912698584794998,
"rewards/format_reward_func": 0.3497618967294693,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 1710.9262353515626,
"epoch": 0.3690753690753691,
"grad_norm": 0.22177847546734317,
"learning_rate": 1e-06,
"loss": 0.0104,
"reward": 3.5176428842544554,
"reward_std": 0.38553844563663003,
"rewards/agent_reward_func_MC": 0.7493016171455383,
"rewards/correctness_reward_func": 1.5676190626621247,
"rewards/correctness_reward_func_eval": 0.8507936751842499,
"rewards/format_reward_func": 0.34992856562137603,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 1679.4776538085937,
"epoch": 0.3787878787878788,
"grad_norm": 0.2779597660638594,
"learning_rate": 1e-06,
"loss": 0.0073,
"reward": 3.700857148170471,
"reward_std": 0.40202371578663587,
"rewards/agent_reward_func_MC": 0.7945079684257508,
"rewards/correctness_reward_func": 1.682857164144516,
"rewards/correctness_reward_func_eval": 0.8734920841455459,
"rewards/format_reward_func": 0.3499999940395355,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 1529.8266943359374,
"epoch": 0.3885003885003885,
"grad_norm": 0.24354444168029116,
"learning_rate": 1e-06,
"loss": 0.0098,
"reward": 3.8641825485229493,
"reward_std": 0.3314341966807842,
"rewards/agent_reward_func_MC": 0.8359365397691727,
"rewards/correctness_reward_func": 1.7533333575725556,
"rewards/correctness_reward_func_eval": 0.9250793838500977,
"rewards/format_reward_func": 0.34983332693576813,
"step": 200
}
],
"logging_steps": 5,
"max_steps": 514,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}