| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.3885003885003885, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 748.7643005371094, |
| "epoch": 0.009712509712509712, |
| "grad_norm": 0.3736153689307914, |
| "learning_rate": 1e-06, |
| "loss": -0.0068, |
| "reward": 3.822531852722168, |
| "reward_std": 0.476869178712368, |
| "rewards/agent_reward_func_MC": 0.8567936837673187, |
| "rewards/correctness_reward_func": 1.8019047832489015, |
| "rewards/correctness_reward_func_eval": 0.8142857432365418, |
| "rewards/format_reward_func": 0.3495476073026657, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 836.8619177246094, |
| "epoch": 0.019425019425019424, |
| "grad_norm": 0.3188121982595845, |
| "learning_rate": 1e-06, |
| "loss": -0.0107, |
| "reward": 3.90938099861145, |
| "reward_std": 0.38552937254309655, |
| "rewards/agent_reward_func_MC": 0.8773333775997162, |
| "rewards/correctness_reward_func": 1.8704762172698974, |
| "rewards/correctness_reward_func_eval": 0.812380975484848, |
| "rewards/format_reward_func": 0.3491904670000076, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 973.8400158691406, |
| "epoch": 0.029137529137529136, |
| "grad_norm": 0.27198200236481873, |
| "learning_rate": 1e-06, |
| "loss": -0.0031, |
| "reward": 3.8425635862350465, |
| "reward_std": 0.4346236677467823, |
| "rewards/agent_reward_func_MC": 0.864666701555252, |
| "rewards/correctness_reward_func": 1.8247619271278381, |
| "rewards/correctness_reward_func_eval": 0.8034920811653137, |
| "rewards/format_reward_func": 0.34964284658432004, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1018.5909753417968, |
| "epoch": 0.03885003885003885, |
| "grad_norm": 0.28510359189330664, |
| "learning_rate": 1e-06, |
| "loss": 0.011, |
| "reward": 3.9731746339797973, |
| "reward_std": 0.2720495498180389, |
| "rewards/agent_reward_func_MC": 0.8882857489585877, |
| "rewards/correctness_reward_func": 1.8980952572822571, |
| "rewards/correctness_reward_func_eval": 0.8369841420650482, |
| "rewards/format_reward_func": 0.34980951368808744, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1140.068126220703, |
| "epoch": 0.04856254856254856, |
| "grad_norm": 0.23329425011386498, |
| "learning_rate": 1e-06, |
| "loss": 0.0032, |
| "reward": 3.988293719291687, |
| "reward_std": 0.30129268489778044, |
| "rewards/agent_reward_func_MC": 0.8880000543594361, |
| "rewards/correctness_reward_func": 1.896190493106842, |
| "rewards/correctness_reward_func_eval": 0.8541270065307617, |
| "rewards/format_reward_func": 0.3499761837720871, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1093.8681176757811, |
| "epoch": 0.05827505827505827, |
| "grad_norm": 0.29946322491146965, |
| "learning_rate": 1e-06, |
| "loss": -0.0109, |
| "reward": 3.977658863067627, |
| "reward_std": 0.32108820773661134, |
| "rewards/agent_reward_func_MC": 0.869904808998108, |
| "rewards/correctness_reward_func": 1.8838095450401307, |
| "rewards/correctness_reward_func_eval": 0.8739682829380035, |
| "rewards/format_reward_func": 0.34997618436813355, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1075.1785949707032, |
| "epoch": 0.06798756798756798, |
| "grad_norm": 0.3040633702971847, |
| "learning_rate": 1e-06, |
| "loss": 0.0037, |
| "reward": 4.008127021789551, |
| "reward_std": 0.3045533967390657, |
| "rewards/agent_reward_func_MC": 0.8873333740234375, |
| "rewards/correctness_reward_func": 1.892380964756012, |
| "rewards/correctness_reward_func_eval": 0.8784127116203309, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1051.7819262695311, |
| "epoch": 0.0777000777000777, |
| "grad_norm": 0.2654552473631526, |
| "learning_rate": 1e-06, |
| "loss": 0.0047, |
| "reward": 3.9801508474349974, |
| "reward_std": 0.3097702523320913, |
| "rewards/agent_reward_func_MC": 0.8759365463256836, |
| "rewards/correctness_reward_func": 1.8752381134033203, |
| "rewards/correctness_reward_func_eval": 0.8790476399660111, |
| "rewards/format_reward_func": 0.34992856383323667, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 920.3571643066406, |
| "epoch": 0.08741258741258741, |
| "grad_norm": 0.30787656246569944, |
| "learning_rate": 1e-06, |
| "loss": 0.0022, |
| "reward": 4.030627017021179, |
| "reward_std": 0.257388199865818, |
| "rewards/agent_reward_func_MC": 0.9038730561733246, |
| "rewards/correctness_reward_func": 1.8942857313156127, |
| "rewards/correctness_reward_func_eval": 0.8825397026538849, |
| "rewards/format_reward_func": 0.34992856383323667, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 950.1019201660156, |
| "epoch": 0.09712509712509712, |
| "grad_norm": 0.278978790381264, |
| "learning_rate": 1e-06, |
| "loss": 0.0049, |
| "reward": 4.0613808870315555, |
| "reward_std": 0.2030999060533941, |
| "rewards/agent_reward_func_MC": 0.9173016262054443, |
| "rewards/correctness_reward_func": 1.8819047713279724, |
| "rewards/correctness_reward_func_eval": 0.9122222352027893, |
| "rewards/format_reward_func": 0.34995237350463865, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 988.4190625, |
| "epoch": 0.10683760683760683, |
| "grad_norm": 0.28501385265891443, |
| "learning_rate": 1e-06, |
| "loss": 0.0041, |
| "reward": 4.06360315322876, |
| "reward_std": 0.28966286245733497, |
| "rewards/agent_reward_func_MC": 0.9020952671766281, |
| "rewards/correctness_reward_func": 1.896190493106842, |
| "rewards/correctness_reward_func_eval": 0.9155555760860443, |
| "rewards/format_reward_func": 0.34976189851760864, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 988.201923828125, |
| "epoch": 0.11655011655011654, |
| "grad_norm": 0.35841673382474903, |
| "learning_rate": 1e-06, |
| "loss": 0.0082, |
| "reward": 4.126190505027771, |
| "reward_std": 0.24293919634073974, |
| "rewards/agent_reward_func_MC": 0.918984169960022, |
| "rewards/correctness_reward_func": 1.9352381134033203, |
| "rewards/correctness_reward_func_eval": 0.9220635092258453, |
| "rewards/format_reward_func": 0.3499047553539276, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 964.7343017578125, |
| "epoch": 0.12626262626262627, |
| "grad_norm": 0.3255852780092938, |
| "learning_rate": 1e-06, |
| "loss": 0.003, |
| "reward": 4.03707145690918, |
| "reward_std": 0.2535955292731524, |
| "rewards/agent_reward_func_MC": 0.9001587682962418, |
| "rewards/correctness_reward_func": 1.8800000143051148, |
| "rewards/correctness_reward_func_eval": 0.9069841504096985, |
| "rewards/format_reward_func": 0.34992856442928316, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1023.8019348144531, |
| "epoch": 0.13597513597513597, |
| "grad_norm": 0.258258613758542, |
| "learning_rate": 1e-06, |
| "loss": 0.0021, |
| "reward": 4.010547647476196, |
| "reward_std": 0.30033121041953564, |
| "rewards/agent_reward_func_MC": 0.8870794075727463, |
| "rewards/correctness_reward_func": 1.867619068622589, |
| "rewards/correctness_reward_func_eval": 0.9058730411529541, |
| "rewards/format_reward_func": 0.34997618436813355, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1049.6647875976562, |
| "epoch": 0.1456876456876457, |
| "grad_norm": 0.2868301098892232, |
| "learning_rate": 1e-06, |
| "loss": 0.0083, |
| "reward": 4.163015942573548, |
| "reward_std": 0.19761438958346844, |
| "rewards/agent_reward_func_MC": 0.936666705608368, |
| "rewards/correctness_reward_func": 1.9514285826683044, |
| "rewards/correctness_reward_func_eval": 0.9249206626415253, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1131.1047875976562, |
| "epoch": 0.1554001554001554, |
| "grad_norm": 0.22388105570487274, |
| "learning_rate": 1e-06, |
| "loss": 0.007, |
| "reward": 3.9448730325698853, |
| "reward_std": 0.26593201816082, |
| "rewards/agent_reward_func_MC": 0.8765079736709595, |
| "rewards/correctness_reward_func": 1.839047634601593, |
| "rewards/correctness_reward_func_eval": 0.879365097284317, |
| "rewards/format_reward_func": 0.34995237410068514, |
| "step": 80 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1176.7590686035155, |
| "epoch": 0.16511266511266512, |
| "grad_norm": 0.2821630374549152, |
| "learning_rate": 1e-06, |
| "loss": 0.0177, |
| "reward": 3.8298174810409544, |
| "reward_std": 0.25791051633656026, |
| "rewards/agent_reward_func_MC": 0.8357143165171146, |
| "rewards/correctness_reward_func": 1.7523809648305178, |
| "rewards/correctness_reward_func_eval": 0.8917460405826568, |
| "rewards/format_reward_func": 0.3499761837720871, |
| "step": 85 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1018.4166821289062, |
| "epoch": 0.17482517482517482, |
| "grad_norm": 0.29273821007360346, |
| "learning_rate": 1e-06, |
| "loss": 0.0077, |
| "reward": 4.071373038291931, |
| "reward_std": 0.18497788973152637, |
| "rewards/agent_reward_func_MC": 0.9020317900180816, |
| "rewards/correctness_reward_func": 1.8876190567016602, |
| "rewards/correctness_reward_func_eval": 0.9317460560798645, |
| "rewards/format_reward_func": 0.3499761837720871, |
| "step": 90 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1005.7043090820313, |
| "epoch": 0.18453768453768454, |
| "grad_norm": 0.24921090330226142, |
| "learning_rate": 1e-06, |
| "loss": 0.0056, |
| "reward": 4.00266664981842, |
| "reward_std": 0.22064166717231273, |
| "rewards/agent_reward_func_MC": 0.8949206686019897, |
| "rewards/correctness_reward_func": 1.849523823261261, |
| "rewards/correctness_reward_func_eval": 0.9084127140045166, |
| "rewards/format_reward_func": 0.3498095166683197, |
| "step": 95 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 943.940966796875, |
| "epoch": 0.19425019425019424, |
| "grad_norm": 0.255215749280062, |
| "learning_rate": 1e-06, |
| "loss": -0.0096, |
| "reward": 3.9010079383850096, |
| "reward_std": 0.26742793841287493, |
| "rewards/agent_reward_func_MC": 0.8706984454393387, |
| "rewards/correctness_reward_func": 1.796190493106842, |
| "rewards/correctness_reward_func_eval": 0.8842857277393341, |
| "rewards/format_reward_func": 0.3498333257436752, |
| "step": 100 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1131.998592529297, |
| "epoch": 0.20396270396270397, |
| "grad_norm": 0.24196861235641634, |
| "learning_rate": 1e-06, |
| "loss": 0.0061, |
| "reward": 3.9897142422199248, |
| "reward_std": 0.2609097701497376, |
| "rewards/agent_reward_func_MC": 0.8956508328020573, |
| "rewards/correctness_reward_func": 1.8285714424401522, |
| "rewards/correctness_reward_func_eval": 0.9158730319142342, |
| "rewards/format_reward_func": 0.34961903989315035, |
| "step": 105 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1269.4943090820314, |
| "epoch": 0.21367521367521367, |
| "grad_norm": 0.22055070432080545, |
| "learning_rate": 1e-06, |
| "loss": 0.0059, |
| "reward": 3.955634880065918, |
| "reward_std": 0.2273477977141738, |
| "rewards/agent_reward_func_MC": 0.8830476495623588, |
| "rewards/correctness_reward_func": 1.8095238244533538, |
| "rewards/correctness_reward_func_eval": 0.9134920847415924, |
| "rewards/format_reward_func": 0.3495714205503464, |
| "step": 110 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1250.7200256347655, |
| "epoch": 0.2233877233877234, |
| "grad_norm": 0.28987017976023693, |
| "learning_rate": 1e-06, |
| "loss": -0.0021, |
| "reward": 4.0416190052032475, |
| "reward_std": 0.20220871651545166, |
| "rewards/agent_reward_func_MC": 0.8979682916402817, |
| "rewards/correctness_reward_func": 1.8542857229709626, |
| "rewards/correctness_reward_func_eval": 0.9393650984764099, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 115 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1257.4157287597657, |
| "epoch": 0.2331002331002331, |
| "grad_norm": 0.19555680054474148, |
| "learning_rate": 1e-06, |
| "loss": 0.008, |
| "reward": 3.912722191810608, |
| "reward_std": 0.22784368658438325, |
| "rewards/agent_reward_func_MC": 0.869650827050209, |
| "rewards/correctness_reward_func": 1.796190493106842, |
| "rewards/correctness_reward_func_eval": 0.897142875790596, |
| "rewards/format_reward_func": 0.3497380870580673, |
| "step": 120 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1272.7885961914062, |
| "epoch": 0.24281274281274282, |
| "grad_norm": 0.18255389452293974, |
| "learning_rate": 1e-06, |
| "loss": -0.0106, |
| "reward": 3.8947618293762205, |
| "reward_std": 0.20762850038707256, |
| "rewards/agent_reward_func_MC": 0.8573016184568405, |
| "rewards/correctness_reward_func": 1.773333351612091, |
| "rewards/correctness_reward_func_eval": 0.9141269952058793, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 125 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1440.9728833007812, |
| "epoch": 0.25252525252525254, |
| "grad_norm": 0.2547576435452318, |
| "learning_rate": 1e-06, |
| "loss": 0.0119, |
| "reward": 3.864793620109558, |
| "reward_std": 0.29515512300655244, |
| "rewards/agent_reward_func_MC": 0.8445079725980759, |
| "rewards/correctness_reward_func": 1.7628571581840515, |
| "rewards/correctness_reward_func_eval": 0.9076190626621247, |
| "rewards/format_reward_func": 0.34980951607227323, |
| "step": 130 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1441.5447827148437, |
| "epoch": 0.26223776223776224, |
| "grad_norm": 0.24563201564074505, |
| "learning_rate": 1e-06, |
| "loss": 0.0135, |
| "reward": 3.883611145019531, |
| "reward_std": 0.2823502243310213, |
| "rewards/agent_reward_func_MC": 0.8565397095680237, |
| "rewards/correctness_reward_func": 1.7819047725200654, |
| "rewards/correctness_reward_func_eval": 0.8952381134033203, |
| "rewards/format_reward_func": 0.34992856562137603, |
| "step": 135 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1483.5781201171876, |
| "epoch": 0.27195027195027194, |
| "grad_norm": 0.22800872368606584, |
| "learning_rate": 1e-06, |
| "loss": 0.018, |
| "reward": 3.8423254334926606, |
| "reward_std": 0.2927846448868513, |
| "rewards/agent_reward_func_MC": 0.836825436502695, |
| "rewards/correctness_reward_func": 1.755238108932972, |
| "rewards/correctness_reward_func_eval": 0.900476205945015, |
| "rewards/format_reward_func": 0.34978570640087125, |
| "step": 140 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1435.1005004882813, |
| "epoch": 0.28166278166278164, |
| "grad_norm": 0.1789329116748729, |
| "learning_rate": 1e-06, |
| "loss": 0.0029, |
| "reward": 3.8026666712760924, |
| "reward_std": 0.2736345401033759, |
| "rewards/agent_reward_func_MC": 0.8252063795924187, |
| "rewards/correctness_reward_func": 1.7352381092309952, |
| "rewards/correctness_reward_func_eval": 0.8922222399711609, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 145 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1238.4776416015625, |
| "epoch": 0.2913752913752914, |
| "grad_norm": 0.25524578153777566, |
| "learning_rate": 1e-06, |
| "loss": 0.0066, |
| "reward": 3.9930158853530884, |
| "reward_std": 0.2553951171413064, |
| "rewards/agent_reward_func_MC": 0.876190505027771, |
| "rewards/correctness_reward_func": 1.8361904859542846, |
| "rewards/correctness_reward_func_eval": 0.9306349349021912, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 150 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1195.0243041992187, |
| "epoch": 0.3010878010878011, |
| "grad_norm": 0.23520669203586708, |
| "learning_rate": 1e-06, |
| "loss": 0.0052, |
| "reward": 4.016698341369629, |
| "reward_std": 0.281817892305553, |
| "rewards/agent_reward_func_MC": 0.9011428928375245, |
| "rewards/correctness_reward_func": 1.8390476369857789, |
| "rewards/correctness_reward_func_eval": 0.9265079569816589, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 155 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1184.551446533203, |
| "epoch": 0.3108003108003108, |
| "grad_norm": 0.23613211319734428, |
| "learning_rate": 1e-06, |
| "loss": 0.0083, |
| "reward": 3.7817697978019713, |
| "reward_std": 0.32593878942541776, |
| "rewards/agent_reward_func_MC": 0.8337143290042878, |
| "rewards/correctness_reward_func": 1.715238115787506, |
| "rewards/correctness_reward_func_eval": 0.8831746190786361, |
| "rewards/format_reward_func": 0.34964285016059876, |
| "step": 160 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1244.0347839355468, |
| "epoch": 0.32051282051282054, |
| "grad_norm": 0.24529652266955448, |
| "learning_rate": 1e-06, |
| "loss": 0.0014, |
| "reward": 3.7896666431427004, |
| "reward_std": 0.3328119495511055, |
| "rewards/agent_reward_func_MC": 0.8448571795225144, |
| "rewards/correctness_reward_func": 1.718095259666443, |
| "rewards/correctness_reward_func_eval": 0.8771428710222244, |
| "rewards/format_reward_func": 0.34957142114639284, |
| "step": 165 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1447.985263671875, |
| "epoch": 0.33022533022533024, |
| "grad_norm": 0.2065211585384318, |
| "learning_rate": 1e-06, |
| "loss": 0.0103, |
| "reward": 3.7185713863372802, |
| "reward_std": 0.3374773776344955, |
| "rewards/agent_reward_func_MC": 0.8139047813415528, |
| "rewards/correctness_reward_func": 1.686666680574417, |
| "rewards/correctness_reward_func_eval": 0.8680952602624893, |
| "rewards/format_reward_func": 0.3499047553539276, |
| "step": 170 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1459.0500244140626, |
| "epoch": 0.33993783993783994, |
| "grad_norm": 0.21734053290147362, |
| "learning_rate": 1e-06, |
| "loss": 0.0124, |
| "reward": 3.979817385673523, |
| "reward_std": 0.31312828628346323, |
| "rewards/agent_reward_func_MC": 0.8849206674098968, |
| "rewards/correctness_reward_func": 1.8247619199752807, |
| "rewards/correctness_reward_func_eval": 0.9206349372863769, |
| "rewards/format_reward_func": 0.34949999272823334, |
| "step": 175 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1425.19859375, |
| "epoch": 0.34965034965034963, |
| "grad_norm": 0.2907016194013063, |
| "learning_rate": 1e-06, |
| "loss": 0.0084, |
| "reward": 3.65442857503891, |
| "reward_std": 0.3651884417142719, |
| "rewards/agent_reward_func_MC": 0.785936538875103, |
| "rewards/correctness_reward_func": 1.647619072496891, |
| "rewards/correctness_reward_func_eval": 0.8711111295223236, |
| "rewards/format_reward_func": 0.3497618967294693, |
| "step": 180 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1544.9428930664062, |
| "epoch": 0.3593628593628594, |
| "grad_norm": 0.20955445164203745, |
| "learning_rate": 1e-06, |
| "loss": 0.0074, |
| "reward": 3.2816984033584595, |
| "reward_std": 0.4594585011713207, |
| "rewards/agent_reward_func_MC": 0.6978095433861017, |
| "rewards/correctness_reward_func": 1.4428571613132954, |
| "rewards/correctness_reward_func_eval": 0.7912698584794998, |
| "rewards/format_reward_func": 0.3497618967294693, |
| "step": 185 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1710.9262353515626, |
| "epoch": 0.3690753690753691, |
| "grad_norm": 0.22177847546734317, |
| "learning_rate": 1e-06, |
| "loss": 0.0104, |
| "reward": 3.5176428842544554, |
| "reward_std": 0.38553844563663003, |
| "rewards/agent_reward_func_MC": 0.7493016171455383, |
| "rewards/correctness_reward_func": 1.5676190626621247, |
| "rewards/correctness_reward_func_eval": 0.8507936751842499, |
| "rewards/format_reward_func": 0.34992856562137603, |
| "step": 190 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1679.4776538085937, |
| "epoch": 0.3787878787878788, |
| "grad_norm": 0.2779597660638594, |
| "learning_rate": 1e-06, |
| "loss": 0.0073, |
| "reward": 3.700857148170471, |
| "reward_std": 0.40202371578663587, |
| "rewards/agent_reward_func_MC": 0.7945079684257508, |
| "rewards/correctness_reward_func": 1.682857164144516, |
| "rewards/correctness_reward_func_eval": 0.8734920841455459, |
| "rewards/format_reward_func": 0.3499999940395355, |
| "step": 195 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 1529.8266943359374, |
| "epoch": 0.3885003885003885, |
| "grad_norm": 0.24354444168029116, |
| "learning_rate": 1e-06, |
| "loss": 0.0098, |
| "reward": 3.8641825485229493, |
| "reward_std": 0.3314341966807842, |
| "rewards/agent_reward_func_MC": 0.8359365397691727, |
| "rewards/correctness_reward_func": 1.7533333575725556, |
| "rewards/correctness_reward_func_eval": 0.9250793838500977, |
| "rewards/format_reward_func": 0.34983332693576813, |
| "step": 200 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 514, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|