| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.018465515649524512, |
| "eval_steps": 10000, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00036931031299049027, |
| "grad_norm": 0.05093964371235935, |
| "learning_rate": 0.0, |
| "loss": 0.0176, |
| "reward/mean": 0.4305254817008972, |
| "reward/std": 0.023368891328573227, |
| "rewards/correct_answer_reward_func/mean": 0.8723958134651184, |
| "rewards/correct_answer_reward_func/std": 0.3338659703731537, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.939539909362793, |
| "rewards/correct_extract_func/std": 0.2306855320930481, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4062052965164185, |
| "rewards/format_reward_func/std": 0.023711344227194786, |
| "step": 1 |
| }, |
| { |
| "epoch": 0.0007386206259809805, |
| "grad_norm": 0.05093885614710434, |
| "learning_rate": 1e-07, |
| "loss": 0.0176, |
| "step": 2 |
| }, |
| { |
| "epoch": 0.0011079309389714707, |
| "grad_norm": 0.049709599363636135, |
| "learning_rate": 2e-07, |
| "loss": 0.0177, |
| "reward/mean": 0.43362969160079956, |
| "reward/std": 0.024930372834205627, |
| "rewards/correct_answer_reward_func/mean": 0.8841145634651184, |
| "rewards/correct_answer_reward_func/std": 0.3202960789203644, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9216580390930176, |
| "rewards/correct_extract_func/std": 0.25809445977211, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4074559211730957, |
| "rewards/format_reward_func/std": 0.018833689391613007, |
| "step": 3 |
| }, |
| { |
| "epoch": 0.001477241251961961, |
| "grad_norm": 0.04919887971444478, |
| "learning_rate": 3e-07, |
| "loss": 0.0177, |
| "step": 4 |
| }, |
| { |
| "epoch": 0.0018465515649524512, |
| "grad_norm": 0.04609056285217934, |
| "learning_rate": 4e-07, |
| "loss": 0.017, |
| "reward/mean": 0.43184012174606323, |
| "reward/std": 0.016295205801725388, |
| "rewards/correct_answer_reward_func/mean": 0.87890625, |
| "rewards/correct_answer_reward_func/std": 0.32644879817962646, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9188368320465088, |
| "rewards/correct_extract_func/std": 0.2631855309009552, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4060311317443848, |
| "rewards/format_reward_func/std": 0.019542310386896133, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.0022158618779429414, |
| "grad_norm": 0.04790138249587581, |
| "learning_rate": 5e-07, |
| "loss": 0.017, |
| "step": 6 |
| }, |
| { |
| "epoch": 0.002585172190933432, |
| "grad_norm": 0.07345664511629879, |
| "learning_rate": 6e-07, |
| "loss": 0.0171, |
| "reward/mean": 0.4144955277442932, |
| "reward/std": 0.025130389258265495, |
| "rewards/correct_answer_reward_func/mean": 0.8255208134651184, |
| "rewards/correct_answer_reward_func/std": 0.37976834177970886, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.880946159362793, |
| "rewards/correct_extract_func/std": 0.31111887097358704, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4041085243225098, |
| "rewards/format_reward_func/std": 0.03701591119170189, |
| "step": 7 |
| }, |
| { |
| "epoch": 0.002954482503923922, |
| "grad_norm": 0.07610320387658168, |
| "learning_rate": 7e-07, |
| "loss": 0.0171, |
| "step": 8 |
| }, |
| { |
| "epoch": 0.0033237928169144123, |
| "grad_norm": 0.05112137950522487, |
| "learning_rate": 8e-07, |
| "loss": 0.0181, |
| "reward/mean": 0.4387373626232147, |
| "reward/std": 0.019346633926033974, |
| "rewards/correct_answer_reward_func/mean": 0.9036458134651184, |
| "rewards/correct_answer_reward_func/std": 0.2952686548233032, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9055989384651184, |
| "rewards/correct_extract_func/std": 0.28282052278518677, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4047561883926392, |
| "rewards/format_reward_func/std": 0.06315362453460693, |
| "step": 9 |
| }, |
| { |
| "epoch": 0.0036931031299049025, |
| "grad_norm": 0.05094809364589051, |
| "learning_rate": 9e-07, |
| "loss": 0.0181, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.004062413442895393, |
| "grad_norm": 0.05275996751985643, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "reward/mean": 0.43490076065063477, |
| "reward/std": 0.02377907559275627, |
| "rewards/correct_answer_reward_func/mean": 0.8893229365348816, |
| "rewards/correct_answer_reward_func/std": 0.3139362931251526, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9134114384651184, |
| "rewards/correct_extract_func/std": 0.2700866460800171, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.407015323638916, |
| "rewards/format_reward_func/std": 0.01986781507730484, |
| "step": 11 |
| }, |
| { |
| "epoch": 0.004431723755885883, |
| "grad_norm": 0.051170250158850315, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "step": 12 |
| }, |
| { |
| "epoch": 0.004801034068876373, |
| "grad_norm": 0.05767247415587884, |
| "learning_rate": 1e-06, |
| "loss": 0.0181, |
| "reward/mean": 0.4389788508415222, |
| "reward/std": 0.02013307623565197, |
| "rewards/correct_answer_reward_func/mean": 0.9049479365348816, |
| "rewards/correct_answer_reward_func/std": 0.29347798228263855, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.8903212547302246, |
| "rewards/correct_extract_func/std": 0.2998242974281311, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4078478813171387, |
| "rewards/format_reward_func/std": 0.014290675520896912, |
| "step": 13 |
| }, |
| { |
| "epoch": 0.005170344381866864, |
| "grad_norm": 0.05451251011427531, |
| "learning_rate": 1e-06, |
| "loss": 0.0181, |
| "step": 14 |
| }, |
| { |
| "epoch": 0.005539654694857354, |
| "grad_norm": 0.0548131395951059, |
| "learning_rate": 1e-06, |
| "loss": 0.0181, |
| "reward/mean": 0.4339887797832489, |
| "reward/std": 0.018782436847686768, |
| "rewards/correct_answer_reward_func/mean": 0.88671875, |
| "rewards/correct_answer_reward_func/std": 0.3171428442001343, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9032118320465088, |
| "rewards/correct_extract_func/std": 0.2824605405330658, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4087677001953125, |
| "rewards/format_reward_func/std": 0.012980460189282894, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.005908965007847844, |
| "grad_norm": 0.05240983246508732, |
| "learning_rate": 1e-06, |
| "loss": 0.0181, |
| "step": 16 |
| }, |
| { |
| "epoch": 0.006278275320838334, |
| "grad_norm": 0.05134459409849994, |
| "learning_rate": 1e-06, |
| "loss": 0.0182, |
| "reward/mean": 0.43516525626182556, |
| "reward/std": 0.02198929898440838, |
| "rewards/correct_answer_reward_func/mean": 0.89453125, |
| "rewards/correct_answer_reward_func/std": 0.3073566257953644, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.8657768368721008, |
| "rewards/correct_extract_func/std": 0.3262862265110016, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4083256721496582, |
| "rewards/format_reward_func/std": 0.02241017296910286, |
| "step": 17 |
| }, |
| { |
| "epoch": 0.006647585633828825, |
| "grad_norm": 0.050341338341445184, |
| "learning_rate": 1e-06, |
| "loss": 0.0182, |
| "step": 18 |
| }, |
| { |
| "epoch": 0.007016895946819315, |
| "grad_norm": 0.0432942729059209, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "reward/mean": 0.43305787444114685, |
| "reward/std": 0.017742186784744263, |
| "rewards/correct_answer_reward_func/mean": 0.8854166865348816, |
| "rewards/correct_answer_reward_func/std": 0.3187260329723358, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.8930990099906921, |
| "rewards/correct_extract_func/std": 0.29494285583496094, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4063993692398071, |
| "rewards/format_reward_func/std": 0.03480615094304085, |
| "step": 19 |
| }, |
| { |
| "epoch": 0.007386206259809805, |
| "grad_norm": 0.043736628776102855, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.0077555165728002955, |
| "grad_norm": 0.05359119220833686, |
| "learning_rate": 1e-06, |
| "loss": 0.0171, |
| "reward/mean": 0.4271976351737976, |
| "reward/std": 0.02278582751750946, |
| "rewards/correct_answer_reward_func/mean": 0.8619791865348816, |
| "rewards/correct_answer_reward_func/std": 0.34514662623405457, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9271919131278992, |
| "rewards/correct_extract_func/std": 0.2506465017795563, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4078807830810547, |
| "rewards/format_reward_func/std": 0.011555412784218788, |
| "step": 21 |
| }, |
| { |
| "epoch": 0.008124826885790786, |
| "grad_norm": 0.053384876112853016, |
| "learning_rate": 1e-06, |
| "loss": 0.0171, |
| "step": 22 |
| }, |
| { |
| "epoch": 0.008494137198781277, |
| "grad_norm": 0.05457105190222447, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "reward/mean": 0.4344092011451721, |
| "reward/std": 0.018806444481015205, |
| "rewards/correct_answer_reward_func/mean": 0.8893229365348816, |
| "rewards/correct_answer_reward_func/std": 0.3139362931251526, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.8963108062744141, |
| "rewards/correct_extract_func/std": 0.29719239473342896, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4072299003601074, |
| "rewards/format_reward_func/std": 0.013067901134490967, |
| "step": 23 |
| }, |
| { |
| "epoch": 0.008863447511771766, |
| "grad_norm": 0.051790764460388904, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "step": 24 |
| }, |
| { |
| "epoch": 0.009232757824762256, |
| "grad_norm": 0.06455557806308003, |
| "learning_rate": 1e-06, |
| "loss": 0.0177, |
| "reward/mean": 0.44281578063964844, |
| "reward/std": 0.016280503943562508, |
| "rewards/correct_answer_reward_func/mean": 0.9153645634651184, |
| "rewards/correct_answer_reward_func/std": 0.27851977944374084, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9203993678092957, |
| "rewards/correct_extract_func/std": 0.25632622838020325, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4059442281723022, |
| "rewards/format_reward_func/std": 0.02205752208828926, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.009602068137752747, |
| "grad_norm": 0.054460571323261056, |
| "learning_rate": 1e-06, |
| "loss": 0.0177, |
| "step": 26 |
| }, |
| { |
| "epoch": 0.009971378450743237, |
| "grad_norm": 0.045474497731843734, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "reward/mean": 0.4445436894893646, |
| "reward/std": 0.014738515019416809, |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9342448115348816, |
| "rewards/correct_extract_func/std": 0.23349910974502563, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4073508977890015, |
| "rewards/format_reward_func/std": 0.014469039626419544, |
| "step": 27 |
| }, |
| { |
| "epoch": 0.010340688763733728, |
| "grad_norm": 0.04518446989067636, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "step": 28 |
| }, |
| { |
| "epoch": 0.010709999076724217, |
| "grad_norm": 0.05758722247738054, |
| "learning_rate": 1e-06, |
| "loss": 0.0183, |
| "reward/mean": 0.44419676065444946, |
| "reward/std": 0.02353046089410782, |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.923828125, |
| "rewards/correct_extract_func/std": 0.25748127698898315, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4070063829421997, |
| "rewards/format_reward_func/std": 0.013799347914755344, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.011079309389714707, |
| "grad_norm": 0.04951275155135316, |
| "learning_rate": 1e-06, |
| "loss": 0.0183, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.011448619702705198, |
| "grad_norm": 0.047494845238651454, |
| "learning_rate": 1e-06, |
| "loss": 0.0173, |
| "reward/mean": 0.44647669792175293, |
| "reward/std": 0.014447808265686035, |
| "rewards/correct_answer_reward_func/mean": 0.92578125, |
| "rewards/correct_answer_reward_func/std": 0.2622973620891571, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9365885257720947, |
| "rewards/correct_extract_func/std": 0.24006153643131256, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4064464569091797, |
| "rewards/format_reward_func/std": 0.02209184132516384, |
| "step": 31 |
| }, |
| { |
| "epoch": 0.011817930015695689, |
| "grad_norm": 0.046822948788372606, |
| "learning_rate": 1e-06, |
| "loss": 0.0173, |
| "step": 32 |
| }, |
| { |
| "epoch": 0.01218724032868618, |
| "grad_norm": 0.04380397393536513, |
| "learning_rate": 1e-06, |
| "loss": 0.0174, |
| "reward/mean": 0.4473347067832947, |
| "reward/std": 0.01410503126680851, |
| "rewards/correct_answer_reward_func/mean": 0.9309895634651184, |
| "rewards/correct_answer_reward_func/std": 0.25363701581954956, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.911241352558136, |
| "rewards/correct_extract_func/std": 0.28002622723579407, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4070053100585938, |
| "rewards/format_reward_func/std": 0.007169181946665049, |
| "step": 33 |
| }, |
| { |
| "epoch": 0.012556550641676668, |
| "grad_norm": 0.04264955092023057, |
| "learning_rate": 1e-06, |
| "loss": 0.0174, |
| "step": 34 |
| }, |
| { |
| "epoch": 0.012925860954667159, |
| "grad_norm": 0.04313995996563885, |
| "learning_rate": 1e-06, |
| "loss": 0.0172, |
| "reward/mean": 0.4411402642726898, |
| "reward/std": 0.00797030795365572, |
| "rewards/correct_answer_reward_func/mean": 0.9088541865348816, |
| "rewards/correct_answer_reward_func/std": 0.28800395131111145, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9279513359069824, |
| "rewards/correct_extract_func/std": 0.25393322110176086, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4064545631408691, |
| "rewards/format_reward_func/std": 0.017624543979763985, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.01329517126765765, |
| "grad_norm": 0.04127535963471486, |
| "learning_rate": 1e-06, |
| "loss": 0.0172, |
| "step": 36 |
| }, |
| { |
| "epoch": 0.01366448158064814, |
| "grad_norm": 0.060380213829128886, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "reward/mean": 0.4418516159057617, |
| "reward/std": 0.016873031854629517, |
| "rewards/correct_answer_reward_func/mean": 0.9075520634651184, |
| "rewards/correct_answer_reward_func/std": 0.2898460030555725, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.961718738079071, |
| "rewards/correct_extract_func/std": 0.18086452782154083, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4073443412780762, |
| "rewards/format_reward_func/std": 0.013318442739546299, |
| "step": 37 |
| }, |
| { |
| "epoch": 0.01403379189363863, |
| "grad_norm": 0.05602980229868504, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "step": 38 |
| }, |
| { |
| "epoch": 0.01440310220662912, |
| "grad_norm": 0.05184766134610364, |
| "learning_rate": 1e-06, |
| "loss": 0.0177, |
| "reward/mean": 0.43869584798812866, |
| "reward/std": 0.019008934497833252, |
| "rewards/correct_answer_reward_func/mean": 0.9036458134651184, |
| "rewards/correct_answer_reward_func/std": 0.2952686548233032, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9003472328186035, |
| "rewards/correct_extract_func/std": 0.2904185354709625, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.405916690826416, |
| "rewards/format_reward_func/std": 0.0255013108253479, |
| "step": 39 |
| }, |
| { |
| "epoch": 0.01477241251961961, |
| "grad_norm": 0.05148310080801445, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.0151417228326101, |
| "grad_norm": 0.05581834264976948, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "reward/mean": 0.4319329261779785, |
| "reward/std": 0.020871102809906006, |
| "rewards/correct_answer_reward_func/mean": 0.8802083134651184, |
| "rewards/correct_answer_reward_func/std": 0.3249293863773346, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.907118022441864, |
| "rewards/correct_extract_func/std": 0.2817213237285614, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.406569004058838, |
| "rewards/format_reward_func/std": 0.02022281102836132, |
| "step": 41 |
| }, |
| { |
| "epoch": 0.015511033145600591, |
| "grad_norm": 0.05268407384500521, |
| "learning_rate": 1e-06, |
| "loss": 0.0175, |
| "step": 42 |
| }, |
| { |
| "epoch": 0.01588034345859108, |
| "grad_norm": 0.09367675201948025, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "reward/mean": 0.4355073869228363, |
| "reward/std": 0.01918705925345421, |
| "rewards/correct_answer_reward_func/mean": 0.8919270634651184, |
| "rewards/correct_answer_reward_func/std": 0.3106748163700104, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9130207896232605, |
| "rewards/correct_extract_func/std": 0.27445390820503235, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.405385971069336, |
| "rewards/format_reward_func/std": 0.026785731315612793, |
| "step": 43 |
| }, |
| { |
| "epoch": 0.016249653771581572, |
| "grad_norm": 0.05227366508331771, |
| "learning_rate": 1e-06, |
| "loss": 0.0176, |
| "step": 44 |
| }, |
| { |
| "epoch": 0.01661896408457206, |
| "grad_norm": 0.04497808527453425, |
| "learning_rate": 1e-06, |
| "loss": 0.0179, |
| "reward/mean": 0.4444352984428406, |
| "reward/std": 0.01481956522911787, |
| "rewards/correct_answer_reward_func/mean": 0.9192708134651184, |
| "rewards/correct_answer_reward_func/std": 0.27259624004364014, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9329426884651184, |
| "rewards/correct_extract_func/std": 0.24013008177280426, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.406657338142395, |
| "rewards/format_reward_func/std": 0.015218171291053295, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.016988274397562553, |
| "grad_norm": 0.055837167119088496, |
| "learning_rate": 1e-06, |
| "loss": 0.0179, |
| "step": 46 |
| }, |
| { |
| "epoch": 0.017357584710553042, |
| "grad_norm": 0.05873478813480533, |
| "learning_rate": 1e-06, |
| "loss": 0.0178, |
| "reward/mean": 0.44062528014183044, |
| "reward/std": 0.01832752674818039, |
| "rewards/correct_answer_reward_func/mean": 0.9088541865348816, |
| "rewards/correct_answer_reward_func/std": 0.28800395131111145, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9090712070465088, |
| "rewards/correct_extract_func/std": 0.27737653255462646, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.4069687128067017, |
| "rewards/format_reward_func/std": 0.02851445972919464, |
| "step": 47 |
| }, |
| { |
| "epoch": 0.01772689502354353, |
| "grad_norm": 0.05518986438955003, |
| "learning_rate": 1e-06, |
| "loss": 0.0178, |
| "step": 48 |
| }, |
| { |
| "epoch": 0.018096205336534023, |
| "grad_norm": 0.04474697485013119, |
| "learning_rate": 1e-06, |
| "loss": 0.0182, |
| "reward/mean": 0.44659337401390076, |
| "reward/std": 0.013036997988820076, |
| "rewards/correct_answer_reward_func/mean": 0.9296875, |
| "rewards/correct_answer_reward_func/std": 0.2558395564556122, |
| "rewards/correct_crop_func/mean": 0.0, |
| "rewards/correct_crop_func/std": 0.0, |
| "rewards/correct_extract_func/mean": 0.9007161259651184, |
| "rewards/correct_extract_func/std": 0.2947409451007843, |
| "rewards/correct_find_color/mean": 0.0, |
| "rewards/correct_find_color/std": 0.0, |
| "rewards/format_reward_func/mean": 1.406656265258789, |
| "rewards/format_reward_func/std": 0.00936658214777708, |
| "step": 49 |
| }, |
| { |
| "epoch": 0.018465515649524512, |
| "grad_norm": 0.043438923657574534, |
| "learning_rate": 1e-06, |
| "loss": 0.0181, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 2708, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|