rlsft / trainer_state.json
eve1f's picture
Upload folder using huggingface_hub
fbd4f2a verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15903307888040713,
"eval_steps": 0,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"combined_loss": 0.7037124633789062,
"completion_length": 425.0,
"epoch": 0.0003180661577608143,
"grad_norm": 2.1160361766815186,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.7037,
"num_samples": 1.0,
"reward": 3.90625,
"reward_std": 1.062600016593933,
"rewards/gpt4o_holistic_reward": 3.90625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3457083702087402,
"speech_entropy": 2.5810890197753906,
"speech_kl": 0.0,
"step": 1,
"text_entropy": 0.44255462288856506,
"text_kl": 0.0,
"total_entropy": 1.9978519678115845
},
{
"combined_loss": 0.7883188724517822,
"completion_length": 347.125,
"epoch": 0.0006361323155216285,
"grad_norm": 2.1822354793548584,
"kl": 0.0,
"learning_rate": 2.3137821315975918e-07,
"loss": 0.7883,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 0.0,
"sft_loss": 2.6277294158935547,
"speech_entropy": 2.6779050827026367,
"speech_kl": 0.0,
"step": 2,
"text_entropy": 0.528403639793396,
"text_kl": 0.0,
"total_entropy": 2.251002311706543
},
{
"combined_loss": 0.7728084921836853,
"completion_length": 490.375,
"epoch": 0.0009541984732824427,
"grad_norm": 2.1721348762512207,
"kl": 0.0,
"learning_rate": 3.6672579134208467e-07,
"loss": 0.7728,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 1.3848260641098022,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": -1.862645149230957e-09,
"sft_loss": 2.576028347015381,
"speech_entropy": 2.6699180603027344,
"speech_kl": 0.0,
"step": 3,
"text_entropy": 0.675686240196228,
"text_kl": 0.0,
"total_entropy": 2.2666218280792236
},
{
"combined_loss": 0.7510870695114136,
"completion_length": 396.3125,
"epoch": 0.001272264631043257,
"grad_norm": 3.0144259929656982,
"kl": 0.0,
"learning_rate": 4.6275642631951835e-07,
"loss": 0.7511,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.5036234855651855,
"speech_entropy": 2.7026796340942383,
"speech_kl": 0.0,
"step": 4,
"text_entropy": 0.6944292783737183,
"text_kl": 0.0,
"total_entropy": 2.308650493621826
},
{
"combined_loss": 0.7991127967834473,
"completion_length": 467.625,
"epoch": 0.0015903307888040711,
"grad_norm": 5.25661039352417,
"kl": 0.0,
"learning_rate": 5.372435736804816e-07,
"loss": 0.7991,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.329224169254303,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.6637091636657715,
"speech_entropy": 2.6493892669677734,
"speech_kl": 0.0,
"step": 5,
"text_entropy": 0.8403033018112183,
"text_kl": 0.0,
"total_entropy": 2.2811856269836426
},
{
"combined_loss": 0.7556890249252319,
"completion_length": 261.8125,
"epoch": 0.0019083969465648854,
"grad_norm": 2.2916808128356934,
"kl": 0.0,
"learning_rate": 5.981040045018438e-07,
"loss": 0.7557,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 0.7500999569892883,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": 0.0,
"sft_loss": 2.51896333694458,
"speech_entropy": 2.7437381744384766,
"speech_kl": 0.0,
"step": 6,
"text_entropy": 0.7866218686103821,
"text_kl": 0.0,
"total_entropy": 2.3357348442077637
},
{
"combined_loss": 0.7672804594039917,
"completion_length": 284.625,
"epoch": 0.0022264631043256997,
"grad_norm": 8.151514053344727,
"kl": 0.0,
"learning_rate": 6.495607655709434e-07,
"loss": 0.7673,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.36094391345977783,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 0.0,
"sft_loss": 2.5576014518737793,
"speech_entropy": 1.9880790710449219,
"speech_kl": 0.0,
"step": 7,
"text_entropy": 0.5499787926673889,
"text_kl": 0.0,
"total_entropy": 1.618296504020691
},
{
"combined_loss": 0.7481317520141602,
"completion_length": 526.5625,
"epoch": 0.002544529262086514,
"grad_norm": 2.451380491256714,
"kl": 0.0,
"learning_rate": 6.941346394792774e-07,
"loss": 0.7481,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.493772506713867,
"speech_entropy": 2.589114189147949,
"speech_kl": 0.0,
"step": 8,
"text_entropy": 0.6615394353866577,
"text_kl": 0.0,
"total_entropy": 2.1917660236358643
},
{
"combined_loss": 0.7818739414215088,
"completion_length": 261.625,
"epoch": 0.0028625954198473282,
"grad_norm": 2.807657480239868,
"kl": 0.0,
"learning_rate": 7.334515826841693e-07,
"loss": 0.7819,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.46360161900520325,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": 0.0,
"sft_loss": 2.6062464714050293,
"speech_entropy": 2.7798025608062744,
"speech_kl": 0.0,
"step": 9,
"text_entropy": 0.7685192823410034,
"text_kl": 0.0,
"total_entropy": 2.320415735244751
},
{
"combined_loss": 0.74410080909729,
"completion_length": 355.375,
"epoch": 0.0031806615776081423,
"grad_norm": 2.3492045402526855,
"kl": 0.0,
"learning_rate": 7.686217868402409e-07,
"loss": 0.7441,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.6921550035476685,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.4803357124328613,
"speech_entropy": 2.6616315841674805,
"speech_kl": 0.0,
"step": 10,
"text_entropy": 0.7898290753364563,
"text_kl": 0.0,
"total_entropy": 2.2620432376861572
},
{
"combined_loss": 0.8018359541893005,
"completion_length": 417.3125,
"epoch": 0.003498727735368957,
"grad_norm": 2.31730580329895,
"kl": 0.0,
"learning_rate": 8.004371064686714e-07,
"loss": 0.8018,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.8644567728042603,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.6727864742279053,
"speech_entropy": 2.7092902660369873,
"speech_kl": 0.0,
"step": 11,
"text_entropy": 0.7865443825721741,
"text_kl": 0.0,
"total_entropy": 2.3307557106018066
},
{
"combined_loss": 0.7841310501098633,
"completion_length": 271.0,
"epoch": 0.003816793893129771,
"grad_norm": 4.778654098510742,
"kl": 0.0,
"learning_rate": 8.29482217661603e-07,
"loss": 0.7841,
"num_samples": 1.0,
"reward": 2.65625,
"reward_std": 1.0673450231552124,
"rewards/gpt4o_holistic_reward": 2.65625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.613770008087158,
"speech_entropy": 2.814521312713623,
"speech_kl": 0.0,
"step": 12,
"text_entropy": 1.110842227935791,
"text_kl": 0.0,
"total_entropy": 2.2540793418884277
},
{
"combined_loss": 0.788222074508667,
"completion_length": 318.9375,
"epoch": 0.004134860050890585,
"grad_norm": 2.6159579753875732,
"kl": 0.0,
"learning_rate": 8.562011298888888e-07,
"loss": 0.7882,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.8315354585647583,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.627406597137451,
"speech_entropy": 2.816967725753784,
"speech_kl": 0.0,
"step": 13,
"text_entropy": 0.8139775395393372,
"text_kl": 0.0,
"total_entropy": 2.4237911701202393
},
{
"combined_loss": 0.7619365453720093,
"completion_length": 598.8125,
"epoch": 0.004452926208651399,
"grad_norm": 1.9425196647644043,
"kl": 0.0,
"learning_rate": 8.809389787307026e-07,
"loss": 0.7619,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.2196787595748901,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 2.60770320892334e-08,
"sft_loss": 2.539788246154785,
"speech_entropy": 2.607710838317871,
"speech_kl": 0.0,
"step": 14,
"text_entropy": 0.6120049357414246,
"text_kl": 0.0,
"total_entropy": 2.184032440185547
},
{
"combined_loss": 0.8240371942520142,
"completion_length": 376.6875,
"epoch": 0.004770992366412214,
"grad_norm": 2.834174871444702,
"kl": 0.0,
"learning_rate": 9.039693650225662e-07,
"loss": 0.824,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 1.0792241096496582,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.7467904090881348,
"speech_entropy": 2.7376527786254883,
"speech_kl": 0.0,
"step": 15,
"text_entropy": 1.3147271871566772,
"text_kl": 0.0,
"total_entropy": 2.462172031402588
},
{
"combined_loss": 0.7633702754974365,
"completion_length": 241.8125,
"epoch": 0.005089058524173028,
"grad_norm": 3.2188761234283447,
"kl": 0.0,
"learning_rate": 9.255128526390367e-07,
"loss": 0.7634,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.9856985807418823,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 0.0,
"sft_loss": 2.544567584991455,
"speech_entropy": 2.6439318656921387,
"speech_kl": 0.0,
"step": 16,
"text_entropy": 0.9647745490074158,
"text_kl": 0.0,
"total_entropy": 2.306708812713623
},
{
"combined_loss": 0.8270055651664734,
"completion_length": 534.5,
"epoch": 0.005407124681933842,
"grad_norm": 2.851463556289673,
"kl": 0.0,
"learning_rate": 9.45749848565416e-07,
"loss": 0.827,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 1.0000998973846436,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 0.0,
"sft_loss": 2.756685256958008,
"speech_entropy": 2.7267394065856934,
"speech_kl": 0.0,
"step": 17,
"text_entropy": 0.9119875431060791,
"text_kl": 0.0,
"total_entropy": 2.3539938926696777
},
{
"combined_loss": 0.8502093553543091,
"completion_length": 354.75,
"epoch": 0.0057251908396946565,
"grad_norm": 2.3651652336120605,
"kl": 0.0,
"learning_rate": 9.648297958439284e-07,
"loss": 0.8502,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.834031105041504,
"speech_entropy": 2.6965622901916504,
"speech_kl": 0.0,
"step": 18,
"text_entropy": 1.0353615283966064,
"text_kl": 0.0,
"total_entropy": 2.379913806915283
},
{
"combined_loss": 0.8145760297775269,
"completion_length": 331.375,
"epoch": 0.006043256997455471,
"grad_norm": 2.548919916152954,
"kl": 0.0,
"learning_rate": 9.828778776927557e-07,
"loss": 0.8146,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.6985008716583252,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.7152533531188965,
"speech_entropy": 2.7572903633117676,
"speech_kl": 0.0,
"step": 19,
"text_entropy": 0.8444140553474426,
"text_kl": 0.0,
"total_entropy": 2.3438541889190674
},
{
"combined_loss": 0.7225195169448853,
"completion_length": 397.3125,
"epoch": 0.006361323155216285,
"grad_norm": 2.0722339153289795,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.7225,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.8872368931770325,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.408398151397705,
"speech_entropy": 2.698948383331299,
"speech_kl": 0.0,
"step": 20,
"text_entropy": 0.8427600860595703,
"text_kl": 0.0,
"total_entropy": 2.3258702754974365
},
{
"combined_loss": 0.7821958661079407,
"completion_length": 306.125,
"epoch": 0.006679389312977099,
"grad_norm": 2.6868255138397217,
"kl": 0.0,
"learning_rate": 9.99999433562768e-07,
"loss": 0.7822,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 1.103813648223877,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.6073193550109863,
"speech_entropy": 2.7519540786743164,
"speech_kl": 0.0,
"step": 21,
"text_entropy": 0.9722496271133423,
"text_kl": 0.0,
"total_entropy": 2.407655715942383
},
{
"combined_loss": 0.6645753979682922,
"completion_length": 460.1875,
"epoch": 0.006997455470737914,
"grad_norm": 2.0238919258117676,
"kl": 0.0,
"learning_rate": 9.99997734252498e-07,
"loss": 0.6646,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 1.3854628801345825,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2152514457702637,
"speech_entropy": 2.5998382568359375,
"speech_kl": 0.0,
"step": 22,
"text_entropy": 0.31880295276641846,
"text_kl": 0.0,
"total_entropy": 2.1282958984375
},
{
"combined_loss": 0.7440503239631653,
"completion_length": 156.5,
"epoch": 0.007315521628498728,
"grad_norm": 2.6382734775543213,
"kl": 0.0,
"learning_rate": 9.999949020734677e-07,
"loss": 0.7441,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.9331126809120178,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.480167865753174,
"speech_entropy": 2.679046630859375,
"speech_kl": 0.0,
"step": 23,
"text_entropy": 0.8172353506088257,
"text_kl": 0.0,
"total_entropy": 2.290531873703003
},
{
"combined_loss": 0.8006659746170044,
"completion_length": 273.0,
"epoch": 0.007633587786259542,
"grad_norm": 2.4343767166137695,
"kl": 0.0,
"learning_rate": 9.999909370328077e-07,
"loss": 0.8007,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 1.058112621307373,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.668886423110962,
"speech_entropy": 2.74008846282959,
"speech_kl": 0.0,
"step": 24,
"text_entropy": 0.805307149887085,
"text_kl": 0.0,
"total_entropy": 2.3586955070495605
},
{
"combined_loss": 0.74156254529953,
"completion_length": 318.4375,
"epoch": 0.007951653944020356,
"grad_norm": 2.2340893745422363,
"kl": 0.0,
"learning_rate": 9.999858391404998e-07,
"loss": 0.7416,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.9063550233840942,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.4718751907348633,
"speech_entropy": 2.68562388420105,
"speech_kl": 0.0,
"step": 25,
"text_entropy": 0.8011962175369263,
"text_kl": 0.0,
"total_entropy": 2.301424503326416
},
{
"combined_loss": 0.7775212526321411,
"completion_length": 471.375,
"epoch": 0.00826972010178117,
"grad_norm": 1.9409180879592896,
"kl": 0.0,
"learning_rate": 9.999796084093777e-07,
"loss": 0.7775,
"num_samples": 1.0,
"reward": 3.90625,
"reward_std": 0.4376000165939331,
"rewards/gpt4o_holistic_reward": 3.90625,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.5917372703552246,
"speech_entropy": 2.6521334648132324,
"speech_kl": 0.0,
"step": 26,
"text_entropy": 0.755419909954071,
"text_kl": 0.0,
"total_entropy": 2.224979877471924
},
{
"combined_loss": 0.8118987083435059,
"completion_length": 381.25,
"epoch": 0.008587786259541985,
"grad_norm": 2.630601644515991,
"kl": 0.0,
"learning_rate": 9.999722448551275e-07,
"loss": 0.8119,
"num_samples": 1.0,
"reward": 3.03125,
"reward_std": 1.176088809967041,
"rewards/gpt4o_holistic_reward": 3.03125,
"rl_loss": 0.0,
"sft_loss": 2.706328868865967,
"speech_entropy": 2.7182955741882324,
"speech_kl": 0.0,
"step": 27,
"text_entropy": 0.9594783186912537,
"text_kl": 0.0,
"total_entropy": 2.36698055267334
},
{
"combined_loss": 0.7367856502532959,
"completion_length": 396.8125,
"epoch": 0.008905852417302799,
"grad_norm": 2.1226205825805664,
"kl": 0.0,
"learning_rate": 9.999637484962867e-07,
"loss": 0.7368,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.4559521675109863,
"speech_entropy": 2.6961069107055664,
"speech_kl": 0.0,
"step": 28,
"text_entropy": 0.5669960975646973,
"text_kl": 0.0,
"total_entropy": 2.260632038116455
},
{
"combined_loss": 0.7663246393203735,
"completion_length": 175.125,
"epoch": 0.009223918575063612,
"grad_norm": 1.594689965248108,
"kl": 0.0,
"learning_rate": 9.99954119354245e-07,
"loss": 0.7663,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 0.0,
"sft_loss": 2.554415225982666,
"speech_entropy": 2.819967746734619,
"speech_kl": 0.0,
"step": 29,
"text_entropy": 0.7421623468399048,
"text_kl": 0.0,
"total_entropy": 2.4235970973968506
},
{
"combined_loss": 0.746996283531189,
"completion_length": 368.8125,
"epoch": 0.009541984732824428,
"grad_norm": 3.2165513038635254,
"kl": 0.0,
"learning_rate": 9.999433574532437e-07,
"loss": 0.747,
"num_samples": 1.0,
"reward": 3.21875,
"reward_std": 1.240410566329956,
"rewards/gpt4o_holistic_reward": 3.21875,
"rl_loss": -2.60770320892334e-08,
"sft_loss": 2.48998761177063,
"speech_entropy": 2.657895088195801,
"speech_kl": 0.0,
"step": 30,
"text_entropy": 0.9665651321411133,
"text_kl": 0.0,
"total_entropy": 2.3054580688476562
},
{
"combined_loss": 0.7402773499488831,
"completion_length": 272.4375,
"epoch": 0.009860050890585241,
"grad_norm": 1.8161159753799438,
"kl": 0.0,
"learning_rate": 9.99931462820376e-07,
"loss": 0.7403,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.4675910472869873,
"speech_entropy": 2.647010326385498,
"speech_kl": 0.0,
"step": 31,
"text_entropy": 1.1171433925628662,
"text_kl": 0.0,
"total_entropy": 2.3485312461853027
},
{
"combined_loss": 0.7246508598327637,
"completion_length": 248.5,
"epoch": 0.010178117048346057,
"grad_norm": 2.0797693729400635,
"kl": 0.0,
"learning_rate": 9.999184354855866e-07,
"loss": 0.7247,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4155025482177734,
"speech_entropy": 2.6706910133361816,
"speech_kl": 0.0,
"step": 32,
"text_entropy": 0.736768901348114,
"text_kl": 0.0,
"total_entropy": 2.322854995727539
},
{
"combined_loss": 0.6848204731941223,
"completion_length": 422.125,
"epoch": 0.01049618320610687,
"grad_norm": 2.8221709728240967,
"kl": 0.0,
"learning_rate": 9.999042754816715e-07,
"loss": 0.6848,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.172311544418335,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2827348709106445,
"speech_entropy": 2.7249035835266113,
"speech_kl": 0.0,
"step": 33,
"text_entropy": 1.2600326538085938,
"text_kl": 0.0,
"total_entropy": 2.3216300010681152
},
{
"combined_loss": 0.8137314319610596,
"completion_length": 450.6875,
"epoch": 0.010814249363867684,
"grad_norm": 1.7230591773986816,
"kl": 0.0,
"learning_rate": 9.99888982844279e-07,
"loss": 0.8137,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.42705631256103516,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 0.0,
"sft_loss": 2.7124381065368652,
"speech_entropy": 2.5579869747161865,
"speech_kl": 0.0,
"step": 34,
"text_entropy": 1.105285406112671,
"text_kl": 0.0,
"total_entropy": 2.2585811614990234
},
{
"combined_loss": 0.6867671012878418,
"completion_length": 296.0625,
"epoch": 0.0111323155216285,
"grad_norm": 2.008439064025879,
"kl": 0.0,
"learning_rate": 9.99872557611908e-07,
"loss": 0.6868,
"num_samples": 1.0,
"reward": 2.125,
"reward_std": 0.8536534309387207,
"rewards/gpt4o_holistic_reward": 2.125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.2892236709594727,
"speech_entropy": 2.624180316925049,
"speech_kl": 0.0,
"step": 35,
"text_entropy": 0.7666749358177185,
"text_kl": 0.0,
"total_entropy": 2.235846996307373
},
{
"combined_loss": 0.7378132343292236,
"completion_length": 518.0625,
"epoch": 0.011450381679389313,
"grad_norm": 1.8618037700653076,
"kl": 0.0,
"learning_rate": 9.99854999825909e-07,
"loss": 0.7378,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 0.8944376111030579,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": -1.30385160446167e-08,
"sft_loss": 2.4593772888183594,
"speech_entropy": 2.565446615219116,
"speech_kl": 0.0,
"step": 36,
"text_entropy": 0.864229679107666,
"text_kl": 0.0,
"total_entropy": 2.1934401988983154
},
{
"combined_loss": 0.7986043691635132,
"completion_length": 484.375,
"epoch": 0.011768447837150127,
"grad_norm": 1.954567551612854,
"kl": 0.0,
"learning_rate": 9.998363095304839e-07,
"loss": 0.7986,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -9.313225746154785e-09,
"sft_loss": 2.6620142459869385,
"speech_entropy": 2.6134657859802246,
"speech_kl": 0.0,
"step": 37,
"text_entropy": 0.908026933670044,
"text_kl": 0.0,
"total_entropy": 2.264120578765869
},
{
"combined_loss": 0.7768511176109314,
"completion_length": 327.375,
"epoch": 0.012086513994910942,
"grad_norm": 2.6139323711395264,
"kl": 0.0,
"learning_rate": 9.99816486772685e-07,
"loss": 0.7769,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.9717878103256226,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.589503765106201,
"speech_entropy": 2.7159786224365234,
"speech_kl": 0.0,
"step": 38,
"text_entropy": 1.0709924697875977,
"text_kl": 0.0,
"total_entropy": 2.3924551010131836
},
{
"combined_loss": 0.690489649772644,
"completion_length": 294.0,
"epoch": 0.012404580152671756,
"grad_norm": 2.4900975227355957,
"kl": 0.0,
"learning_rate": 9.997955316024167e-07,
"loss": 0.6905,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5774502754211426,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3016321659088135,
"speech_entropy": 2.509531021118164,
"speech_kl": 0.0,
"step": 39,
"text_entropy": 0.8610185980796814,
"text_kl": 0.0,
"total_entropy": 2.159456729888916
},
{
"combined_loss": 0.8176020383834839,
"completion_length": 299.9375,
"epoch": 0.01272264631043257,
"grad_norm": 2.29056453704834,
"kl": 0.0,
"learning_rate": 9.997734440724333e-07,
"loss": 0.8176,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.7253401279449463,
"speech_entropy": 2.6276955604553223,
"speech_kl": 0.0,
"step": 40,
"text_entropy": 0.7860556840896606,
"text_kl": 0.0,
"total_entropy": 2.2730958461761475
},
{
"combined_loss": 0.6987892985343933,
"completion_length": 346.4375,
"epoch": 0.013040712468193385,
"grad_norm": 1.9701205492019653,
"kl": 0.0,
"learning_rate": 9.9975022423834e-07,
"loss": 0.6988,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 2.3292975425720215,
"speech_entropy": 2.602095365524292,
"speech_kl": 0.0,
"step": 41,
"text_entropy": 0.8417441844940186,
"text_kl": 0.0,
"total_entropy": 2.2476534843444824
},
{
"combined_loss": 0.7409491539001465,
"completion_length": 319.1875,
"epoch": 0.013358778625954198,
"grad_norm": 2.334261894226074,
"kl": 0.0,
"learning_rate": 9.997258721585931e-07,
"loss": 0.7409,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.469830274581909,
"speech_entropy": 2.437565326690674,
"speech_kl": 0.0,
"step": 42,
"text_entropy": 0.7768386602401733,
"text_kl": 0.0,
"total_entropy": 2.0847883224487305
},
{
"combined_loss": 0.6795743107795715,
"completion_length": 245.375,
"epoch": 0.013676844783715014,
"grad_norm": 2.3061537742614746,
"kl": 0.0,
"learning_rate": 9.997003878944985e-07,
"loss": 0.6796,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.2652478218078613,
"speech_entropy": 2.602904796600342,
"speech_kl": 0.0,
"step": 43,
"text_entropy": 0.6788486242294312,
"text_kl": 0.0,
"total_entropy": 2.2193617820739746
},
{
"combined_loss": 0.7629357576370239,
"completion_length": 426.9375,
"epoch": 0.013994910941475827,
"grad_norm": 1.8447140455245972,
"kl": 0.0,
"learning_rate": 9.996737715102132e-07,
"loss": 0.7629,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.6978486180305481,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 1.6763806343078613e-08,
"sft_loss": 2.543118953704834,
"speech_entropy": 2.4126617908477783,
"speech_kl": 0.0,
"step": 44,
"text_entropy": 0.933107852935791,
"text_kl": 0.0,
"total_entropy": 2.1085753440856934
},
{
"combined_loss": 0.7338756322860718,
"completion_length": 539.75,
"epoch": 0.01431297709923664,
"grad_norm": 6.8181939125061035,
"kl": 0.0,
"learning_rate": 9.996460230727435e-07,
"loss": 0.7339,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.5728486180305481,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 0.0,
"sft_loss": 2.44625186920166,
"speech_entropy": 2.5158934593200684,
"speech_kl": 0.0,
"step": 45,
"text_entropy": 0.8566389083862305,
"text_kl": 0.0,
"total_entropy": 2.162264585494995
},
{
"combined_loss": 0.8030707240104675,
"completion_length": 346.375,
"epoch": 0.014631043256997456,
"grad_norm": 2.483579397201538,
"kl": 0.0,
"learning_rate": 9.996171426519463e-07,
"loss": 0.8031,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": -2.60770320892334e-08,
"sft_loss": 2.6769022941589355,
"speech_entropy": 2.4880712032318115,
"speech_kl": 0.0,
"step": 46,
"text_entropy": 0.7922484874725342,
"text_kl": 0.0,
"total_entropy": 2.1411781311035156
},
{
"combined_loss": 0.721707820892334,
"completion_length": 403.5625,
"epoch": 0.01494910941475827,
"grad_norm": 1.8520557880401611,
"kl": 0.0,
"learning_rate": 9.995871303205279e-07,
"loss": 0.7217,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4056925773620605,
"speech_entropy": 2.421140193939209,
"speech_kl": 0.0,
"step": 47,
"text_entropy": 0.7047562003135681,
"text_kl": 0.0,
"total_entropy": 2.0640478134155273
},
{
"combined_loss": 0.7390530109405518,
"completion_length": 239.25,
"epoch": 0.015267175572519083,
"grad_norm": 2.3096864223480225,
"kl": 0.0,
"learning_rate": 9.995559861540447e-07,
"loss": 0.7391,
"num_samples": 1.0,
"reward": 2.5625,
"reward_std": 0.5194376111030579,
"rewards/gpt4o_holistic_reward": 2.5625,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.463510036468506,
"speech_entropy": 2.620394229888916,
"speech_kl": 0.0,
"step": 48,
"text_entropy": 1.4838829040527344,
"text_kl": 0.0,
"total_entropy": 2.386155605316162
},
{
"combined_loss": 0.8287366628646851,
"completion_length": 313.125,
"epoch": 0.015585241730279899,
"grad_norm": 2.5821146965026855,
"kl": 0.0,
"learning_rate": 9.995237102309018e-07,
"loss": 0.8287,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 1.2807698249816895,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.762455463409424,
"speech_entropy": 2.669032096862793,
"speech_kl": 0.0,
"step": 49,
"text_entropy": 1.0619488954544067,
"text_kl": 0.0,
"total_entropy": 2.372556447982788
},
{
"combined_loss": 0.6661741733551025,
"completion_length": 309.875,
"epoch": 0.015903307888040712,
"grad_norm": 1.8256869316101074,
"kl": 0.0,
"learning_rate": 9.994903026323536e-07,
"loss": 0.6662,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.220580577850342,
"speech_entropy": 2.4839134216308594,
"speech_kl": 0.0,
"step": 50,
"text_entropy": 0.8060861825942993,
"text_kl": 0.0,
"total_entropy": 2.165754556655884
},
{
"combined_loss": 0.7401760220527649,
"completion_length": 260.75,
"epoch": 0.016221374045801526,
"grad_norm": 2.784619092941284,
"kl": 0.0,
"learning_rate": 9.994557634425038e-07,
"loss": 0.7402,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 1.1831127405166626,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.4672532081604004,
"speech_entropy": 1.8801207542419434,
"speech_kl": 0.0,
"step": 51,
"text_entropy": 1.1698064804077148,
"text_kl": 0.0,
"total_entropy": 1.9170701503753662
},
{
"combined_loss": 0.7516125440597534,
"completion_length": 388.875,
"epoch": 0.01653944020356234,
"grad_norm": 1.9916623830795288,
"kl": 0.0,
"learning_rate": 9.994200927483053e-07,
"loss": 0.7516,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 0.9002986550331116,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.5053749084472656,
"speech_entropy": 2.393984794616699,
"speech_kl": 0.0,
"step": 52,
"text_entropy": 1.078744649887085,
"text_kl": 0.0,
"total_entropy": 2.1226654052734375
},
{
"combined_loss": 0.6983101963996887,
"completion_length": 421.9375,
"epoch": 0.016857506361323157,
"grad_norm": 2.4226467609405518,
"kl": 0.0,
"learning_rate": 9.993832906395582e-07,
"loss": 0.6983,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 0.0,
"sft_loss": 2.327700614929199,
"speech_entropy": 2.536123752593994,
"speech_kl": 0.0,
"step": 53,
"text_entropy": 1.049363136291504,
"text_kl": 0.0,
"total_entropy": 2.2463369369506836
},
{
"combined_loss": 0.738640308380127,
"completion_length": 277.5625,
"epoch": 0.01717557251908397,
"grad_norm": 2.4906787872314453,
"kl": 0.0,
"learning_rate": 9.993453572089124e-07,
"loss": 0.7386,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.7501000165939331,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 2.9802322387695312e-08,
"sft_loss": 2.46213436126709,
"speech_entropy": 2.3886947631835938,
"speech_kl": 0.0,
"step": 54,
"text_entropy": 1.1047334671020508,
"text_kl": 0.0,
"total_entropy": 2.162165641784668
},
{
"combined_loss": 0.6425143480300903,
"completion_length": 306.8125,
"epoch": 0.017493638676844784,
"grad_norm": 1.8969634771347046,
"kl": 0.0,
"learning_rate": 9.99306292551865e-07,
"loss": 0.6425,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 0.0,
"sft_loss": 2.141714572906494,
"speech_entropy": 2.4277968406677246,
"speech_kl": 0.0,
"step": 55,
"text_entropy": 0.7971184253692627,
"text_kl": 0.0,
"total_entropy": 2.088435173034668
},
{
"combined_loss": 0.5967005491256714,
"completion_length": 362.5,
"epoch": 0.017811704834605598,
"grad_norm": 1.6407321691513062,
"kl": 0.0,
"learning_rate": 9.99266096766761e-07,
"loss": 0.5967,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.7501000165939331,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 1.989001750946045,
"speech_entropy": 2.3562631607055664,
"speech_kl": 0.0,
"step": 56,
"text_entropy": 0.650113582611084,
"text_kl": 0.0,
"total_entropy": 1.9950945377349854
},
{
"combined_loss": 0.6596853137016296,
"completion_length": 356.0,
"epoch": 0.01812977099236641,
"grad_norm": 3.1980700492858887,
"kl": 0.0,
"learning_rate": 9.992247699547936e-07,
"loss": 0.6597,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 1.536826252937317,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.198951244354248,
"speech_entropy": 2.4273312091827393,
"speech_kl": 0.0,
"step": 57,
"text_entropy": 0.5911531448364258,
"text_kl": 0.0,
"total_entropy": 2.0504350662231445
},
{
"combined_loss": 0.7111167907714844,
"completion_length": 328.8125,
"epoch": 0.018447837150127225,
"grad_norm": 2.5813424587249756,
"kl": 0.0,
"learning_rate": 9.99182312220003e-07,
"loss": 0.7111,
"num_samples": 1.0,
"reward": 2.5625,
"reward_std": 1.2798004150390625,
"rewards/gpt4o_holistic_reward": 2.5625,
"rl_loss": 0.0,
"sft_loss": 2.370388984680176,
"speech_entropy": 2.387758731842041,
"speech_kl": 0.0,
"step": 58,
"text_entropy": 1.290541410446167,
"text_kl": 0.0,
"total_entropy": 2.1883113384246826
},
{
"combined_loss": 0.7116686105728149,
"completion_length": 483.125,
"epoch": 0.018765903307888042,
"grad_norm": 1.8333094120025635,
"kl": 0.0,
"learning_rate": 9.991387236692764e-07,
"loss": 0.7117,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.6038135886192322,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3722286224365234,
"speech_entropy": 2.387399196624756,
"speech_kl": 0.0,
"step": 59,
"text_entropy": 0.8359103798866272,
"text_kl": 0.0,
"total_entropy": 2.049403429031372
},
{
"combined_loss": 0.7444514036178589,
"completion_length": 254.6875,
"epoch": 0.019083969465648856,
"grad_norm": 2.073017120361328,
"kl": 0.0,
"learning_rate": 9.990940044123479e-07,
"loss": 0.7445,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.4815046787261963,
"speech_entropy": 2.459671974182129,
"speech_kl": 0.0,
"step": 60,
"text_entropy": 0.8760254383087158,
"text_kl": 0.0,
"total_entropy": 2.171876907348633
},
{
"combined_loss": 0.6521174907684326,
"completion_length": 459.6875,
"epoch": 0.01940203562340967,
"grad_norm": 2.9567947387695312,
"kl": 0.0,
"learning_rate": 9.990481545617983e-07,
"loss": 0.6521,
"num_samples": 1.0,
"reward": 2.90625,
"reward_std": 0.6609638333320618,
"rewards/gpt4o_holistic_reward": 2.90625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.173725128173828,
"speech_entropy": 1.9960131645202637,
"speech_kl": 0.0,
"step": 61,
"text_entropy": 0.9018564820289612,
"text_kl": 0.0,
"total_entropy": 1.8020646572113037
},
{
"combined_loss": 0.7371933460235596,
"completion_length": 338.1875,
"epoch": 0.019720101781170483,
"grad_norm": 1.8295475244522095,
"kl": 0.0,
"learning_rate": 9.990011742330542e-07,
"loss": 0.7372,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.8081126809120178,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 0.0,
"sft_loss": 2.4573111534118652,
"speech_entropy": 2.409938097000122,
"speech_kl": 0.0,
"step": 62,
"text_entropy": 1.2029674053192139,
"text_kl": 0.0,
"total_entropy": 2.186131238937378
},
{
"combined_loss": 0.748369574546814,
"completion_length": 248.625,
"epoch": 0.020038167938931296,
"grad_norm": 2.8785228729248047,
"kl": 0.0,
"learning_rate": 9.98953063544389e-07,
"loss": 0.7484,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 1.7174440622329712,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.49456524848938,
"speech_entropy": 2.4662866592407227,
"speech_kl": 0.0,
"step": 63,
"text_entropy": 1.4685275554656982,
"text_kl": 0.0,
"total_entropy": 2.278367280960083
},
{
"combined_loss": 0.668720006942749,
"completion_length": 369.25,
"epoch": 0.020356234096692113,
"grad_norm": 2.220876693725586,
"kl": 0.0,
"learning_rate": 9.989038226169207e-07,
"loss": 0.6687,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 1.019437551498413,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.2290663719177246,
"speech_entropy": 2.3302509784698486,
"speech_kl": 0.0,
"step": 64,
"text_entropy": 0.9201998710632324,
"text_kl": 0.0,
"total_entropy": 2.049210786819458
},
{
"combined_loss": 0.7535024881362915,
"completion_length": 459.625,
"epoch": 0.020674300254452927,
"grad_norm": 3.4402458667755127,
"kl": 0.0,
"learning_rate": 9.98853451574614e-07,
"loss": 0.7535,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.161826252937317,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.5116748809814453,
"speech_entropy": 2.1198697090148926,
"speech_kl": 0.0,
"step": 65,
"text_entropy": 0.7827379703521729,
"text_kl": 0.0,
"total_entropy": 1.855495572090149
},
{
"combined_loss": 0.6956632733345032,
"completion_length": 267.0,
"epoch": 0.02099236641221374,
"grad_norm": 2.199934244155884,
"kl": 0.0,
"learning_rate": 9.988019505442775e-07,
"loss": 0.6957,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.42705631256103516,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": -9.313225746154785e-09,
"sft_loss": 2.3188774585723877,
"speech_entropy": 2.392827033996582,
"speech_kl": 0.0,
"step": 66,
"text_entropy": 1.4710502624511719,
"text_kl": 0.0,
"total_entropy": 2.220284938812256
},
{
"combined_loss": 0.6765873432159424,
"completion_length": 389.0,
"epoch": 0.021310432569974554,
"grad_norm": 2.0207324028015137,
"kl": 0.0,
"learning_rate": 9.987493196555649e-07,
"loss": 0.6766,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.9565354585647583,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.255290985107422,
"speech_entropy": 2.4497034549713135,
"speech_kl": 0.0,
"step": 67,
"text_entropy": 0.9174681901931763,
"text_kl": 0.0,
"total_entropy": 2.17930006980896
},
{
"combined_loss": 0.7614186406135559,
"completion_length": 355.125,
"epoch": 0.021628498727735368,
"grad_norm": 2.724764585494995,
"kl": 0.0,
"learning_rate": 9.986955590409747e-07,
"loss": 0.7614,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.9788135290145874,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.53806209564209,
"speech_entropy": 2.3254313468933105,
"speech_kl": 0.0,
"step": 68,
"text_entropy": 0.8083021640777588,
"text_kl": 0.0,
"total_entropy": 2.063969612121582
},
{
"combined_loss": 0.6472816467285156,
"completion_length": 407.5,
"epoch": 0.02194656488549618,
"grad_norm": 1.8889881372451782,
"kl": 0.0,
"learning_rate": 9.986406688358491e-07,
"loss": 0.6473,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.2387304306030273,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1576054096221924,
"speech_entropy": 2.329939842224121,
"speech_kl": 0.0,
"step": 69,
"text_entropy": 0.53284752368927,
"text_kl": 0.0,
"total_entropy": 1.9823434352874756
},
{
"combined_loss": 0.8229079842567444,
"completion_length": 370.9375,
"epoch": 0.022264631043257,
"grad_norm": 2.0763661861419678,
"kl": 0.0,
"learning_rate": 9.98584649178374e-07,
"loss": 0.8229,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.8538135886192322,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.7430264949798584,
"speech_entropy": 2.232337474822998,
"speech_kl": 0.0,
"step": 70,
"text_entropy": 1.1738722324371338,
"text_kl": 0.0,
"total_entropy": 2.027456283569336
},
{
"combined_loss": 0.7078136205673218,
"completion_length": 402.1875,
"epoch": 0.022582697201017812,
"grad_norm": 2.2039167881011963,
"kl": 0.0,
"learning_rate": 9.985275002095789e-07,
"loss": 0.7078,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.183112621307373,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 2.3593788146972656,
"speech_entropy": 2.290937900543213,
"speech_kl": 0.0,
"step": 71,
"text_entropy": 1.0952609777450562,
"text_kl": 0.0,
"total_entropy": 2.054002285003662
},
{
"combined_loss": 0.7774462699890137,
"completion_length": 370.0,
"epoch": 0.022900763358778626,
"grad_norm": 2.0490500926971436,
"kl": 0.0,
"learning_rate": 9.984692220733363e-07,
"loss": 0.7774,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.7126991748809814,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.591487407684326,
"speech_entropy": 2.2809252738952637,
"speech_kl": 0.0,
"step": 72,
"text_entropy": 1.4932012557983398,
"text_kl": 0.0,
"total_entropy": 2.1317849159240723
},
{
"combined_loss": 0.7310476899147034,
"completion_length": 416.125,
"epoch": 0.02321882951653944,
"grad_norm": 1.7452352046966553,
"kl": 0.0,
"learning_rate": 9.984098149163612e-07,
"loss": 0.731,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.5520563125610352,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 0.0,
"sft_loss": 2.4368255138397217,
"speech_entropy": 2.2939229011535645,
"speech_kl": 0.0,
"step": 73,
"text_entropy": 1.1391348838806152,
"text_kl": 0.0,
"total_entropy": 2.0588903427124023
},
{
"combined_loss": 0.6863812208175659,
"completion_length": 426.4375,
"epoch": 0.023536895674300253,
"grad_norm": 1.8024567365646362,
"kl": 0.0,
"learning_rate": 9.98349278888211e-07,
"loss": 0.6864,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": 0.0,
"sft_loss": 2.2879374027252197,
"speech_entropy": 1.6559635400772095,
"speech_kl": 0.0,
"step": 74,
"text_entropy": 0.796265184879303,
"text_kl": 0.0,
"total_entropy": 1.6575778722763062
},
{
"combined_loss": 0.6243323683738708,
"completion_length": 371.375,
"epoch": 0.02385496183206107,
"grad_norm": 1.5667005777359009,
"kl": 0.0,
"learning_rate": 9.982876141412855e-07,
"loss": 0.6243,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0811080932617188,
"speech_entropy": 2.248847723007202,
"speech_kl": 0.0,
"step": 75,
"text_entropy": 0.7592964172363281,
"text_kl": 0.0,
"total_entropy": 1.9378535747528076
},
{
"combined_loss": 0.745708703994751,
"completion_length": 340.0,
"epoch": 0.024173027989821884,
"grad_norm": 2.7676212787628174,
"kl": 0.0,
"learning_rate": 9.982248208308253e-07,
"loss": 0.7457,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 1.1250998973846436,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 1.30385160446167e-08,
"sft_loss": 2.4856958389282227,
"speech_entropy": 2.3308167457580566,
"speech_kl": 0.0,
"step": 76,
"text_entropy": 1.140113353729248,
"text_kl": 0.0,
"total_entropy": 2.1104745864868164
},
{
"combined_loss": 0.6474106907844543,
"completion_length": 482.3125,
"epoch": 0.024491094147582698,
"grad_norm": 2.045401096343994,
"kl": 0.0,
"learning_rate": 9.981608991149123e-07,
"loss": 0.6474,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 1.375100016593933,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.1580355167388916,
"speech_entropy": 2.319225311279297,
"speech_kl": 0.0,
"step": 77,
"text_entropy": 0.8749170899391174,
"text_kl": 0.0,
"total_entropy": 2.047513723373413
},
{
"combined_loss": 0.7499513626098633,
"completion_length": 317.375,
"epoch": 0.02480916030534351,
"grad_norm": 2.194958209991455,
"kl": 0.0,
"learning_rate": 9.980958491544697e-07,
"loss": 0.75,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": -5.587935447692871e-09,
"sft_loss": 2.499837875366211,
"speech_entropy": 2.3051891326904297,
"speech_kl": 0.0,
"step": 78,
"text_entropy": 1.2871158123016357,
"text_kl": 0.0,
"total_entropy": 2.1196823120117188
},
{
"combined_loss": 0.686776876449585,
"completion_length": 423.0,
"epoch": 0.025127226463104325,
"grad_norm": 2.2229573726654053,
"kl": 0.0,
"learning_rate": 9.980296711132606e-07,
"loss": 0.6868,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 1.2525264024734497,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.2892560958862305,
"speech_entropy": 2.3141417503356934,
"speech_kl": 0.0,
"step": 79,
"text_entropy": 1.1506476402282715,
"text_kl": 0.0,
"total_entropy": 2.09428071975708
},
{
"combined_loss": 0.7100934386253357,
"completion_length": 345.375,
"epoch": 0.02544529262086514,
"grad_norm": 2.2556166648864746,
"kl": 0.0,
"learning_rate": 9.97962365157888e-07,
"loss": 0.7101,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 1.7286533117294312,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.366978168487549,
"speech_entropy": 2.3218116760253906,
"speech_kl": 0.0,
"step": 80,
"text_entropy": 1.308366060256958,
"text_kl": 0.0,
"total_entropy": 2.1330385208129883
},
{
"combined_loss": 0.7132716774940491,
"completion_length": 592.8125,
"epoch": 0.025763358778625955,
"grad_norm": 2.3514413833618164,
"kl": 0.0,
"learning_rate": 9.97893931457795e-07,
"loss": 0.7133,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 1.0983424186706543,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.3775720596313477,
"speech_entropy": 2.166839838027954,
"speech_kl": 0.0,
"step": 81,
"text_entropy": 0.6873091459274292,
"text_kl": 0.0,
"total_entropy": 1.8318581581115723
},
{
"combined_loss": 0.7247699499130249,
"completion_length": 368.875,
"epoch": 0.02608142493638677,
"grad_norm": 2.5765998363494873,
"kl": 0.0,
"learning_rate": 9.978243701852625e-07,
"loss": 0.7248,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.0000998973846436,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4158997535705566,
"speech_entropy": 2.3246705532073975,
"speech_kl": 0.0,
"step": 82,
"text_entropy": 1.0593510866165161,
"text_kl": 0.0,
"total_entropy": 1.8370463848114014
},
{
"combined_loss": 0.7713165879249573,
"completion_length": 510.5,
"epoch": 0.026399491094147583,
"grad_norm": 2.231696844100952,
"kl": 0.0,
"learning_rate": 9.977536815154117e-07,
"loss": 0.7713,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.7180101871490479,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.5710554122924805,
"speech_entropy": 2.2227771282196045,
"speech_kl": 0.0,
"step": 83,
"text_entropy": 0.9712114334106445,
"text_kl": 0.0,
"total_entropy": 1.9926376342773438
},
{
"combined_loss": 0.66883784532547,
"completion_length": 462.8125,
"epoch": 0.026717557251908396,
"grad_norm": 2.2363080978393555,
"kl": 0.0,
"learning_rate": 9.97681865626201e-07,
"loss": 0.6688,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.017488718032837,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.229459524154663,
"speech_entropy": 2.178924083709717,
"speech_kl": 0.0,
"step": 84,
"text_entropy": 0.9698714017868042,
"text_kl": 0.0,
"total_entropy": 1.9314519166946411
},
{
"combined_loss": 0.7741247415542603,
"completion_length": 383.75,
"epoch": 0.02703562340966921,
"grad_norm": 2.050072193145752,
"kl": 0.0,
"learning_rate": 9.97608922698427e-07,
"loss": 0.7741,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.1404881477355957,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.580415725708008,
"speech_entropy": 2.2788493633270264,
"speech_kl": 0.0,
"step": 85,
"text_entropy": 0.9160740971565247,
"text_kl": 0.0,
"total_entropy": 2.0253312587738037
},
{
"combined_loss": 0.6860091090202332,
"completion_length": 373.125,
"epoch": 0.027353689567430027,
"grad_norm": 3.053793430328369,
"kl": 0.0,
"learning_rate": 9.975348529157229e-07,
"loss": 0.686,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.8944376111030579,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2866969108581543,
"speech_entropy": 2.2216386795043945,
"speech_kl": 0.0,
"step": 86,
"text_entropy": 0.9611391425132751,
"text_kl": 0.0,
"total_entropy": 1.9948720932006836
},
{
"combined_loss": 0.7096176147460938,
"completion_length": 320.625,
"epoch": 0.02767175572519084,
"grad_norm": 2.05188250541687,
"kl": 0.0,
"learning_rate": 9.974596564645598e-07,
"loss": 0.7096,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5520563125610352,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.365391969680786,
"speech_entropy": 2.3104031085968018,
"speech_kl": 0.0,
"step": 87,
"text_entropy": 1.122492790222168,
"text_kl": 0.0,
"total_entropy": 2.0940003395080566
},
{
"combined_loss": 0.6567751169204712,
"completion_length": 530.125,
"epoch": 0.027989821882951654,
"grad_norm": 1.9356623888015747,
"kl": 0.0,
"learning_rate": 9.973833335342446e-07,
"loss": 0.6568,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.807937741279602,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1892502307891846,
"speech_entropy": 2.227321147918701,
"speech_kl": 0.0,
"step": 88,
"text_entropy": 1.0134230852127075,
"text_kl": 0.0,
"total_entropy": 1.9499976634979248
},
{
"combined_loss": 0.7411503791809082,
"completion_length": 485.9375,
"epoch": 0.028307888040712468,
"grad_norm": 1.8700544834136963,
"kl": 0.0,
"learning_rate": 9.9730588431692e-07,
"loss": 0.7412,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": 5.587935447692871e-09,
"sft_loss": 2.47050142288208,
"speech_entropy": 2.357027530670166,
"speech_kl": 0.0,
"step": 89,
"text_entropy": 0.865372896194458,
"text_kl": 0.0,
"total_entropy": 2.083202600479126
},
{
"combined_loss": 0.6864016056060791,
"completion_length": 444.875,
"epoch": 0.02862595419847328,
"grad_norm": 1.8632216453552246,
"kl": 0.0,
"learning_rate": 9.972273090075645e-07,
"loss": 0.6864,
"num_samples": 1.0,
"reward": 2.90625,
"reward_std": 0.7911534309387207,
"rewards/gpt4o_holistic_reward": 2.90625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2880053520202637,
"speech_entropy": 2.133165121078491,
"speech_kl": 0.0,
"step": 90,
"text_entropy": 0.9260656833648682,
"text_kl": 0.0,
"total_entropy": 1.80254328250885
},
{
"combined_loss": 0.6964578628540039,
"completion_length": 433.9375,
"epoch": 0.028944020356234095,
"grad_norm": 1.954607367515564,
"kl": 0.0,
"learning_rate": 9.97147607803991e-07,
"loss": 0.6965,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.6983708143234253,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.321526050567627,
"speech_entropy": 2.222456455230713,
"speech_kl": 0.0,
"step": 91,
"text_entropy": 1.1704072952270508,
"text_kl": 0.0,
"total_entropy": 1.9853535890579224
},
{
"combined_loss": 0.7411965131759644,
"completion_length": 500.125,
"epoch": 0.029262086513994912,
"grad_norm": 2.046849250793457,
"kl": 0.0,
"learning_rate": 9.970667809068474e-07,
"loss": 0.7412,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.8678992986679077,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.4706549644470215,
"speech_entropy": 2.2015461921691895,
"speech_kl": 0.0,
"step": 92,
"text_entropy": 1.1552786827087402,
"text_kl": 0.0,
"total_entropy": 2.0065386295318604
},
{
"combined_loss": 0.7156788110733032,
"completion_length": 403.125,
"epoch": 0.029580152671755726,
"grad_norm": 2.1912262439727783,
"kl": 0.0,
"learning_rate": 9.969848285196157e-07,
"loss": 0.7157,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.4470233917236328,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3855957984924316,
"speech_entropy": 2.2147631645202637,
"speech_kl": 0.0,
"step": 93,
"text_entropy": 1.0130202770233154,
"text_kl": 0.0,
"total_entropy": 1.9480912685394287
},
{
"combined_loss": 0.660508394241333,
"completion_length": 303.5,
"epoch": 0.02989821882951654,
"grad_norm": 2.6118781566619873,
"kl": 0.0,
"learning_rate": 9.969017508486105e-07,
"loss": 0.6605,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 1.2233422994613647,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 0.0,
"sft_loss": 2.2016944885253906,
"speech_entropy": 2.45943284034729,
"speech_kl": 0.0,
"step": 94,
"text_entropy": 1.1381962299346924,
"text_kl": 0.0,
"total_entropy": 2.197706460952759
},
{
"combined_loss": 0.7968100309371948,
"completion_length": 326.6875,
"epoch": 0.030216284987277353,
"grad_norm": 2.501716375350952,
"kl": 0.0,
"learning_rate": 9.968175481029798e-07,
"loss": 0.7968,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.7887751460075378,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.656033515930176,
"speech_entropy": 2.389923334121704,
"speech_kl": 0.0,
"step": 95,
"text_entropy": 1.3742587566375732,
"text_kl": 0.0,
"total_entropy": 2.177299976348877
},
{
"combined_loss": 0.7844936847686768,
"completion_length": 325.375,
"epoch": 0.030534351145038167,
"grad_norm": 2.390887498855591,
"kl": 0.0,
"learning_rate": 9.967322204947038e-07,
"loss": 0.7845,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.3041632175445557,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.6149790287017822,
"speech_entropy": 2.296114206314087,
"speech_kl": 0.0,
"step": 96,
"text_entropy": 1.206712245941162,
"text_kl": 0.0,
"total_entropy": 2.0752415657043457
},
{
"combined_loss": 0.7896230220794678,
"completion_length": 317.3125,
"epoch": 0.030852417302798984,
"grad_norm": 2.363767147064209,
"kl": 0.0,
"learning_rate": 9.96645768238595e-07,
"loss": 0.7896,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 1.0173285007476807,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.6320767402648926,
"speech_entropy": 2.3654651641845703,
"speech_kl": 0.0,
"step": 97,
"text_entropy": 1.4019675254821777,
"text_kl": 0.0,
"total_entropy": 2.174586296081543
},
{
"combined_loss": 0.6473546028137207,
"completion_length": 494.8125,
"epoch": 0.031170483460559797,
"grad_norm": 1.823893427848816,
"kl": 0.0,
"learning_rate": 9.965581915522964e-07,
"loss": 0.6474,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.9981511235237122,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.157848358154297,
"speech_entropy": 2.229156494140625,
"speech_kl": 0.0,
"step": 98,
"text_entropy": 1.1541342735290527,
"text_kl": 0.0,
"total_entropy": 2.026672840118408
},
{
"combined_loss": 0.7282466292381287,
"completion_length": 373.5625,
"epoch": 0.03148854961832061,
"grad_norm": 2.1396214962005615,
"kl": 0.0,
"learning_rate": 9.964694906562826e-07,
"loss": 0.7282,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.5281319618225098,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 0.0,
"sft_loss": 2.4274885654449463,
"speech_entropy": 2.2445669174194336,
"speech_kl": 0.0,
"step": 99,
"text_entropy": 1.1222586631774902,
"text_kl": 0.0,
"total_entropy": 2.014863967895508
},
{
"combined_loss": 0.7340562343597412,
"completion_length": 360.875,
"epoch": 0.031806615776081425,
"grad_norm": 2.1381564140319824,
"kl": 0.0,
"learning_rate": 9.96379665773858e-07,
"loss": 0.7341,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.4468541145324707,
"speech_entropy": 2.292088270187378,
"speech_kl": 0.0,
"step": 100,
"text_entropy": 1.465188980102539,
"text_kl": 0.0,
"total_entropy": 2.1374363899230957
},
{
"combined_loss": 0.7976064682006836,
"completion_length": 296.25,
"epoch": 0.03212468193384224,
"grad_norm": 1.9820961952209473,
"kl": 0.0,
"learning_rate": 9.962887171311562e-07,
"loss": 0.7976,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.6586880683898926,
"speech_entropy": 2.284330368041992,
"speech_kl": 0.0,
"step": 101,
"text_entropy": 1.2345999479293823,
"text_kl": 0.0,
"total_entropy": 2.0803914070129395
},
{
"combined_loss": 0.6758935451507568,
"completion_length": 429.5625,
"epoch": 0.03244274809160305,
"grad_norm": 2.111237049102783,
"kl": 0.0,
"learning_rate": 9.961966449571407e-07,
"loss": 0.6759,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 1.5787245035171509,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": -1.862645149230957e-08,
"sft_loss": 2.2529783248901367,
"speech_entropy": 2.263260841369629,
"speech_kl": 0.0,
"step": 102,
"text_entropy": 0.8939234614372253,
"text_kl": 0.0,
"total_entropy": 1.9995882511138916
},
{
"combined_loss": 0.7361711263656616,
"completion_length": 395.9375,
"epoch": 0.03276081424936387,
"grad_norm": 1.850319743156433,
"kl": 0.0,
"learning_rate": 9.961034494836029e-07,
"loss": 0.7362,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.614456832408905,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4539036750793457,
"speech_entropy": 2.2334747314453125,
"speech_kl": 0.0,
"step": 103,
"text_entropy": 1.248520851135254,
"text_kl": 0.0,
"total_entropy": 2.0419020652770996
},
{
"combined_loss": 0.6475083827972412,
"completion_length": 475.625,
"epoch": 0.03307888040712468,
"grad_norm": 1.9739583730697632,
"kl": 0.0,
"learning_rate": 9.960091309451625e-07,
"loss": 0.6475,
"num_samples": 1.0,
"reward": 2.375,
"reward_std": 1.1752138137817383,
"rewards/gpt4o_holistic_reward": 2.375,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.1583614349365234,
"speech_entropy": 2.26711368560791,
"speech_kl": 0.0,
"step": 104,
"text_entropy": 0.8396031856536865,
"text_kl": 0.0,
"total_entropy": 1.9932773113250732
},
{
"combined_loss": 0.7815308570861816,
"completion_length": 394.0625,
"epoch": 0.033396946564885496,
"grad_norm": 2.9025793075561523,
"kl": 0.0,
"learning_rate": 9.95913689579266e-07,
"loss": 0.7815,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.704224169254303,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.6051025390625,
"speech_entropy": 2.256810188293457,
"speech_kl": 0.0,
"step": 105,
"text_entropy": 1.4908084869384766,
"text_kl": 0.0,
"total_entropy": 2.115090847015381
},
{
"combined_loss": 0.7154955863952637,
"completion_length": 475.25,
"epoch": 0.03371501272264631,
"grad_norm": 1.8907688856124878,
"kl": 0.0,
"learning_rate": 9.958171256261873e-07,
"loss": 0.7155,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.7587944269180298,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 5.587935447692871e-09,
"sft_loss": 2.3849852085113525,
"speech_entropy": 2.25828218460083,
"speech_kl": 0.0,
"step": 106,
"text_entropy": 1.0575703382492065,
"text_kl": 0.0,
"total_entropy": 2.039116144180298
},
{
"combined_loss": 0.6845788955688477,
"completion_length": 512.5625,
"epoch": 0.034033078880407124,
"grad_norm": 1.646696925163269,
"kl": 0.0,
"learning_rate": 9.957194393290259e-07,
"loss": 0.6846,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": 0.0,
"sft_loss": 2.2819294929504395,
"speech_entropy": 2.166106700897217,
"speech_kl": 0.0,
"step": 107,
"text_entropy": 0.6757108569145203,
"text_kl": 0.0,
"total_entropy": 1.8433654308319092
},
{
"combined_loss": 0.7150195837020874,
"completion_length": 491.6875,
"epoch": 0.03435114503816794,
"grad_norm": 1.833224892616272,
"kl": 0.0,
"learning_rate": 9.956206309337066e-07,
"loss": 0.715,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.5581126809120178,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 0.0,
"sft_loss": 2.3833985328674316,
"speech_entropy": 2.1860873699188232,
"speech_kl": 0.0,
"step": 108,
"text_entropy": 1.2409619092941284,
"text_kl": 0.0,
"total_entropy": 2.0030808448791504
},
{
"combined_loss": 0.6441072821617126,
"completion_length": 293.3125,
"epoch": 0.03466921119592875,
"grad_norm": 2.175922393798828,
"kl": 0.0,
"learning_rate": 9.9552070068898e-07,
"loss": 0.6441,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.147024154663086,
"speech_entropy": 2.2451610565185547,
"speech_kl": 0.0,
"step": 109,
"text_entropy": 1.1438672542572021,
"text_kl": 0.0,
"total_entropy": 2.044260025024414
},
{
"combined_loss": 0.6975011825561523,
"completion_length": 398.8125,
"epoch": 0.03498727735368957,
"grad_norm": 1.7699748277664185,
"kl": 0.0,
"learning_rate": 9.954196488464196e-07,
"loss": 0.6975,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.3250038623809814,
"speech_entropy": 2.3067374229431152,
"speech_kl": 0.0,
"step": 110,
"text_entropy": 1.097214937210083,
"text_kl": 0.0,
"total_entropy": 2.070528268814087
},
{
"combined_loss": 0.6993280649185181,
"completion_length": 462.0625,
"epoch": 0.035305343511450385,
"grad_norm": 1.6435576677322388,
"kl": 0.0,
"learning_rate": 9.953174756604242e-07,
"loss": 0.6993,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.8483423590660095,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.3310935497283936,
"speech_entropy": 2.2712697982788086,
"speech_kl": 0.0,
"step": 111,
"text_entropy": 0.8282650113105774,
"text_kl": 0.0,
"total_entropy": 1.9767301082611084
},
{
"combined_loss": 0.6761323809623718,
"completion_length": 294.25,
"epoch": 0.035623409669211195,
"grad_norm": 2.934861183166504,
"kl": 0.0,
"learning_rate": 9.95214181388214e-07,
"loss": 0.6761,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 1.14496648311615,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.253774404525757,
"speech_entropy": 2.4185285568237305,
"speech_kl": 0.0,
"step": 112,
"text_entropy": 1.202892541885376,
"text_kl": 0.0,
"total_entropy": 2.171070098876953
},
{
"combined_loss": 0.7770819067955017,
"completion_length": 376.625,
"epoch": 0.03594147582697201,
"grad_norm": 73.62300872802734,
"kl": 0.0,
"learning_rate": 9.951097662898325e-07,
"loss": 0.7771,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.590272903442383,
"speech_entropy": 2.2988834381103516,
"speech_kl": 0.0,
"step": 113,
"text_entropy": 1.125475287437439,
"text_kl": 0.0,
"total_entropy": 2.0705349445343018
},
{
"combined_loss": 0.6613748073577881,
"completion_length": 509.4375,
"epoch": 0.03625954198473282,
"grad_norm": 2.1758439540863037,
"kl": 0.0,
"learning_rate": 9.950042306281445e-07,
"loss": 0.6614,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 1.4686723947525024,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.204582691192627,
"speech_entropy": 2.27337908744812,
"speech_kl": 0.0,
"step": 114,
"text_entropy": 0.9519417881965637,
"text_kl": 0.0,
"total_entropy": 2.023104667663574
},
{
"combined_loss": 0.6832470893859863,
"completion_length": 496.5625,
"epoch": 0.03657760814249364,
"grad_norm": 9.72231674194336,
"kl": 0.0,
"learning_rate": 9.94897574668836e-07,
"loss": 0.6832,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.2774901390075684,
"speech_entropy": 2.29954195022583,
"speech_kl": 0.0,
"step": 115,
"text_entropy": 1.2875080108642578,
"text_kl": 0.0,
"total_entropy": 2.0839648246765137
},
{
"combined_loss": 0.6530779600143433,
"completion_length": 418.875,
"epoch": 0.03689567430025445,
"grad_norm": 2.1920166015625,
"kl": 0.0,
"learning_rate": 9.94789798680413e-07,
"loss": 0.6531,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 0.0,
"sft_loss": 2.176926612854004,
"speech_entropy": 2.3347601890563965,
"speech_kl": 0.0,
"step": 116,
"text_entropy": 1.2426201105117798,
"text_kl": 0.0,
"total_entropy": 2.1373891830444336
},
{
"combined_loss": 0.6800841689109802,
"completion_length": 381.5625,
"epoch": 0.03721374045801527,
"grad_norm": 2.2477047443389893,
"kl": 0.0,
"learning_rate": 9.94680902934202e-07,
"loss": 0.6801,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.9940415620803833,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.2669472694396973,
"speech_entropy": 2.3144116401672363,
"speech_kl": 0.0,
"step": 117,
"text_entropy": 1.3952128887176514,
"text_kl": 0.0,
"total_entropy": 2.1378135681152344
},
{
"combined_loss": 0.7205807566642761,
"completion_length": 266.75,
"epoch": 0.037531806615776084,
"grad_norm": 2.3268606662750244,
"kl": 0.0,
"learning_rate": 9.94570887704347e-07,
"loss": 0.7206,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 1.2288135290145874,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4019358158111572,
"speech_entropy": 2.3998208045959473,
"speech_kl": 0.0,
"step": 118,
"text_entropy": 1.6290910243988037,
"text_kl": 0.0,
"total_entropy": 2.2556941509246826
},
{
"combined_loss": 0.7098033428192139,
"completion_length": 314.0,
"epoch": 0.037849872773536894,
"grad_norm": 2.0043420791625977,
"kl": 0.0,
"learning_rate": 9.944597532678119e-07,
"loss": 0.7098,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -1.862645149230957e-08,
"sft_loss": 2.366011142730713,
"speech_entropy": 2.3302297592163086,
"speech_kl": 0.0,
"step": 119,
"text_entropy": 1.2116031646728516,
"text_kl": 0.0,
"total_entropy": 2.1241815090179443
},
{
"combined_loss": 0.7127311825752258,
"completion_length": 565.0625,
"epoch": 0.03816793893129771,
"grad_norm": 1.6876592636108398,
"kl": 0.0,
"learning_rate": 9.943474999043775e-07,
"loss": 0.7127,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.7235617637634277,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3757705688476562,
"speech_entropy": 2.1477317810058594,
"speech_kl": 0.0,
"step": 120,
"text_entropy": 0.8258525133132935,
"text_kl": 0.0,
"total_entropy": 1.8741722106933594
},
{
"combined_loss": 0.6518399715423584,
"completion_length": 460.875,
"epoch": 0.03848600508905852,
"grad_norm": 1.9149906635284424,
"kl": 0.0,
"learning_rate": 9.94234127896641e-07,
"loss": 0.6518,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 1.2335585355758667,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.172799825668335,
"speech_entropy": 2.247347593307495,
"speech_kl": 0.0,
"step": 121,
"text_entropy": 0.7082687020301819,
"text_kl": 0.0,
"total_entropy": 1.955895185470581
},
{
"combined_loss": 0.6932583451271057,
"completion_length": 415.9375,
"epoch": 0.03880407124681934,
"grad_norm": 1.803132176399231,
"kl": 0.0,
"learning_rate": 9.94119637530017e-07,
"loss": 0.6933,
"num_samples": 1.0,
"reward": 2.5625,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 2.5625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.310861110687256,
"speech_entropy": 2.2421672344207764,
"speech_kl": 0.0,
"step": 122,
"text_entropy": 1.031731367111206,
"text_kl": 0.0,
"total_entropy": 2.0151727199554443
},
{
"combined_loss": 0.6039379239082336,
"completion_length": 606.0625,
"epoch": 0.039122137404580155,
"grad_norm": 1.545535683631897,
"kl": 0.0,
"learning_rate": 9.940040290927343e-07,
"loss": 0.6039,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 1.2654881477355957,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.0131263732910156,
"speech_entropy": 2.1528759002685547,
"speech_kl": 0.0,
"step": 123,
"text_entropy": 0.7468642592430115,
"text_kl": 0.0,
"total_entropy": 1.8691860437393188
},
{
"combined_loss": 0.637839674949646,
"completion_length": 377.375,
"epoch": 0.039440203562340966,
"grad_norm": 1.8493309020996094,
"kl": 0.0,
"learning_rate": 9.938873028758374e-07,
"loss": 0.6378,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 1.0646765232086182,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.126132011413574,
"speech_entropy": 2.3589277267456055,
"speech_kl": 0.0,
"step": 124,
"text_entropy": 1.0232412815093994,
"text_kl": 0.0,
"total_entropy": 2.1352920532226562
},
{
"combined_loss": 0.6103811860084534,
"completion_length": 330.75,
"epoch": 0.03975826972010178,
"grad_norm": 1.9315379858016968,
"kl": 0.0,
"learning_rate": 9.93769459173184e-07,
"loss": 0.6104,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 2.60770320892334e-08,
"sft_loss": 2.0346038341522217,
"speech_entropy": 2.236910343170166,
"speech_kl": 0.0,
"step": 125,
"text_entropy": 0.7762662172317505,
"text_kl": 0.0,
"total_entropy": 1.9460570812225342
},
{
"combined_loss": 0.6840636730194092,
"completion_length": 393.25,
"epoch": 0.04007633587786259,
"grad_norm": 2.2317352294921875,
"kl": 0.0,
"learning_rate": 9.936504982814457e-07,
"loss": 0.6841,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.579224169254303,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.28021240234375,
"speech_entropy": 2.2817130088806152,
"speech_kl": 0.0,
"step": 126,
"text_entropy": 0.9976484775543213,
"text_kl": 0.0,
"total_entropy": 2.0223848819732666
},
{
"combined_loss": 0.6637799739837646,
"completion_length": 422.9375,
"epoch": 0.04039440203562341,
"grad_norm": 1.987329125404358,
"kl": 0.0,
"learning_rate": 9.935304205001066e-07,
"loss": 0.6638,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.9001989364624023,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.212599754333496,
"speech_entropy": 2.2056527137756348,
"speech_kl": 0.0,
"step": 127,
"text_entropy": 0.9696847200393677,
"text_kl": 0.0,
"total_entropy": 1.965734839439392
},
{
"combined_loss": 0.7613767981529236,
"completion_length": 356.6875,
"epoch": 0.04071246819338423,
"grad_norm": 2.8389971256256104,
"kl": 0.0,
"learning_rate": 9.934092261314617e-07,
"loss": 0.7614,
"num_samples": 1.0,
"reward": 2.375,
"reward_std": 1.0087943077087402,
"rewards/gpt4o_holistic_reward": 2.375,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.5379223823547363,
"speech_entropy": 2.238985538482666,
"speech_kl": 0.0,
"step": 128,
"text_entropy": 0.8710644841194153,
"text_kl": 0.0,
"total_entropy": 1.9456055164337158
},
{
"combined_loss": 0.7112630009651184,
"completion_length": 368.5,
"epoch": 0.04103053435114504,
"grad_norm": 1.772113561630249,
"kl": 0.0,
"learning_rate": 9.932869154806185e-07,
"loss": 0.7113,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3708765506744385,
"speech_entropy": 2.2802562713623047,
"speech_kl": 0.0,
"step": 129,
"text_entropy": 1.0544356107711792,
"text_kl": 0.0,
"total_entropy": 2.0886013507843018
},
{
"combined_loss": 0.6580800414085388,
"completion_length": 434.4375,
"epoch": 0.041348600508905854,
"grad_norm": 1.6276386976242065,
"kl": 0.0,
"learning_rate": 9.931634888554935e-07,
"loss": 0.6581,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.28877514600753784,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1935999393463135,
"speech_entropy": 2.217723846435547,
"speech_kl": 0.0,
"step": 130,
"text_entropy": 0.9710134267807007,
"text_kl": 0.0,
"total_entropy": 1.9742225408554077
},
{
"combined_loss": 0.7769891023635864,
"completion_length": 299.3125,
"epoch": 0.041666666666666664,
"grad_norm": 2.6481971740722656,
"kl": 0.0,
"learning_rate": 9.930389465668132e-07,
"loss": 0.777,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 1.250100016593933,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.589963436126709,
"speech_entropy": 2.583065986633301,
"speech_kl": 0.0,
"step": 131,
"text_entropy": 1.1438889503479004,
"text_kl": 0.0,
"total_entropy": 2.3853845596313477
},
{
"combined_loss": 0.6638728380203247,
"completion_length": 330.6875,
"epoch": 0.04198473282442748,
"grad_norm": 1.9016973972320557,
"kl": 0.0,
"learning_rate": 9.929132889281126e-07,
"loss": 0.6639,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 0.0,
"sft_loss": 2.212909460067749,
"speech_entropy": 2.2528860569000244,
"speech_kl": 0.0,
"step": 132,
"text_entropy": 1.0528066158294678,
"text_kl": 0.0,
"total_entropy": 2.020822525024414
},
{
"combined_loss": 0.7390019297599792,
"completion_length": 362.9375,
"epoch": 0.0423027989821883,
"grad_norm": 1.9329123497009277,
"kl": 0.0,
"learning_rate": 9.927865162557345e-07,
"loss": 0.739,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4633398056030273,
"speech_entropy": 2.1905789375305176,
"speech_kl": 0.0,
"step": 133,
"text_entropy": 1.1139158010482788,
"text_kl": 0.0,
"total_entropy": 1.9759125709533691
},
{
"combined_loss": 0.8078758716583252,
"completion_length": 340.375,
"epoch": 0.04262086513994911,
"grad_norm": 2.5768580436706543,
"kl": 0.0,
"learning_rate": 9.926586288688295e-07,
"loss": 0.8079,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 1.0792241096496582,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.6929194927215576,
"speech_entropy": 2.2723331451416016,
"speech_kl": 0.0,
"step": 134,
"text_entropy": 1.9749469757080078,
"text_kl": 0.0,
"total_entropy": 2.219290256500244
},
{
"combined_loss": 0.6676887273788452,
"completion_length": 364.0625,
"epoch": 0.042938931297709926,
"grad_norm": 1.688926339149475,
"kl": 0.0,
"learning_rate": 9.925296270893531e-07,
"loss": 0.6677,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.8538135886192322,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.2256290912628174,
"speech_entropy": 2.1805365085601807,
"speech_kl": 0.0,
"step": 135,
"text_entropy": 0.9880690574645996,
"text_kl": 0.0,
"total_entropy": 1.9571375846862793
},
{
"combined_loss": 0.7317559123039246,
"completion_length": 442.375,
"epoch": 0.043256997455470736,
"grad_norm": 2.035095691680908,
"kl": 0.0,
"learning_rate": 9.923995112420679e-07,
"loss": 0.7318,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4391863346099854,
"speech_entropy": 2.255812644958496,
"speech_kl": 0.0,
"step": 136,
"text_entropy": 1.4666073322296143,
"text_kl": 0.0,
"total_entropy": 2.107949733734131
},
{
"combined_loss": 0.624023973941803,
"completion_length": 389.125,
"epoch": 0.04357506361323155,
"grad_norm": 2.0001988410949707,
"kl": 0.0,
"learning_rate": 9.922682816545399e-07,
"loss": 0.624,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.9396764636039734,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0800797939300537,
"speech_entropy": 2.1931605339050293,
"speech_kl": 0.0,
"step": 137,
"text_entropy": 1.0324208736419678,
"text_kl": 0.0,
"total_entropy": 2.005070924758911
},
{
"combined_loss": 0.6598743796348572,
"completion_length": 486.0625,
"epoch": 0.04389312977099236,
"grad_norm": 2.193516731262207,
"kl": 0.0,
"learning_rate": 9.9213593865714e-07,
"loss": 0.6599,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 1.183112621307373,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1995811462402344,
"speech_entropy": 2.289968252182007,
"speech_kl": 0.0,
"step": 138,
"text_entropy": 1.0836520195007324,
"text_kl": 0.0,
"total_entropy": 2.0700204372406006
},
{
"combined_loss": 0.6657881736755371,
"completion_length": 326.8125,
"epoch": 0.04421119592875318,
"grad_norm": 1.7214406728744507,
"kl": 0.0,
"learning_rate": 9.920024825830406e-07,
"loss": 0.6658,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.3146764636039734,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.2192935943603516,
"speech_entropy": 2.253314971923828,
"speech_kl": 0.0,
"step": 139,
"text_entropy": 1.1099263429641724,
"text_kl": 0.0,
"total_entropy": 2.0267200469970703
},
{
"combined_loss": 0.6005537509918213,
"completion_length": 402.875,
"epoch": 0.044529262086514,
"grad_norm": 2.0783207416534424,
"kl": 0.0,
"learning_rate": 9.91867913768218e-07,
"loss": 0.6006,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.0774502754211426,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0018458366394043,
"speech_entropy": 2.27315354347229,
"speech_kl": 0.0,
"step": 140,
"text_entropy": 1.074316143989563,
"text_kl": 0.0,
"total_entropy": 2.044334888458252
},
{
"combined_loss": 0.6734490394592285,
"completion_length": 382.6875,
"epoch": 0.04484732824427481,
"grad_norm": 1.4923752546310425,
"kl": 0.0,
"learning_rate": 9.917322325514487e-07,
"loss": 0.6734,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.4565354883670807,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 0.0,
"sft_loss": 2.2448298931121826,
"speech_entropy": 2.2860617637634277,
"speech_kl": 0.0,
"step": 141,
"text_entropy": 0.9306349754333496,
"text_kl": 0.0,
"total_entropy": 2.0320706367492676
},
{
"combined_loss": 0.6445150375366211,
"completion_length": 401.875,
"epoch": 0.045165394402035625,
"grad_norm": 2.9365267753601074,
"kl": 0.0,
"learning_rate": 9.915954392743102e-07,
"loss": 0.6445,
"num_samples": 1.0,
"reward": 2.25,
"reward_std": 0.9002986550331116,
"rewards/gpt4o_holistic_reward": 2.25,
"rl_loss": -1.862645149230957e-08,
"sft_loss": 2.148383617401123,
"speech_entropy": 2.395698070526123,
"speech_kl": 0.0,
"step": 142,
"text_entropy": 1.1935876607894897,
"text_kl": 0.0,
"total_entropy": 2.16239070892334
},
{
"combined_loss": 0.6403375864028931,
"completion_length": 475.5,
"epoch": 0.045483460559796435,
"grad_norm": 1.8146846294403076,
"kl": 0.0,
"learning_rate": 9.914575342811792e-07,
"loss": 0.6403,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 1.096787929534912,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": 0.0,
"sft_loss": 2.134458541870117,
"speech_entropy": 2.208829641342163,
"speech_kl": 0.0,
"step": 143,
"text_entropy": 0.6652034521102905,
"text_kl": 0.0,
"total_entropy": 1.8854793310165405
},
{
"combined_loss": 0.6461049914360046,
"completion_length": 507.3125,
"epoch": 0.04580152671755725,
"grad_norm": 1.7192074060440063,
"kl": 0.0,
"learning_rate": 9.913185179192316e-07,
"loss": 0.6461,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.8536533713340759,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1536831855773926,
"speech_entropy": 2.2175731658935547,
"speech_kl": 0.0,
"step": 144,
"text_entropy": 0.8226357698440552,
"text_kl": 0.0,
"total_entropy": 1.9434648752212524
},
{
"combined_loss": 0.6483294367790222,
"completion_length": 542.5,
"epoch": 0.04611959287531807,
"grad_norm": 1.73550546169281,
"kl": 0.0,
"learning_rate": 9.911783905384405e-07,
"loss": 0.6483,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.9137751460075378,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": -1.862645149230957e-08,
"sft_loss": 2.161098003387451,
"speech_entropy": 2.1762547492980957,
"speech_kl": 0.0,
"step": 145,
"text_entropy": 1.2476625442504883,
"text_kl": 0.0,
"total_entropy": 2.0006935596466064
},
{
"combined_loss": 0.6429945230484009,
"completion_length": 354.8125,
"epoch": 0.04643765903307888,
"grad_norm": 1.9572664499282837,
"kl": 0.0,
"learning_rate": 9.910371524915768e-07,
"loss": 0.643,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.6229909658432007,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.143314838409424,
"speech_entropy": 2.268470287322998,
"speech_kl": 0.0,
"step": 146,
"text_entropy": 0.9877128601074219,
"text_kl": 0.0,
"total_entropy": 2.013819694519043
},
{
"combined_loss": 0.725261926651001,
"completion_length": 361.375,
"epoch": 0.046755725190839696,
"grad_norm": 2.1626226902008057,
"kl": 0.0,
"learning_rate": 9.908948041342072e-07,
"loss": 0.7253,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.1963939666748047,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.417539596557617,
"speech_entropy": 2.394669532775879,
"speech_kl": 0.0,
"step": 147,
"text_entropy": 1.3553166389465332,
"text_kl": 0.0,
"total_entropy": 2.1874074935913086
},
{
"combined_loss": 0.7557258009910583,
"completion_length": 520.1875,
"epoch": 0.047073791348600506,
"grad_norm": 1.967831015586853,
"kl": 0.0,
"learning_rate": 9.907513458246934e-07,
"loss": 0.7557,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.9733423590660095,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.5190858840942383,
"speech_entropy": 2.2636446952819824,
"speech_kl": 0.0,
"step": 148,
"text_entropy": 1.526688814163208,
"text_kl": 0.0,
"total_entropy": 2.1319243907928467
},
{
"combined_loss": 0.6749532222747803,
"completion_length": 338.5,
"epoch": 0.04739185750636132,
"grad_norm": 1.4147047996520996,
"kl": 0.0,
"learning_rate": 9.90606777924191e-07,
"loss": 0.675,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2498438358306885,
"speech_entropy": 2.2806315422058105,
"speech_kl": 0.0,
"step": 149,
"text_entropy": 0.9889430999755859,
"text_kl": 0.0,
"total_entropy": 2.014362096786499
},
{
"combined_loss": 0.7438491582870483,
"completion_length": 432.8125,
"epoch": 0.04770992366412214,
"grad_norm": 1.7533581256866455,
"kl": 0.0,
"learning_rate": 9.904611007966504e-07,
"loss": 0.7438,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.479497194290161,
"speech_entropy": 2.2352869510650635,
"speech_kl": 0.0,
"step": 150,
"text_entropy": 1.373764991760254,
"text_kl": 0.0,
"total_entropy": 2.0701396465301514
},
{
"combined_loss": 0.6471817493438721,
"completion_length": 439.8125,
"epoch": 0.04802798982188295,
"grad_norm": 1.6037753820419312,
"kl": 0.0,
"learning_rate": 9.90314314808813e-07,
"loss": 0.6472,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1572723388671875,
"speech_entropy": 2.206624746322632,
"speech_kl": 0.0,
"step": 151,
"text_entropy": 1.0710368156433105,
"text_kl": 0.0,
"total_entropy": 1.9989676475524902
},
{
"combined_loss": 0.713241696357727,
"completion_length": 396.6875,
"epoch": 0.04834605597964377,
"grad_norm": 1.923511266708374,
"kl": 0.0,
"learning_rate": 9.901664203302124e-07,
"loss": 0.7132,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.8536533713340759,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.377472162246704,
"speech_entropy": 2.1870193481445312,
"speech_kl": 0.0,
"step": 152,
"text_entropy": 1.3757061958312988,
"text_kl": 0.0,
"total_entropy": 2.0416908264160156
},
{
"combined_loss": 0.671237587928772,
"completion_length": 504.0,
"epoch": 0.04866412213740458,
"grad_norm": 2.226810932159424,
"kl": 0.0,
"learning_rate": 9.90017417733173e-07,
"loss": 0.6712,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 1.125100016593933,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2374584674835205,
"speech_entropy": 2.2338881492614746,
"speech_kl": 0.0,
"step": 153,
"text_entropy": 1.1020745038986206,
"text_kl": 0.0,
"total_entropy": 2.044203758239746
},
{
"combined_loss": 0.6561381816864014,
"completion_length": 230.1875,
"epoch": 0.048982188295165395,
"grad_norm": 2.5940654277801514,
"kl": 0.0,
"learning_rate": 9.898673073928087e-07,
"loss": 0.6561,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.9524502158164978,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 0.0,
"sft_loss": 2.187127113342285,
"speech_entropy": 2.3985471725463867,
"speech_kl": 0.0,
"step": 154,
"text_entropy": 1.0082178115844727,
"text_kl": 0.0,
"total_entropy": 2.182006359100342
},
{
"combined_loss": 0.73064124584198,
"completion_length": 516.375,
"epoch": 0.04930025445292621,
"grad_norm": 1.78936767578125,
"kl": 0.0,
"learning_rate": 9.897160896870217e-07,
"loss": 0.7306,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 9.313225746154785e-09,
"sft_loss": 2.4354705810546875,
"speech_entropy": 2.2467494010925293,
"speech_kl": 0.0,
"step": 155,
"text_entropy": 1.0586869716644287,
"text_kl": 0.0,
"total_entropy": 2.0154693126678467
},
{
"combined_loss": 0.721272349357605,
"completion_length": 523.8125,
"epoch": 0.04961832061068702,
"grad_norm": 2.0883450508117676,
"kl": 0.0,
"learning_rate": 9.895637649965028e-07,
"loss": 0.7213,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 1.1250998973846436,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4042410850524902,
"speech_entropy": 2.18511700630188,
"speech_kl": 0.0,
"step": 156,
"text_entropy": 1.0734648704528809,
"text_kl": 0.0,
"total_entropy": 1.961314082145691
},
{
"combined_loss": 0.6512343883514404,
"completion_length": 284.4375,
"epoch": 0.04993638676844784,
"grad_norm": 2.0043251514434814,
"kl": 0.0,
"learning_rate": 9.89410333704729e-07,
"loss": 0.6512,
"num_samples": 1.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 4.8125,
"rl_loss": 0.0,
"sft_loss": 2.170781135559082,
"speech_entropy": 2.359740734100342,
"speech_kl": 0.0,
"step": 157,
"text_entropy": 1.0422818660736084,
"text_kl": 0.0,
"total_entropy": 2.0986175537109375
},
{
"combined_loss": 0.6856322884559631,
"completion_length": 369.625,
"epoch": 0.05025445292620865,
"grad_norm": 2.0799102783203125,
"kl": 0.0,
"learning_rate": 9.892557961979634e-07,
"loss": 0.6856,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 1.183112621307373,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.2854409217834473,
"speech_entropy": 2.2864654064178467,
"speech_kl": 0.0,
"step": 158,
"text_entropy": 1.1000885963439941,
"text_kl": 0.0,
"total_entropy": 2.0594735145568848
},
{
"combined_loss": 0.6959141492843628,
"completion_length": 417.0625,
"epoch": 0.05057251908396947,
"grad_norm": 1.7902482748031616,
"kl": 0.0,
"learning_rate": 9.891001528652542e-07,
"loss": 0.6959,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.7042241096496582,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.319713592529297,
"speech_entropy": 2.2585508823394775,
"speech_kl": 0.0,
"step": 159,
"text_entropy": 1.5893566608428955,
"text_kl": 0.0,
"total_entropy": 2.1322367191314697
},
{
"combined_loss": 0.7123146057128906,
"completion_length": 478.875,
"epoch": 0.05089058524173028,
"grad_norm": 1.8146113157272339,
"kl": 0.0,
"learning_rate": 9.889434040984331e-07,
"loss": 0.7123,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.8020563125610352,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 1.30385160446167e-08,
"sft_loss": 2.3743817806243896,
"speech_entropy": 2.307361602783203,
"speech_kl": 0.0,
"step": 160,
"text_entropy": 1.4639774560928345,
"text_kl": 0.0,
"total_entropy": 2.144629955291748
},
{
"combined_loss": 0.6143687963485718,
"completion_length": 426.8125,
"epoch": 0.051208651399491094,
"grad_norm": 1.7007486820220947,
"kl": 0.0,
"learning_rate": 9.88785550292115e-07,
"loss": 0.6144,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.1308612823486328,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.047895908355713,
"speech_entropy": 2.206613540649414,
"speech_kl": 0.0,
"step": 161,
"text_entropy": 1.1258774995803833,
"text_kl": 0.0,
"total_entropy": 1.9874699115753174
},
{
"combined_loss": 0.7055187821388245,
"completion_length": 364.375,
"epoch": 0.05152671755725191,
"grad_norm": 1.8136767148971558,
"kl": 0.0,
"learning_rate": 9.886265918436966e-07,
"loss": 0.7055,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.8920267820358276,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.351729154586792,
"speech_entropy": 2.2987916469573975,
"speech_kl": 0.0,
"step": 162,
"text_entropy": 1.06236732006073,
"text_kl": 0.0,
"total_entropy": 2.0641303062438965
},
{
"combined_loss": 0.61933434009552,
"completion_length": 491.375,
"epoch": 0.05184478371501272,
"grad_norm": 1.4536187648773193,
"kl": 0.0,
"learning_rate": 9.88466529153356e-07,
"loss": 0.6193,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.0644476413726807,
"speech_entropy": 2.1868205070495605,
"speech_kl": 0.0,
"step": 163,
"text_entropy": 0.7075154781341553,
"text_kl": 0.0,
"total_entropy": 1.885782241821289
},
{
"combined_loss": 0.6253555417060852,
"completion_length": 537.5625,
"epoch": 0.05216284987277354,
"grad_norm": 1.6687992811203003,
"kl": 0.0,
"learning_rate": 9.883053626240501e-07,
"loss": 0.6254,
"num_samples": 1.0,
"reward": 2.5625,
"reward_std": 0.829224169254303,
"rewards/gpt4o_holistic_reward": 2.5625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.0845184326171875,
"speech_entropy": 2.169955015182495,
"speech_kl": 0.0,
"step": 164,
"text_entropy": 0.7777245044708252,
"text_kl": 0.0,
"total_entropy": 1.884574055671692
},
{
"combined_loss": 0.6416522264480591,
"completion_length": 443.125,
"epoch": 0.05248091603053435,
"grad_norm": 2.1316330432891846,
"kl": 0.0,
"learning_rate": 9.88143092661516e-07,
"loss": 0.6417,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.138840675354004,
"speech_entropy": 2.191802501678467,
"speech_kl": 0.0,
"step": 165,
"text_entropy": 0.8017352819442749,
"text_kl": 0.0,
"total_entropy": 1.9171819686889648
},
{
"combined_loss": 0.6333335638046265,
"completion_length": 309.625,
"epoch": 0.052798982188295165,
"grad_norm": 1.602042317390442,
"kl": 0.0,
"learning_rate": 9.87979719674268e-07,
"loss": 0.6333,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 0.0,
"sft_loss": 2.111111640930176,
"speech_entropy": 2.29636549949646,
"speech_kl": 0.0,
"step": 166,
"text_entropy": 1.0871713161468506,
"text_kl": 0.0,
"total_entropy": 2.0691018104553223
},
{
"combined_loss": 0.753852903842926,
"completion_length": 504.5625,
"epoch": 0.05311704834605598,
"grad_norm": 1.7408075332641602,
"kl": 0.0,
"learning_rate": 9.878152440735971e-07,
"loss": 0.7539,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.7286534309387207,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.512842893600464,
"speech_entropy": 2.2807769775390625,
"speech_kl": 0.0,
"step": 167,
"text_entropy": 1.8450149297714233,
"text_kl": 0.0,
"total_entropy": 2.1998775005340576
},
{
"combined_loss": 0.7053524255752563,
"completion_length": 346.375,
"epoch": 0.05343511450381679,
"grad_norm": 1.8231476545333862,
"kl": 0.0,
"learning_rate": 9.876496662735711e-07,
"loss": 0.7054,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3511745929718018,
"speech_entropy": 2.353151559829712,
"speech_kl": 0.0,
"step": 168,
"text_entropy": 1.3969731330871582,
"text_kl": 0.0,
"total_entropy": 2.1840548515319824
},
{
"combined_loss": 0.6700061559677124,
"completion_length": 525.125,
"epoch": 0.05375318066157761,
"grad_norm": 1.6231902837753296,
"kl": 0.0,
"learning_rate": 9.874829866910313e-07,
"loss": 0.67,
"num_samples": 1.0,
"reward": 2.8125,
"reward_std": 0.6637752056121826,
"rewards/gpt4o_holistic_reward": 2.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.233353853225708,
"speech_entropy": 2.205387592315674,
"speech_kl": 0.0,
"step": 169,
"text_entropy": 0.9943252801895142,
"text_kl": 0.0,
"total_entropy": 1.9630248546600342
},
{
"combined_loss": 0.6313989162445068,
"completion_length": 331.25,
"epoch": 0.05407124681933842,
"grad_norm": 1.6013562679290771,
"kl": 0.0,
"learning_rate": 9.873152057455938e-07,
"loss": 0.6314,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1046628952026367,
"speech_entropy": 2.343129873275757,
"speech_kl": 0.0,
"step": 170,
"text_entropy": 0.897804856300354,
"text_kl": 0.0,
"total_entropy": 2.0444600582122803
},
{
"combined_loss": 0.6534004211425781,
"completion_length": 617.3125,
"epoch": 0.05438931297709924,
"grad_norm": 1.932045578956604,
"kl": 0.0,
"learning_rate": 9.871463238596464e-07,
"loss": 0.6534,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.1780014038085938,
"speech_entropy": 2.1928927898406982,
"speech_kl": 0.0,
"step": 171,
"text_entropy": 0.8436852693557739,
"text_kl": 0.0,
"total_entropy": 1.9311038255691528
},
{
"combined_loss": 0.6976642608642578,
"completion_length": 512.0625,
"epoch": 0.054707379134860054,
"grad_norm": 1.7346135377883911,
"kl": 0.0,
"learning_rate": 9.869763414583495e-07,
"loss": 0.6977,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.5646764636039734,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.325547218322754,
"speech_entropy": 2.1841320991516113,
"speech_kl": 0.0,
"step": 172,
"text_entropy": 1.026000738143921,
"text_kl": 0.0,
"total_entropy": 1.9609410762786865
},
{
"combined_loss": 0.7238253355026245,
"completion_length": 598.0625,
"epoch": 0.055025445292620864,
"grad_norm": 1.9262490272521973,
"kl": 0.0,
"learning_rate": 9.868052589696336e-07,
"loss": 0.7238,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.4928992986679077,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4127509593963623,
"speech_entropy": 2.1995720863342285,
"speech_kl": 0.0,
"step": 173,
"text_entropy": 1.5878872871398926,
"text_kl": 0.0,
"total_entropy": 2.0725741386413574
},
{
"combined_loss": 0.6825557947158813,
"completion_length": 435.9375,
"epoch": 0.05534351145038168,
"grad_norm": 1.9291430711746216,
"kl": 0.0,
"learning_rate": 9.866330768241983e-07,
"loss": 0.6826,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.7548449039459229,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2751858234405518,
"speech_entropy": 2.231337070465088,
"speech_kl": 0.0,
"step": 174,
"text_entropy": 0.8735387325286865,
"text_kl": 0.0,
"total_entropy": 1.9939302206039429
},
{
"combined_loss": 0.7245073914527893,
"completion_length": 432.0,
"epoch": 0.05566157760814249,
"grad_norm": 2.2042038440704346,
"kl": 0.0,
"learning_rate": 9.864597954555122e-07,
"loss": 0.7245,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.6403796672821045,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.415024757385254,
"speech_entropy": 2.3350276947021484,
"speech_kl": 0.0,
"step": 175,
"text_entropy": 1.081442952156067,
"text_kl": 0.0,
"total_entropy": 2.1579785346984863
},
{
"combined_loss": 0.7414748668670654,
"completion_length": 358.9375,
"epoch": 0.05597964376590331,
"grad_norm": 1.7152729034423828,
"kl": 0.0,
"learning_rate": 9.86285415299811e-07,
"loss": 0.7415,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.4715828895568848,
"speech_entropy": 2.273726463317871,
"speech_kl": 0.0,
"step": 176,
"text_entropy": 1.5496562719345093,
"text_kl": 0.0,
"total_entropy": 2.137073040008545
},
{
"combined_loss": 0.6711795330047607,
"completion_length": 557.6875,
"epoch": 0.05629770992366412,
"grad_norm": 1.7282371520996094,
"kl": 0.0,
"learning_rate": 9.861099367960964e-07,
"loss": 0.6712,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 1.1298449039459229,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.237265110015869,
"speech_entropy": 2.1773886680603027,
"speech_kl": 0.0,
"step": 177,
"text_entropy": 0.8734384775161743,
"text_kl": 0.0,
"total_entropy": 1.9134694337844849
},
{
"combined_loss": 0.7701910138130188,
"completion_length": 591.25,
"epoch": 0.056615776081424936,
"grad_norm": 1.7133898735046387,
"kl": 0.0,
"learning_rate": 9.859333603861353e-07,
"loss": 0.7702,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.56730318069458,
"speech_entropy": 2.189483165740967,
"speech_kl": 0.0,
"step": 178,
"text_entropy": 1.366645097732544,
"text_kl": 0.0,
"total_entropy": 2.039158582687378
},
{
"combined_loss": 0.5972847938537598,
"completion_length": 284.375,
"epoch": 0.05693384223918575,
"grad_norm": 2.5166637897491455,
"kl": 0.0,
"learning_rate": 9.857556865144585e-07,
"loss": 0.5973,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 1.7565135955810547,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 1.990949273109436,
"speech_entropy": 2.3300843238830566,
"speech_kl": 0.0,
"step": 179,
"text_entropy": 0.771490752696991,
"text_kl": 0.0,
"total_entropy": 2.0337142944335938
},
{
"combined_loss": 0.6814907193183899,
"completion_length": 378.6875,
"epoch": 0.05725190839694656,
"grad_norm": 1.7109546661376953,
"kl": 0.0,
"learning_rate": 9.855769156283603e-07,
"loss": 0.6815,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2716357707977295,
"speech_entropy": 2.227330207824707,
"speech_kl": 0.0,
"step": 180,
"text_entropy": 1.1663092374801636,
"text_kl": 0.0,
"total_entropy": 2.016590118408203
},
{
"combined_loss": 0.5935865640640259,
"completion_length": 368.4375,
"epoch": 0.05756997455470738,
"grad_norm": 1.8667573928833008,
"kl": 0.0,
"learning_rate": 9.853970481778956e-07,
"loss": 0.5936,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 1.9786219596862793,
"speech_entropy": 2.1767773628234863,
"speech_kl": 0.0,
"step": 181,
"text_entropy": 0.6917097568511963,
"text_kl": 0.0,
"total_entropy": 1.8641599416732788
},
{
"combined_loss": 0.716184675693512,
"completion_length": 458.625,
"epoch": 0.05788804071246819,
"grad_norm": 1.799391508102417,
"kl": 0.0,
"learning_rate": 9.852160846158806e-07,
"loss": 0.7162,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.454224169254303,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.387282133102417,
"speech_entropy": 2.258867025375366,
"speech_kl": 0.0,
"step": 182,
"text_entropy": 1.5640207529067993,
"text_kl": 0.0,
"total_entropy": 2.1286048889160156
},
{
"combined_loss": 0.7314043045043945,
"completion_length": 255.3125,
"epoch": 0.05820610687022901,
"grad_norm": 2.2445545196533203,
"kl": 0.0,
"learning_rate": 9.850340253978911e-07,
"loss": 0.7314,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 1.125100016593933,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.438014268875122,
"speech_entropy": 2.723639726638794,
"speech_kl": 0.0,
"step": 183,
"text_entropy": 0.9034126400947571,
"text_kl": 0.0,
"total_entropy": 2.436861991882324
},
{
"combined_loss": 0.6868402361869812,
"completion_length": 410.8125,
"epoch": 0.058524173027989825,
"grad_norm": 1.6614896059036255,
"kl": 0.0,
"learning_rate": 9.848508709822607e-07,
"loss": 0.6868,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.2894670963287354,
"speech_entropy": 2.232311964035034,
"speech_kl": 0.0,
"step": 184,
"text_entropy": 1.3743724822998047,
"text_kl": 0.0,
"total_entropy": 2.072235584259033
},
{
"combined_loss": 0.6055930852890015,
"completion_length": 391.9375,
"epoch": 0.058842239185750635,
"grad_norm": 1.8519771099090576,
"kl": 0.0,
"learning_rate": 9.846666218300807e-07,
"loss": 0.6056,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.9712333679199219,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.018643379211426,
"speech_entropy": 2.2869997024536133,
"speech_kl": 0.0,
"step": 185,
"text_entropy": 0.664115846157074,
"text_kl": 0.0,
"total_entropy": 1.9729156494140625
},
{
"combined_loss": 0.6875672936439514,
"completion_length": 485.3125,
"epoch": 0.05916030534351145,
"grad_norm": 2.1656157970428467,
"kl": 0.0,
"learning_rate": 9.844812784051978e-07,
"loss": 0.6876,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.9129188060760498,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.291890859603882,
"speech_entropy": 2.182652473449707,
"speech_kl": 0.0,
"step": 186,
"text_entropy": 0.9604874849319458,
"text_kl": 0.0,
"total_entropy": 1.9440966844558716
},
{
"combined_loss": 0.6967759728431702,
"completion_length": 383.75,
"epoch": 0.05947837150127226,
"grad_norm": 1.8742365837097168,
"kl": 0.0,
"learning_rate": 9.84294841174214e-07,
"loss": 0.6968,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 1.0308762788772583,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.3225862979888916,
"speech_entropy": 2.2666873931884766,
"speech_kl": 0.0,
"step": 187,
"text_entropy": 1.5268785953521729,
"text_kl": 0.0,
"total_entropy": 2.1126818656921387
},
{
"combined_loss": 0.6987001895904541,
"completion_length": 427.75,
"epoch": 0.05979643765903308,
"grad_norm": 1.6652710437774658,
"kl": 0.0,
"learning_rate": 9.841073106064852e-07,
"loss": 0.6987,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.5387751460075378,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.329000473022461,
"speech_entropy": 2.187225103378296,
"speech_kl": 0.0,
"step": 188,
"text_entropy": 1.3647143840789795,
"text_kl": 0.0,
"total_entropy": 2.0151681900024414
},
{
"combined_loss": 0.6457899808883667,
"completion_length": 442.75,
"epoch": 0.060114503816793896,
"grad_norm": 3.32289719581604,
"kl": 0.0,
"learning_rate": 9.839186871741186e-07,
"loss": 0.6458,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.8536533117294312,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.1526331901550293,
"speech_entropy": 2.330132007598877,
"speech_kl": 0.0,
"step": 189,
"text_entropy": 1.0898842811584473,
"text_kl": 0.0,
"total_entropy": 2.0880885124206543
},
{
"combined_loss": 0.7040784955024719,
"completion_length": 552.4375,
"epoch": 0.060432569974554706,
"grad_norm": 1.7851852178573608,
"kl": 0.0,
"learning_rate": 9.83728971351974e-07,
"loss": 0.7041,
"num_samples": 1.0,
"reward": 2.25,
"reward_std": 0.6444376111030579,
"rewards/gpt4o_holistic_reward": 2.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.346928119659424,
"speech_entropy": 2.2672841548919678,
"speech_kl": 0.0,
"step": 190,
"text_entropy": 1.2936159372329712,
"text_kl": 0.0,
"total_entropy": 2.0743160247802734
},
{
"combined_loss": 0.6943396329879761,
"completion_length": 251.625,
"epoch": 0.06075063613231552,
"grad_norm": 1.8841829299926758,
"kl": 0.0,
"learning_rate": 9.835381636176605e-07,
"loss": 0.6943,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 1.0983422994613647,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.3144655227661133,
"speech_entropy": 2.4599857330322266,
"speech_kl": 0.0,
"step": 191,
"text_entropy": 1.357433557510376,
"text_kl": 0.0,
"total_entropy": 2.25075101852417
},
{
"combined_loss": 0.6926529407501221,
"completion_length": 340.9375,
"epoch": 0.061068702290076333,
"grad_norm": 1.9702279567718506,
"kl": 0.0,
"learning_rate": 9.833462644515366e-07,
"loss": 0.6927,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3088431358337402,
"speech_entropy": 2.233442783355713,
"speech_kl": 0.0,
"step": 192,
"text_entropy": 1.2242528200149536,
"text_kl": 0.0,
"total_entropy": 2.0358939170837402
},
{
"combined_loss": 0.6026841998100281,
"completion_length": 382.8125,
"epoch": 0.06138676844783715,
"grad_norm": 1.9015692472457886,
"kl": 0.0,
"learning_rate": 9.83153274336708e-07,
"loss": 0.6027,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.0089473724365234,
"speech_entropy": 2.1782498359680176,
"speech_kl": 0.0,
"step": 193,
"text_entropy": 0.8885223269462585,
"text_kl": 0.0,
"total_entropy": 1.9228941202163696
},
{
"combined_loss": 0.8226222991943359,
"completion_length": 641.9375,
"epoch": 0.06170483460559797,
"grad_norm": 4.10353946685791,
"kl": 0.0,
"learning_rate": 9.829591937590273e-07,
"loss": 0.8226,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 1.288775086402893,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.7420742511749268,
"speech_entropy": 2.501152992248535,
"speech_kl": 0.0,
"step": 194,
"text_entropy": 1.702864646911621,
"text_kl": 0.0,
"total_entropy": 2.362520217895508
},
{
"combined_loss": 0.6262680292129517,
"completion_length": 441.6875,
"epoch": 0.06202290076335878,
"grad_norm": 1.9847943782806396,
"kl": 0.0,
"learning_rate": 9.82764023207092e-07,
"loss": 0.6263,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 1.1531318426132202,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.087559938430786,
"speech_entropy": 2.365201473236084,
"speech_kl": 0.0,
"step": 195,
"text_entropy": 1.2258142232894897,
"text_kl": 0.0,
"total_entropy": 2.138317584991455
},
{
"combined_loss": 0.7117223739624023,
"completion_length": 181.375,
"epoch": 0.062340966921119595,
"grad_norm": 1.908617377281189,
"kl": 0.0,
"learning_rate": 9.825677631722435e-07,
"loss": 0.7117,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.20422415435314178,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.372407913208008,
"speech_entropy": 2.5207109451293945,
"speech_kl": 0.0,
"step": 196,
"text_entropy": 0.8852477073669434,
"text_kl": 0.0,
"total_entropy": 2.2032618522644043
},
{
"combined_loss": 0.6184705495834351,
"completion_length": 488.5625,
"epoch": 0.0626590330788804,
"grad_norm": 1.9325580596923828,
"kl": 0.0,
"learning_rate": 9.823704141485666e-07,
"loss": 0.6185,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 0.0,
"sft_loss": 2.061568260192871,
"speech_entropy": 2.212630033493042,
"speech_kl": 0.0,
"step": 197,
"text_entropy": 1.0164635181427002,
"text_kl": 0.0,
"total_entropy": 1.9818271398544312
},
{
"combined_loss": 0.6890236139297485,
"completion_length": 352.3125,
"epoch": 0.06297709923664122,
"grad_norm": 2.0383756160736084,
"kl": 0.0,
"learning_rate": 9.82171976632887e-07,
"loss": 0.689,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2967453002929688,
"speech_entropy": 2.279099702835083,
"speech_kl": 0.0,
"step": 198,
"text_entropy": 1.1877176761627197,
"text_kl": 0.0,
"total_entropy": 2.078038215637207
},
{
"combined_loss": 0.727849006652832,
"completion_length": 459.625,
"epoch": 0.06329516539440204,
"grad_norm": 4.476938724517822,
"kl": 0.0,
"learning_rate": 9.81972451124771e-07,
"loss": 0.7278,
"num_samples": 1.0,
"reward": 2.5625,
"reward_std": 1.036826252937317,
"rewards/gpt4o_holistic_reward": 2.5625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.4261631965637207,
"speech_entropy": 2.485858201980591,
"speech_kl": 0.0,
"step": 199,
"text_entropy": 0.7858097553253174,
"text_kl": 0.0,
"total_entropy": 2.1179397106170654
},
{
"combined_loss": 0.6614863276481628,
"completion_length": 468.25,
"epoch": 0.06361323155216285,
"grad_norm": 1.9755204916000366,
"kl": 0.0,
"learning_rate": 9.817718381265238e-07,
"loss": 0.6615,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2049543857574463,
"speech_entropy": 2.3594043254852295,
"speech_kl": 0.0,
"step": 200,
"text_entropy": 1.0148909091949463,
"text_kl": 0.0,
"total_entropy": 2.100245475769043
},
{
"combined_loss": 0.6087247133255005,
"completion_length": 624.625,
"epoch": 0.06393129770992366,
"grad_norm": 1.7758878469467163,
"kl": 0.0,
"learning_rate": 9.815701381431885e-07,
"loss": 0.6087,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.68720543384552,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0290822982788086,
"speech_entropy": 2.1871001720428467,
"speech_kl": 0.0,
"step": 201,
"text_entropy": 1.0028703212738037,
"text_kl": 0.0,
"total_entropy": 1.9624230861663818
},
{
"combined_loss": 0.7581361532211304,
"completion_length": 428.5625,
"epoch": 0.06424936386768448,
"grad_norm": 2.9266483783721924,
"kl": 0.0,
"learning_rate": 9.813673516825443e-07,
"loss": 0.7581,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.8376991748809814,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.527120351791382,
"speech_entropy": 2.388430595397949,
"speech_kl": 0.0,
"step": 202,
"text_entropy": 1.3267887830734253,
"text_kl": 0.0,
"total_entropy": 2.1770803928375244
},
{
"combined_loss": 0.679368257522583,
"completion_length": 652.9375,
"epoch": 0.0645674300254453,
"grad_norm": 1.8700424432754517,
"kl": 0.0,
"learning_rate": 9.81163479255106e-07,
"loss": 0.6794,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.7171862125396729,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 0.0,
"sft_loss": 2.2645606994628906,
"speech_entropy": 2.1289772987365723,
"speech_kl": 0.0,
"step": 203,
"text_entropy": 1.0961881875991821,
"text_kl": 0.0,
"total_entropy": 1.903523325920105
},
{
"combined_loss": 0.7537906169891357,
"completion_length": 480.3125,
"epoch": 0.0648854961832061,
"grad_norm": 2.1203622817993164,
"kl": 0.0,
"learning_rate": 9.809585213741224e-07,
"loss": 0.7538,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.7288135886192322,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.5126352310180664,
"speech_entropy": 2.323160171508789,
"speech_kl": 0.0,
"step": 204,
"text_entropy": 1.5344562530517578,
"text_kl": 0.0,
"total_entropy": 2.1817140579223633
},
{
"combined_loss": 0.7854565978050232,
"completion_length": 404.1875,
"epoch": 0.06520356234096693,
"grad_norm": 6.252560138702393,
"kl": 0.0,
"learning_rate": 9.807524785555744e-07,
"loss": 0.7855,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.23945678770542145,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.6181886196136475,
"speech_entropy": 2.286557674407959,
"speech_kl": 0.0,
"step": 205,
"text_entropy": 1.3187462091445923,
"text_kl": 0.0,
"total_entropy": 2.110858917236328
},
{
"combined_loss": 0.6578105092048645,
"completion_length": 464.8125,
"epoch": 0.06552162849872774,
"grad_norm": 1.7753384113311768,
"kl": 0.0,
"learning_rate": 9.805453513181746e-07,
"loss": 0.6578,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.192701816558838,
"speech_entropy": 2.21907639503479,
"speech_kl": 0.0,
"step": 206,
"text_entropy": 1.3251252174377441,
"text_kl": 0.0,
"total_entropy": 2.0617318153381348
},
{
"combined_loss": 0.7920005321502686,
"completion_length": 623.25,
"epoch": 0.06583969465648855,
"grad_norm": 1.7488422393798828,
"kl": 0.0,
"learning_rate": 9.80337140183366e-07,
"loss": 0.792,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.6400017738342285,
"speech_entropy": 2.1968274116516113,
"speech_kl": 0.0,
"step": 207,
"text_entropy": 1.44332754611969,
"text_kl": 0.0,
"total_entropy": 2.0511021614074707
},
{
"combined_loss": 0.6618804931640625,
"completion_length": 418.75,
"epoch": 0.06615776081424936,
"grad_norm": 1.7858362197875977,
"kl": 0.0,
"learning_rate": 9.801278456753193e-07,
"loss": 0.6619,
"num_samples": 1.0,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 4.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.206268310546875,
"speech_entropy": 2.241903781890869,
"speech_kl": 0.0,
"step": 208,
"text_entropy": 0.9072421193122864,
"text_kl": 0.0,
"total_entropy": 1.9820117950439453
},
{
"combined_loss": 0.6754240989685059,
"completion_length": 353.1875,
"epoch": 0.06647582697201018,
"grad_norm": 2.1248185634613037,
"kl": 0.0,
"learning_rate": 9.799174683209336e-07,
"loss": 0.6754,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 1.2371759414672852,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.251413583755493,
"speech_entropy": 2.267850875854492,
"speech_kl": 0.0,
"step": 209,
"text_entropy": 1.2034542560577393,
"text_kl": 0.0,
"total_entropy": 2.069960594177246
},
{
"combined_loss": 0.6498122215270996,
"completion_length": 532.8125,
"epoch": 0.06679389312977099,
"grad_norm": 1.8640666007995605,
"kl": 0.0,
"learning_rate": 9.797060086498332e-07,
"loss": 0.6498,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 1.0983422994613647,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 0.0,
"sft_loss": 2.1660404205322266,
"speech_entropy": 2.172290563583374,
"speech_kl": 0.0,
"step": 210,
"text_entropy": 1.1589062213897705,
"text_kl": 0.0,
"total_entropy": 1.9814873933792114
},
{
"combined_loss": 0.6711292862892151,
"completion_length": 657.9375,
"epoch": 0.0671119592875318,
"grad_norm": 1.5342586040496826,
"kl": 0.0,
"learning_rate": 9.79493467194368e-07,
"loss": 0.6711,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2370975017547607,
"speech_entropy": 2.2588064670562744,
"speech_kl": 0.0,
"step": 211,
"text_entropy": 0.9491192102432251,
"text_kl": 0.0,
"total_entropy": 2.00757098197937
},
{
"combined_loss": 0.7253336906433105,
"completion_length": 462.125,
"epoch": 0.06743002544529263,
"grad_norm": 1.837734341621399,
"kl": 0.0,
"learning_rate": 9.792798444896107e-07,
"loss": 0.7253,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.6831126809120178,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.417778730392456,
"speech_entropy": 2.2486274242401123,
"speech_kl": 0.0,
"step": 212,
"text_entropy": 1.3472001552581787,
"text_kl": 0.0,
"total_entropy": 2.0750081539154053
},
{
"combined_loss": 0.7301596999168396,
"completion_length": 306.8125,
"epoch": 0.06774809160305344,
"grad_norm": 2.5512921810150146,
"kl": 0.0,
"learning_rate": 9.790651410733562e-07,
"loss": 0.7302,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": 0.0,
"sft_loss": 2.433865547180176,
"speech_entropy": 2.319303512573242,
"speech_kl": 0.0,
"step": 213,
"text_entropy": 0.9959409832954407,
"text_kl": 0.0,
"total_entropy": 2.0706756114959717
},
{
"combined_loss": 0.7285110950469971,
"completion_length": 426.6875,
"epoch": 0.06806615776081425,
"grad_norm": 2.5493366718292236,
"kl": 0.0,
"learning_rate": 9.788493574861199e-07,
"loss": 0.7285,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.0774502754211426,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.428370237350464,
"speech_entropy": 2.256551742553711,
"speech_kl": 0.0,
"step": 214,
"text_entropy": 0.9938225150108337,
"text_kl": 0.0,
"total_entropy": 2.015477180480957
},
{
"combined_loss": 0.7399945855140686,
"completion_length": 371.375,
"epoch": 0.06838422391857506,
"grad_norm": 1.7615910768508911,
"kl": 0.0,
"learning_rate": 9.786324942711371e-07,
"loss": 0.74,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.466648578643799,
"speech_entropy": 2.2756552696228027,
"speech_kl": 0.0,
"step": 215,
"text_entropy": 1.2351531982421875,
"text_kl": 0.0,
"total_entropy": 2.0899648666381836
},
{
"combined_loss": 0.6048213243484497,
"completion_length": 541.9375,
"epoch": 0.06870229007633588,
"grad_norm": 1.555175542831421,
"kl": 0.0,
"learning_rate": 9.784145519743606e-07,
"loss": 0.6048,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.016071081161499,
"speech_entropy": 2.1028213500976562,
"speech_kl": 0.0,
"step": 216,
"text_entropy": 0.6853211522102356,
"text_kl": 0.0,
"total_entropy": 1.8233641386032104
},
{
"combined_loss": 0.6289892196655273,
"completion_length": 317.1875,
"epoch": 0.06902035623409669,
"grad_norm": 1.876192569732666,
"kl": 0.0,
"learning_rate": 9.781955311444596e-07,
"loss": 0.629,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.096630573272705,
"speech_entropy": 2.2578630447387695,
"speech_kl": 0.0,
"step": 217,
"text_entropy": 1.1112196445465088,
"text_kl": 0.0,
"total_entropy": 2.032492160797119
},
{
"combined_loss": 0.6842765808105469,
"completion_length": 477.25,
"epoch": 0.0693384223918575,
"grad_norm": 2.3141274452209473,
"kl": 0.0,
"learning_rate": 9.779754323328192e-07,
"loss": 0.6843,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.269437551498413,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 0.0,
"sft_loss": 2.2809219360351562,
"speech_entropy": 2.205554485321045,
"speech_kl": 0.0,
"step": 218,
"text_entropy": 1.3427551984786987,
"text_kl": 0.0,
"total_entropy": 2.0352625846862793
},
{
"combined_loss": 0.6563563346862793,
"completion_length": 386.4375,
"epoch": 0.06965648854961833,
"grad_norm": 1.7859476804733276,
"kl": 0.0,
"learning_rate": 9.777542560935373e-07,
"loss": 0.6564,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.5983423590660095,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.187854290008545,
"speech_entropy": 2.2751975059509277,
"speech_kl": 0.0,
"step": 219,
"text_entropy": 0.9894624352455139,
"text_kl": 0.0,
"total_entropy": 2.025278091430664
},
{
"combined_loss": 0.6714828610420227,
"completion_length": 397.6875,
"epoch": 0.06997455470737914,
"grad_norm": 2.1150624752044678,
"kl": 0.0,
"learning_rate": 9.775320029834254e-07,
"loss": 0.6715,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 0.9565354585647583,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.238276243209839,
"speech_entropy": 2.2403571605682373,
"speech_kl": 0.0,
"step": 220,
"text_entropy": 1.4980268478393555,
"text_kl": 0.0,
"total_entropy": 2.0874156951904297
},
{
"combined_loss": 0.7247896790504456,
"completion_length": 497.375,
"epoch": 0.07029262086513995,
"grad_norm": 1.8192265033721924,
"kl": 0.0,
"learning_rate": 9.773086735620053e-07,
"loss": 0.7248,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.4435809552669525,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 0.0,
"sft_loss": 2.4159655570983887,
"speech_entropy": 2.3147919178009033,
"speech_kl": 0.0,
"step": 221,
"text_entropy": 1.4983744621276855,
"text_kl": 0.0,
"total_entropy": 2.1691160202026367
},
{
"combined_loss": 0.616317868232727,
"completion_length": 594.25,
"epoch": 0.07061068702290077,
"grad_norm": 1.6542634963989258,
"kl": 0.0,
"learning_rate": 9.770842683915082e-07,
"loss": 0.6163,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.47356173396110535,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0543928146362305,
"speech_entropy": 2.261496067047119,
"speech_kl": 0.0,
"step": 222,
"text_entropy": 0.8812452554702759,
"text_kl": 0.0,
"total_entropy": 1.996401309967041
},
{
"combined_loss": 0.8017942905426025,
"completion_length": 413.75,
"epoch": 0.07092875318066158,
"grad_norm": 2.075211763381958,
"kl": 0.0,
"learning_rate": 9.768587880368742e-07,
"loss": 0.8018,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 0.0,
"sft_loss": 2.672647714614868,
"speech_entropy": 2.2792246341705322,
"speech_kl": 0.0,
"step": 223,
"text_entropy": 1.4464610815048218,
"text_kl": 0.0,
"total_entropy": 2.1342384815216064
},
{
"combined_loss": 0.733474612236023,
"completion_length": 342.875,
"epoch": 0.07124681933842239,
"grad_norm": 1.4014809131622314,
"kl": 0.0,
"learning_rate": 9.766322330657497e-07,
"loss": 0.7335,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": 0.0,
"sft_loss": 2.444915294647217,
"speech_entropy": 2.490835428237915,
"speech_kl": 0.0,
"step": 224,
"text_entropy": 1.531280755996704,
"text_kl": 0.0,
"total_entropy": 2.3087196350097656
},
{
"combined_loss": 0.7604819536209106,
"completion_length": 377.125,
"epoch": 0.0715648854961832,
"grad_norm": 2.309183120727539,
"kl": 0.0,
"learning_rate": 9.764046040484868e-07,
"loss": 0.7605,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 0.0,
"sft_loss": 2.534939765930176,
"speech_entropy": 2.4606456756591797,
"speech_kl": 0.0,
"step": 225,
"text_entropy": 0.7890236377716064,
"text_kl": 0.0,
"total_entropy": 2.0891623497009277
},
{
"combined_loss": 0.6990571022033691,
"completion_length": 528.3125,
"epoch": 0.07188295165394402,
"grad_norm": 1.9987963438034058,
"kl": 0.0,
"learning_rate": 9.76175901558141e-07,
"loss": 0.6991,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.6770563125610352,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 0.0,
"sft_loss": 2.3301901817321777,
"speech_entropy": 2.320341110229492,
"speech_kl": 0.0,
"step": 226,
"text_entropy": 1.3003857135772705,
"text_kl": 0.0,
"total_entropy": 2.1371474266052246
},
{
"combined_loss": 0.7233133316040039,
"completion_length": 355.9375,
"epoch": 0.07220101781170483,
"grad_norm": 1.6554374694824219,
"kl": 0.0,
"learning_rate": 9.759461261704705e-07,
"loss": 0.7233,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.4110443592071533,
"speech_entropy": 2.268995761871338,
"speech_kl": 0.0,
"step": 227,
"text_entropy": 1.413590431213379,
"text_kl": 0.0,
"total_entropy": 2.0987539291381836
},
{
"combined_loss": 0.7177764773368835,
"completion_length": 507.4375,
"epoch": 0.07251908396946564,
"grad_norm": 1.8407859802246094,
"kl": 0.0,
"learning_rate": 9.757152784639347e-07,
"loss": 0.7178,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.7622368335723877,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3925881385803223,
"speech_entropy": 2.314845561981201,
"speech_kl": 0.0,
"step": 228,
"text_entropy": 1.0042717456817627,
"text_kl": 0.0,
"total_entropy": 2.067537784576416
},
{
"combined_loss": 0.7055172324180603,
"completion_length": 615.5625,
"epoch": 0.07283715012722647,
"grad_norm": 1.6229463815689087,
"kl": 0.0,
"learning_rate": 9.754833590196926e-07,
"loss": 0.7055,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3517239093780518,
"speech_entropy": 2.1739797592163086,
"speech_kl": 0.0,
"step": 229,
"text_entropy": 1.4586985111236572,
"text_kl": 0.0,
"total_entropy": 2.04360294342041
},
{
"combined_loss": 0.7200095057487488,
"completion_length": 461.0,
"epoch": 0.07315521628498728,
"grad_norm": 2.110440254211426,
"kl": 0.0,
"learning_rate": 9.752503684216007e-07,
"loss": 0.72,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 0.0,
"sft_loss": 2.400031566619873,
"speech_entropy": 2.22104811668396,
"speech_kl": 0.0,
"step": 230,
"text_entropy": 1.356811761856079,
"text_kl": 0.0,
"total_entropy": 2.0665719509124756
},
{
"combined_loss": 0.6246628761291504,
"completion_length": 591.875,
"epoch": 0.07347328244274809,
"grad_norm": 2.518479585647583,
"kl": 0.0,
"learning_rate": 9.75016307256213e-07,
"loss": 0.6247,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.9550646543502808,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.082209587097168,
"speech_entropy": 2.3805904388427734,
"speech_kl": 0.0,
"step": 231,
"text_entropy": 0.864353597164154,
"text_kl": 0.0,
"total_entropy": 2.0375137329101562
},
{
"combined_loss": 0.6500042676925659,
"completion_length": 494.125,
"epoch": 0.0737913486005089,
"grad_norm": 1.7023345232009888,
"kl": 0.0,
"learning_rate": 9.74781176112778e-07,
"loss": 0.65,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.7674887180328369,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1666808128356934,
"speech_entropy": 2.172966957092285,
"speech_kl": 0.0,
"step": 232,
"text_entropy": 1.2742629051208496,
"text_kl": 0.0,
"total_entropy": 2.008732318878174
},
{
"combined_loss": 0.6174441576004028,
"completion_length": 380.1875,
"epoch": 0.07410941475826972,
"grad_norm": 1.9199665784835815,
"kl": 0.0,
"learning_rate": 9.74544975583238e-07,
"loss": 0.6174,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 1.6082265377044678,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.0581471920013428,
"speech_entropy": 2.2164857387542725,
"speech_kl": 0.0,
"step": 233,
"text_entropy": 0.900662899017334,
"text_kl": 0.0,
"total_entropy": 1.9652466773986816
},
{
"combined_loss": 0.7314097285270691,
"completion_length": 307.0625,
"epoch": 0.07442748091603053,
"grad_norm": 2.08520245552063,
"kl": 0.0,
"learning_rate": 9.743077062622278e-07,
"loss": 0.7314,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.8014019727706909,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.438032388687134,
"speech_entropy": 2.3524227142333984,
"speech_kl": 0.0,
"step": 234,
"text_entropy": 0.8163133859634399,
"text_kl": 0.0,
"total_entropy": 2.0464541912078857
},
{
"combined_loss": 0.6391720175743103,
"completion_length": 435.5625,
"epoch": 0.07474554707379134,
"grad_norm": 1.9668989181518555,
"kl": 0.0,
"learning_rate": 9.740693687470722e-07,
"loss": 0.6392,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.7694376111030579,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.130573272705078,
"speech_entropy": 2.305225372314453,
"speech_kl": 0.0,
"step": 235,
"text_entropy": 0.7647875547409058,
"text_kl": 0.0,
"total_entropy": 2.0108447074890137
},
{
"combined_loss": 0.630230724811554,
"completion_length": 401.0625,
"epoch": 0.07506361323155217,
"grad_norm": 2.1346487998962402,
"kl": 0.0,
"learning_rate": 9.738299636377862e-07,
"loss": 0.6302,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.8859703540802002,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 9.313225746154785e-09,
"sft_loss": 2.10076904296875,
"speech_entropy": 2.27579402923584,
"speech_kl": 0.0,
"step": 236,
"text_entropy": 0.8570226430892944,
"text_kl": 0.0,
"total_entropy": 2.0057263374328613
},
{
"combined_loss": 0.6507729291915894,
"completion_length": 424.8125,
"epoch": 0.07538167938931298,
"grad_norm": 1.7010408639907837,
"kl": 0.0,
"learning_rate": 9.735894915370712e-07,
"loss": 0.6508,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.989456832408905,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.169243097305298,
"speech_entropy": 2.329667329788208,
"speech_kl": 0.0,
"step": 237,
"text_entropy": 1.0157864093780518,
"text_kl": 0.0,
"total_entropy": 2.0907399654388428
},
{
"combined_loss": 0.6825940608978271,
"completion_length": 556.8125,
"epoch": 0.07569974554707379,
"grad_norm": 2.043261766433716,
"kl": 0.0,
"learning_rate": 9.73347953050316e-07,
"loss": 0.6826,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.7705972194671631,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.275313377380371,
"speech_entropy": 2.336167812347412,
"speech_kl": 0.0,
"step": 238,
"text_entropy": 1.4063137769699097,
"text_kl": 0.0,
"total_entropy": 2.168437957763672
},
{
"combined_loss": 0.6205179691314697,
"completion_length": 378.75,
"epoch": 0.07601781170483461,
"grad_norm": 2.229231595993042,
"kl": 0.0,
"learning_rate": 9.731053487855932e-07,
"loss": 0.6205,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.8837943077087402,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.0683932304382324,
"speech_entropy": 2.4868783950805664,
"speech_kl": 0.0,
"step": 239,
"text_entropy": 1.320270299911499,
"text_kl": 0.0,
"total_entropy": 2.2788162231445312
},
{
"combined_loss": 0.7069031596183777,
"completion_length": 469.5625,
"epoch": 0.07633587786259542,
"grad_norm": 2.225677728652954,
"kl": 0.0,
"learning_rate": 9.728616793536587e-07,
"loss": 0.7069,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 1.3122053146362305,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.3563437461853027,
"speech_entropy": 2.316706657409668,
"speech_kl": 0.0,
"step": 240,
"text_entropy": 1.6138790845870972,
"text_kl": 0.0,
"total_entropy": 2.198500156402588
},
{
"combined_loss": 0.6990001797676086,
"completion_length": 413.25,
"epoch": 0.07665394402035623,
"grad_norm": 1.7011535167694092,
"kl": 0.0,
"learning_rate": 9.726169453679502e-07,
"loss": 0.699,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.6764019727706909,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 2.60770320892334e-08,
"sft_loss": 2.330000400543213,
"speech_entropy": 2.288140296936035,
"speech_kl": 0.0,
"step": 241,
"text_entropy": 1.567537546157837,
"text_kl": 0.0,
"total_entropy": 2.162627696990967
},
{
"combined_loss": 0.7738334536552429,
"completion_length": 296.8125,
"epoch": 0.07697201017811704,
"grad_norm": 2.249540328979492,
"kl": 0.0,
"learning_rate": 9.72371147444585e-07,
"loss": 0.7738,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 1.8020561933517456,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.579444646835327,
"speech_entropy": 2.2558746337890625,
"speech_kl": 0.0,
"step": 242,
"text_entropy": 1.140458583831787,
"text_kl": 0.0,
"total_entropy": 2.0467886924743652
},
{
"combined_loss": 0.6218153834342957,
"completion_length": 417.1875,
"epoch": 0.07729007633587787,
"grad_norm": 1.888468861579895,
"kl": 0.0,
"learning_rate": 9.721242862023591e-07,
"loss": 0.6218,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": 0.0,
"sft_loss": 2.0727176666259766,
"speech_entropy": 2.2084412574768066,
"speech_kl": 0.0,
"step": 243,
"text_entropy": 0.5448473691940308,
"text_kl": 0.0,
"total_entropy": 1.8867276906967163
},
{
"combined_loss": 0.6377319097518921,
"completion_length": 424.375,
"epoch": 0.07760814249363868,
"grad_norm": 1.7285096645355225,
"kl": 0.0,
"learning_rate": 9.718763622627458e-07,
"loss": 0.6377,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.7286534309387207,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1257729530334473,
"speech_entropy": 2.167391538619995,
"speech_kl": 0.0,
"step": 244,
"text_entropy": 1.0646146535873413,
"text_kl": 0.0,
"total_entropy": 1.956799030303955
},
{
"combined_loss": 0.6482560634613037,
"completion_length": 407.4375,
"epoch": 0.07792620865139949,
"grad_norm": 1.660476803779602,
"kl": 0.0,
"learning_rate": 9.716273762498929e-07,
"loss": 0.6483,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.42705631256103516,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.160853385925293,
"speech_entropy": 2.198376417160034,
"speech_kl": 0.0,
"step": 245,
"text_entropy": 1.3145016431808472,
"text_kl": 0.0,
"total_entropy": 2.031963348388672
},
{
"combined_loss": 0.6421551704406738,
"completion_length": 586.875,
"epoch": 0.07824427480916031,
"grad_norm": 1.4380934238433838,
"kl": 0.0,
"learning_rate": 9.71377328790622e-07,
"loss": 0.6422,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.5774502754211426,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 2.140516996383667,
"speech_entropy": 2.2290802001953125,
"speech_kl": 0.0,
"step": 246,
"text_entropy": 0.9156054258346558,
"text_kl": 0.0,
"total_entropy": 1.9700312614440918
},
{
"combined_loss": 0.7020258903503418,
"completion_length": 419.125,
"epoch": 0.07856234096692112,
"grad_norm": 3.0993080139160156,
"kl": 0.0,
"learning_rate": 9.711262205144285e-07,
"loss": 0.702,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.340085983276367,
"speech_entropy": 2.3030998706817627,
"speech_kl": 0.0,
"step": 247,
"text_entropy": 1.1675899028778076,
"text_kl": 0.0,
"total_entropy": 2.082179307937622
},
{
"combined_loss": 0.686911940574646,
"completion_length": 387.25,
"epoch": 0.07888040712468193,
"grad_norm": 1.6759203672409058,
"kl": 0.0,
"learning_rate": 9.70874052053476e-07,
"loss": 0.6869,
"num_samples": 1.0,
"reward": 4.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.9375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2897064685821533,
"speech_entropy": 2.1984457969665527,
"speech_kl": 0.0,
"step": 248,
"text_entropy": 1.1239951848983765,
"text_kl": 0.0,
"total_entropy": 1.9890978336334229
},
{
"combined_loss": 0.6796283721923828,
"completion_length": 495.375,
"epoch": 0.07919847328244274,
"grad_norm": 1.5949386358261108,
"kl": 0.0,
"learning_rate": 9.706208240425988e-07,
"loss": 0.6796,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.265427589416504,
"speech_entropy": 2.263617992401123,
"speech_kl": 0.0,
"step": 249,
"text_entropy": 1.3017940521240234,
"text_kl": 0.0,
"total_entropy": 2.0769906044006348
},
{
"combined_loss": 0.6568690538406372,
"completion_length": 432.3125,
"epoch": 0.07951653944020357,
"grad_norm": 1.5095460414886475,
"kl": 0.0,
"learning_rate": 9.70366537119298e-07,
"loss": 0.6569,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.3536534011363983,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.189563274383545,
"speech_entropy": 2.2465767860412598,
"speech_kl": 0.0,
"step": 250,
"text_entropy": 0.8271604776382446,
"text_kl": 0.0,
"total_entropy": 1.9808815717697144
},
{
"combined_loss": 0.691437840461731,
"completion_length": 455.625,
"epoch": 0.07983460559796438,
"grad_norm": 2.107602119445801,
"kl": 0.0,
"learning_rate": 9.701111919237408e-07,
"loss": 0.6914,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.228813648223877,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.304792881011963,
"speech_entropy": 2.464122772216797,
"speech_kl": 0.0,
"step": 251,
"text_entropy": 1.3184483051300049,
"text_kl": 0.0,
"total_entropy": 2.2695252895355225
},
{
"combined_loss": 0.6778163909912109,
"completion_length": 400.5625,
"epoch": 0.08015267175572519,
"grad_norm": 1.9228991270065308,
"kl": 0.0,
"learning_rate": 9.698547890987584e-07,
"loss": 0.6778,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.259387969970703,
"speech_entropy": 2.1710927486419678,
"speech_kl": 0.0,
"step": 252,
"text_entropy": 1.0632684230804443,
"text_kl": 0.0,
"total_entropy": 1.970045566558838
},
{
"combined_loss": 0.7383031845092773,
"completion_length": 435.9375,
"epoch": 0.08047073791348601,
"grad_norm": 1.9669862985610962,
"kl": 0.0,
"learning_rate": 9.695973292898442e-07,
"loss": 0.7383,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 1.0876991748809814,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.461010456085205,
"speech_entropy": 2.2386293411254883,
"speech_kl": 0.0,
"step": 253,
"text_entropy": 1.4104411602020264,
"text_kl": 0.0,
"total_entropy": 2.0837998390197754
},
{
"combined_loss": 0.6110467910766602,
"completion_length": 388.125,
"epoch": 0.08078880407124682,
"grad_norm": 1.7974542379379272,
"kl": 0.0,
"learning_rate": 9.693388131451536e-07,
"loss": 0.611,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 1.3661253452301025,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.036822557449341,
"speech_entropy": 2.301140785217285,
"speech_kl": 0.0,
"step": 254,
"text_entropy": 0.9246535897254944,
"text_kl": 0.0,
"total_entropy": 2.01608943939209
},
{
"combined_loss": 0.6638992428779602,
"completion_length": 567.8125,
"epoch": 0.08110687022900763,
"grad_norm": 1.5331536531448364,
"kl": 0.0,
"learning_rate": 9.690792413155002e-07,
"loss": 0.6639,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.5646764636039734,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 0.0,
"sft_loss": 2.2129974365234375,
"speech_entropy": 2.180696964263916,
"speech_kl": 0.0,
"step": 255,
"text_entropy": 1.1481618881225586,
"text_kl": 0.0,
"total_entropy": 1.9820756912231445
},
{
"combined_loss": 0.6851294040679932,
"completion_length": 547.0,
"epoch": 0.08142493638676845,
"grad_norm": 1.8251949548721313,
"kl": 0.0,
"learning_rate": 9.688186144543558e-07,
"loss": 0.6851,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8081126809120178,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.2837648391723633,
"speech_entropy": 2.2099359035491943,
"speech_kl": 0.0,
"step": 256,
"text_entropy": 1.4614191055297852,
"text_kl": 0.0,
"total_entropy": 2.0672175884246826
},
{
"combined_loss": 0.7618996500968933,
"completion_length": 372.0625,
"epoch": 0.08174300254452926,
"grad_norm": 2.1775362491607666,
"kl": 0.0,
"learning_rate": 9.685569332178487e-07,
"loss": 0.7619,
"num_samples": 1.0,
"reward": 2.8125,
"reward_std": 0.5194376111030579,
"rewards/gpt4o_holistic_reward": 2.8125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.539665460586548,
"speech_entropy": 2.2694239616394043,
"speech_kl": 0.0,
"step": 257,
"text_entropy": 0.980126142501831,
"text_kl": 0.0,
"total_entropy": 2.0079169273376465
},
{
"combined_loss": 0.6269736289978027,
"completion_length": 546.5,
"epoch": 0.08206106870229007,
"grad_norm": 1.405476689338684,
"kl": 0.0,
"learning_rate": 9.682941982647605e-07,
"loss": 0.627,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.089912176132202,
"speech_entropy": 2.1415200233459473,
"speech_kl": 0.0,
"step": 258,
"text_entropy": 1.16708242893219,
"text_kl": 0.0,
"total_entropy": 1.9402116537094116
},
{
"combined_loss": 0.6667758226394653,
"completion_length": 376.0625,
"epoch": 0.08237913486005088,
"grad_norm": 1.9283385276794434,
"kl": 0.0,
"learning_rate": 9.680304102565265e-07,
"loss": 0.6668,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.222586154937744,
"speech_entropy": 2.2938990592956543,
"speech_kl": 0.0,
"step": 259,
"text_entropy": 0.9376010894775391,
"text_kl": 0.0,
"total_entropy": 2.0546226501464844
},
{
"combined_loss": 0.6579493880271912,
"completion_length": 224.5,
"epoch": 0.08269720101781171,
"grad_norm": 2.025418519973755,
"kl": 0.0,
"learning_rate": 9.677655698572325e-07,
"loss": 0.6579,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.3228486180305481,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.193164587020874,
"speech_entropy": 2.3803346157073975,
"speech_kl": 0.0,
"step": 260,
"text_entropy": 1.0623462200164795,
"text_kl": 0.0,
"total_entropy": 2.11057186126709
},
{
"combined_loss": 0.7478048205375671,
"completion_length": 362.0625,
"epoch": 0.08301526717557252,
"grad_norm": 3.3174071311950684,
"kl": 0.0,
"learning_rate": 9.674996777336142e-07,
"loss": 0.7478,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.3146764636039734,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 0.0,
"sft_loss": 2.492682456970215,
"speech_entropy": 2.2766928672790527,
"speech_kl": 0.0,
"step": 261,
"text_entropy": 1.202368140220642,
"text_kl": 0.0,
"total_entropy": 2.069495677947998
},
{
"combined_loss": 0.6093226075172424,
"completion_length": 404.625,
"epoch": 0.08333333333333333,
"grad_norm": 2.024925470352173,
"kl": 0.0,
"learning_rate": 9.672327345550543e-07,
"loss": 0.6093,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.1161253452301025,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.0310750007629395,
"speech_entropy": 2.2833635807037354,
"speech_kl": 0.0,
"step": 262,
"text_entropy": 0.996048629283905,
"text_kl": 0.0,
"total_entropy": 2.037139654159546
},
{
"combined_loss": 0.6289666891098022,
"completion_length": 487.0625,
"epoch": 0.08365139949109415,
"grad_norm": 2.06803297996521,
"kl": 0.0,
"learning_rate": 9.669647409935822e-07,
"loss": 0.629,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 1.2440414428710938,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.096555709838867,
"speech_entropy": 2.283952236175537,
"speech_kl": 0.0,
"step": 263,
"text_entropy": 1.3200794458389282,
"text_kl": 0.0,
"total_entropy": 2.095597267150879
},
{
"combined_loss": 0.6302919387817383,
"completion_length": 503.0625,
"epoch": 0.08396946564885496,
"grad_norm": 1.8565832376480103,
"kl": 0.0,
"learning_rate": 9.666956977238711e-07,
"loss": 0.6303,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.6144567728042603,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 0.0,
"sft_loss": 2.100973129272461,
"speech_entropy": 2.1800198554992676,
"speech_kl": 0.0,
"step": 264,
"text_entropy": 0.9526975154876709,
"text_kl": 0.0,
"total_entropy": 1.956667184829712
},
{
"combined_loss": 0.7350342273712158,
"completion_length": 456.9375,
"epoch": 0.08428753180661577,
"grad_norm": 1.7183167934417725,
"kl": 0.0,
"learning_rate": 9.664256054232374e-07,
"loss": 0.735,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4501140117645264,
"speech_entropy": 2.2488903999328613,
"speech_kl": 0.0,
"step": 265,
"text_entropy": 1.4711058139801025,
"text_kl": 0.0,
"total_entropy": 2.073747396469116
},
{
"combined_loss": 0.6489673256874084,
"completion_length": 426.8125,
"epoch": 0.0846055979643766,
"grad_norm": 1.6044869422912598,
"kl": 0.0,
"learning_rate": 9.66154464771638e-07,
"loss": 0.649,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 0.0,
"sft_loss": 2.163224220275879,
"speech_entropy": 2.525622606277466,
"speech_kl": 0.0,
"step": 266,
"text_entropy": 1.200254201889038,
"text_kl": 0.0,
"total_entropy": 2.274986505508423
},
{
"combined_loss": 0.7551906108856201,
"completion_length": 445.6875,
"epoch": 0.08492366412213741,
"grad_norm": 1.658619999885559,
"kl": 0.0,
"learning_rate": 9.658822764516693e-07,
"loss": 0.7552,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.8081126809120178,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.5173017978668213,
"speech_entropy": 2.1905436515808105,
"speech_kl": 0.0,
"step": 267,
"text_entropy": 1.1561870574951172,
"text_kl": 0.0,
"total_entropy": 1.9924242496490479
},
{
"combined_loss": 0.6897430419921875,
"completion_length": 678.3125,
"epoch": 0.08524173027989822,
"grad_norm": 1.7225435972213745,
"kl": 0.0,
"learning_rate": 9.65609041148565e-07,
"loss": 0.6897,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.9435809850692749,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2991433143615723,
"speech_entropy": 2.1389427185058594,
"speech_kl": 0.0,
"step": 268,
"text_entropy": 1.3221979141235352,
"text_kl": 0.0,
"total_entropy": 1.9796137809753418
},
{
"combined_loss": 0.5930161476135254,
"completion_length": 548.0,
"epoch": 0.08555979643765903,
"grad_norm": 1.4510716199874878,
"kl": 0.0,
"learning_rate": 9.653347595501946e-07,
"loss": 0.593,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 0.0,
"sft_loss": 1.9767203330993652,
"speech_entropy": 2.1469504833221436,
"speech_kl": 0.0,
"step": 269,
"text_entropy": 0.8947275876998901,
"text_kl": 0.0,
"total_entropy": 1.9000020027160645
},
{
"combined_loss": 0.652092456817627,
"completion_length": 484.625,
"epoch": 0.08587786259541985,
"grad_norm": 1.9512776136398315,
"kl": 0.0,
"learning_rate": 9.650594323470617e-07,
"loss": 0.6521,
"num_samples": 1.0,
"reward": 2.8125,
"reward_std": 0.8808612823486328,
"rewards/gpt4o_holistic_reward": 2.8125,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.1736412048339844,
"speech_entropy": 2.1839957237243652,
"speech_kl": 0.0,
"step": 270,
"text_entropy": 1.3368381261825562,
"text_kl": 0.0,
"total_entropy": 2.01955246925354
},
{
"combined_loss": 0.6888371706008911,
"completion_length": 471.3125,
"epoch": 0.08619592875318066,
"grad_norm": 1.8091537952423096,
"kl": 0.0,
"learning_rate": 9.64783060232302e-07,
"loss": 0.6888,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.183112621307373,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.296123743057251,
"speech_entropy": 2.1703720092773438,
"speech_kl": 0.0,
"step": 271,
"text_entropy": 1.033645749092102,
"text_kl": 0.0,
"total_entropy": 1.927350401878357
},
{
"combined_loss": 0.6634478569030762,
"completion_length": 618.1875,
"epoch": 0.08651399491094147,
"grad_norm": 1.6202704906463623,
"kl": 0.0,
"learning_rate": 9.645056439016825e-07,
"loss": 0.6634,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 1.5379188060760498,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2114930152893066,
"speech_entropy": 2.063384532928467,
"speech_kl": 0.0,
"step": 272,
"text_entropy": 0.6551350951194763,
"text_kl": 0.0,
"total_entropy": 1.7709801197052002
},
{
"combined_loss": 0.6863433718681335,
"completion_length": 390.0625,
"epoch": 0.0868320610687023,
"grad_norm": 1.5252995491027832,
"kl": 0.0,
"learning_rate": 9.64227184053598e-07,
"loss": 0.6863,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.4788135886192322,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 0.0,
"sft_loss": 2.287811040878296,
"speech_entropy": 2.296483039855957,
"speech_kl": 0.0,
"step": 273,
"text_entropy": 1.0703375339508057,
"text_kl": 0.0,
"total_entropy": 2.0609984397888184
},
{
"combined_loss": 0.7279566526412964,
"completion_length": 336.625,
"epoch": 0.0871501272264631,
"grad_norm": 2.0608808994293213,
"kl": 0.0,
"learning_rate": 9.639476813890713e-07,
"loss": 0.728,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 0.0,
"sft_loss": 2.4265217781066895,
"speech_entropy": 2.2220163345336914,
"speech_kl": 0.0,
"step": 274,
"text_entropy": 1.446899175643921,
"text_kl": 0.0,
"total_entropy": 2.081493377685547
},
{
"combined_loss": 0.7374498248100281,
"completion_length": 533.1875,
"epoch": 0.08746819338422392,
"grad_norm": 1.9891971349716187,
"kl": 0.0,
"learning_rate": 9.636671366117494e-07,
"loss": 0.7374,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.4581661224365234,
"speech_entropy": 2.133476495742798,
"speech_kl": 0.0,
"step": 275,
"text_entropy": 1.2225779294967651,
"text_kl": 0.0,
"total_entropy": 1.9578973054885864
},
{
"combined_loss": 0.6486621499061584,
"completion_length": 366.875,
"epoch": 0.08778625954198473,
"grad_norm": 3.3881635665893555,
"kl": 0.0,
"learning_rate": 9.63385550427904e-07,
"loss": 0.6487,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.9331126809120178,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1622071266174316,
"speech_entropy": 2.344599723815918,
"speech_kl": 0.0,
"step": 276,
"text_entropy": 1.1587448120117188,
"text_kl": 0.0,
"total_entropy": 2.122767448425293
},
{
"combined_loss": 0.6684514284133911,
"completion_length": 424.9375,
"epoch": 0.08810432569974555,
"grad_norm": 1.9074612855911255,
"kl": 0.0,
"learning_rate": 9.631029235464278e-07,
"loss": 0.6685,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.5774502158164978,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2281713485717773,
"speech_entropy": 2.4562625885009766,
"speech_kl": 0.0,
"step": 277,
"text_entropy": 1.2592930793762207,
"text_kl": 0.0,
"total_entropy": 2.2500598430633545
},
{
"combined_loss": 0.671592116355896,
"completion_length": 438.0,
"epoch": 0.08842239185750636,
"grad_norm": 1.861733078956604,
"kl": 0.0,
"learning_rate": 9.628192566788335e-07,
"loss": 0.6716,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.6404881477355957,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.238640308380127,
"speech_entropy": 2.177516222000122,
"speech_kl": 0.0,
"step": 278,
"text_entropy": 1.346944808959961,
"text_kl": 0.0,
"total_entropy": 2.020207643508911
},
{
"combined_loss": 0.638625979423523,
"completion_length": 528.4375,
"epoch": 0.08874045801526717,
"grad_norm": 1.7982735633850098,
"kl": 0.0,
"learning_rate": 9.625345505392522e-07,
"loss": 0.6386,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.0474694967269897,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.128753185272217,
"speech_entropy": 2.180100679397583,
"speech_kl": 0.0,
"step": 279,
"text_entropy": 0.8083711862564087,
"text_kl": 0.0,
"total_entropy": 1.9105725288391113
},
{
"combined_loss": 0.7118488550186157,
"completion_length": 514.25,
"epoch": 0.089058524173028,
"grad_norm": 1.5744655132293701,
"kl": 0.0,
"learning_rate": 9.622488058444313e-07,
"loss": 0.7118,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.8228486180305481,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 5.587935447692871e-09,
"sft_loss": 2.3728294372558594,
"speech_entropy": 2.1545000076293945,
"speech_kl": 0.0,
"step": 280,
"text_entropy": 1.114195466041565,
"text_kl": 0.0,
"total_entropy": 1.9586902856826782
},
{
"combined_loss": 0.6681440472602844,
"completion_length": 377.4375,
"epoch": 0.0893765903307888,
"grad_norm": 1.993377447128296,
"kl": 0.0,
"learning_rate": 9.619620233137326e-07,
"loss": 0.6681,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.6724694967269897,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.227146863937378,
"speech_entropy": 2.207726240158081,
"speech_kl": 0.0,
"step": 281,
"text_entropy": 1.0726655721664429,
"text_kl": 0.0,
"total_entropy": 1.990494728088379
},
{
"combined_loss": 0.6539067029953003,
"completion_length": 420.5625,
"epoch": 0.08969465648854962,
"grad_norm": 1.9349361658096313,
"kl": 0.0,
"learning_rate": 9.61674203669131e-07,
"loss": 0.6539,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 1.250100016593933,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.1796889305114746,
"speech_entropy": 2.3027138710021973,
"speech_kl": 0.0,
"step": 282,
"text_entropy": 1.0392296314239502,
"text_kl": 0.0,
"total_entropy": 2.0374677181243896
},
{
"combined_loss": 0.7105068564414978,
"completion_length": 469.1875,
"epoch": 0.09001272264631044,
"grad_norm": 1.9159810543060303,
"kl": 0.0,
"learning_rate": 9.61385347635212e-07,
"loss": 0.7105,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.7288135886192322,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.368356227874756,
"speech_entropy": 2.4519176483154297,
"speech_kl": 0.0,
"step": 283,
"text_entropy": 2.145763397216797,
"text_kl": 0.0,
"total_entropy": 2.3996076583862305
},
{
"combined_loss": 0.562969982624054,
"completion_length": 570.3125,
"epoch": 0.09033078880407125,
"grad_norm": 1.6282283067703247,
"kl": 0.0,
"learning_rate": 9.610954559391704e-07,
"loss": 0.563,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.9788135886192322,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 1.8765664100646973,
"speech_entropy": 2.180119514465332,
"speech_kl": 0.0,
"step": 284,
"text_entropy": 0.6460127830505371,
"text_kl": 0.0,
"total_entropy": 1.8707494735717773
},
{
"combined_loss": 0.691946268081665,
"completion_length": 618.0625,
"epoch": 0.09064885496183206,
"grad_norm": 1.7233694791793823,
"kl": 0.0,
"learning_rate": 9.60804529310808e-07,
"loss": 0.6919,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.3536533117294312,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.306487560272217,
"speech_entropy": 2.1861276626586914,
"speech_kl": 0.0,
"step": 285,
"text_entropy": 0.9004356861114502,
"text_kl": 0.0,
"total_entropy": 1.9308792352676392
},
{
"combined_loss": 0.6357396841049194,
"completion_length": 506.5625,
"epoch": 0.09096692111959287,
"grad_norm": 1.5584510564804077,
"kl": 0.0,
"learning_rate": 9.605125684825322e-07,
"loss": 0.6357,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1191322803497314,
"speech_entropy": 2.424971580505371,
"speech_kl": 0.0,
"step": 286,
"text_entropy": 1.4699835777282715,
"text_kl": 0.0,
"total_entropy": 2.243284225463867
},
{
"combined_loss": 0.8120319843292236,
"completion_length": 579.6875,
"epoch": 0.0912849872773537,
"grad_norm": 1.81868577003479,
"kl": 0.0,
"learning_rate": 9.602195741893546e-07,
"loss": 0.812,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.6115237474441528,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": -1.862645149230957e-09,
"sft_loss": 2.706773042678833,
"speech_entropy": 2.2252883911132812,
"speech_kl": 0.0,
"step": 287,
"text_entropy": 1.768620491027832,
"text_kl": 0.0,
"total_entropy": 2.1409504413604736
},
{
"combined_loss": 0.7210448384284973,
"completion_length": 399.75,
"epoch": 0.0916030534351145,
"grad_norm": 3.3182334899902344,
"kl": 0.0,
"learning_rate": 9.59925547168887e-07,
"loss": 0.721,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 1.0731656551361084,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.403482675552368,
"speech_entropy": 2.526559829711914,
"speech_kl": 0.0,
"step": 288,
"text_entropy": 1.5308034420013428,
"text_kl": 0.0,
"total_entropy": 2.3434336185455322
},
{
"combined_loss": 0.7331863641738892,
"completion_length": 562.9375,
"epoch": 0.09192111959287531,
"grad_norm": 1.7665117979049683,
"kl": 0.0,
"learning_rate": 9.596304881613432e-07,
"loss": 0.7332,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 1.0327467918395996,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4439544677734375,
"speech_entropy": 2.499924659729004,
"speech_kl": 0.0,
"step": 289,
"text_entropy": 1.3738188743591309,
"text_kl": 0.0,
"total_entropy": 2.2846546173095703
},
{
"combined_loss": 0.7444977760314941,
"completion_length": 483.3125,
"epoch": 0.09223918575063614,
"grad_norm": 1.8501068353652954,
"kl": 0.0,
"learning_rate": 9.593343979095332e-07,
"loss": 0.7445,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5000999569892883,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.481659173965454,
"speech_entropy": 2.394073486328125,
"speech_kl": 0.0,
"step": 290,
"text_entropy": 1.5317790508270264,
"text_kl": 0.0,
"total_entropy": 2.2343432903289795
},
{
"combined_loss": 0.6787593364715576,
"completion_length": 557.375,
"epoch": 0.09255725190839695,
"grad_norm": 2.0699431896209717,
"kl": 0.0,
"learning_rate": 9.59037277158864e-07,
"loss": 0.6788,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.9478486180305481,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.26253080368042,
"speech_entropy": 2.463019609451294,
"speech_kl": 0.0,
"step": 291,
"text_entropy": 1.1903096437454224,
"text_kl": 0.0,
"total_entropy": 2.2193331718444824
},
{
"combined_loss": 0.6866003274917603,
"completion_length": 340.25,
"epoch": 0.09287531806615776,
"grad_norm": 1.7148224115371704,
"kl": 0.0,
"learning_rate": 9.587391266573366e-07,
"loss": 0.6866,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.288667678833008,
"speech_entropy": 2.2371129989624023,
"speech_kl": 0.0,
"step": 292,
"text_entropy": 1.372521162033081,
"text_kl": 0.0,
"total_entropy": 2.084916591644287
},
{
"combined_loss": 0.6642424464225769,
"completion_length": 414.375,
"epoch": 0.09319338422391857,
"grad_norm": 1.98994779586792,
"kl": 0.0,
"learning_rate": 9.584399471555449e-07,
"loss": 0.6642,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 1.1036534309387207,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 0.0,
"sft_loss": 2.214141368865967,
"speech_entropy": 2.3391613960266113,
"speech_kl": 0.0,
"step": 293,
"text_entropy": 1.0432538986206055,
"text_kl": 0.0,
"total_entropy": 2.0753378868103027
},
{
"combined_loss": 0.6237722635269165,
"completion_length": 518.5,
"epoch": 0.09351145038167939,
"grad_norm": 1.5434069633483887,
"kl": 0.0,
"learning_rate": 9.581397394066726e-07,
"loss": 0.6238,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.0792407989501953,
"speech_entropy": 2.4281280040740967,
"speech_kl": 0.0,
"step": 294,
"text_entropy": 1.0986557006835938,
"text_kl": 0.0,
"total_entropy": 2.148758888244629
},
{
"combined_loss": 0.6445981860160828,
"completion_length": 476.0625,
"epoch": 0.0938295165394402,
"grad_norm": 1.9953515529632568,
"kl": 0.0,
"learning_rate": 9.578385041664925e-07,
"loss": 0.6446,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.5774502754211426,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.148660659790039,
"speech_entropy": 2.1423118114471436,
"speech_kl": 0.0,
"step": 295,
"text_entropy": 1.1786762475967407,
"text_kl": 0.0,
"total_entropy": 1.9594595432281494
},
{
"combined_loss": 0.698508620262146,
"completion_length": 513.6875,
"epoch": 0.09414758269720101,
"grad_norm": 1.7260363101959229,
"kl": 0.0,
"learning_rate": 9.575362421933638e-07,
"loss": 0.6985,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.9786533117294312,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.328361988067627,
"speech_entropy": 2.1992037296295166,
"speech_kl": 0.0,
"step": 296,
"text_entropy": 1.1559290885925293,
"text_kl": 0.0,
"total_entropy": 1.9949164390563965
},
{
"combined_loss": 0.6678205132484436,
"completion_length": 562.1875,
"epoch": 0.09446564885496184,
"grad_norm": 2.3985586166381836,
"kl": 0.0,
"learning_rate": 9.572329542482309e-07,
"loss": 0.6678,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.5622053742408752,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.2260682582855225,
"speech_entropy": 2.17387056350708,
"speech_kl": 0.0,
"step": 297,
"text_entropy": 1.249056339263916,
"text_kl": 0.0,
"total_entropy": 1.9902275800704956
},
{
"combined_loss": 0.7157855033874512,
"completion_length": 452.0,
"epoch": 0.09478371501272265,
"grad_norm": 1.746224045753479,
"kl": 0.0,
"learning_rate": 9.569286410946207e-07,
"loss": 0.7158,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.5774502158164978,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.385951519012451,
"speech_entropy": 2.403526782989502,
"speech_kl": 0.0,
"step": 298,
"text_entropy": 1.6002991199493408,
"text_kl": 0.0,
"total_entropy": 2.2620410919189453
},
{
"combined_loss": 0.7848547101020813,
"completion_length": 352.8125,
"epoch": 0.09510178117048346,
"grad_norm": 2.679422616958618,
"kl": 0.0,
"learning_rate": 9.566233034986411e-07,
"loss": 0.7849,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.7500999569892883,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.6161820888519287,
"speech_entropy": 2.4429244995117188,
"speech_kl": 0.0,
"step": 299,
"text_entropy": 1.1291344165802002,
"text_kl": 0.0,
"total_entropy": 2.160275936126709
},
{
"combined_loss": 0.7183820009231567,
"completion_length": 509.0,
"epoch": 0.09541984732824428,
"grad_norm": 2.244758129119873,
"kl": 0.0,
"learning_rate": 9.563169422289796e-07,
"loss": 0.7184,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.478813648223877,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.394606590270996,
"speech_entropy": 2.7869982719421387,
"speech_kl": 0.0,
"step": 300,
"text_entropy": 1.4828916788101196,
"text_kl": 0.0,
"total_entropy": 2.5221667289733887
},
{
"combined_loss": 0.6260841488838196,
"completion_length": 436.25,
"epoch": 0.09573791348600509,
"grad_norm": 2.182548761367798,
"kl": 0.0,
"learning_rate": 9.560095580568996e-07,
"loss": 0.6261,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.395711898803711,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.086947202682495,
"speech_entropy": 2.514786720275879,
"speech_kl": 0.0,
"step": 301,
"text_entropy": 1.6524386405944824,
"text_kl": 0.0,
"total_entropy": 2.2878293991088867
},
{
"combined_loss": 0.7308363914489746,
"completion_length": 391.375,
"epoch": 0.0960559796437659,
"grad_norm": 1.8132033348083496,
"kl": 0.0,
"learning_rate": 9.55701151756241e-07,
"loss": 0.7308,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.4361209869384766,
"speech_entropy": 2.2866296768188477,
"speech_kl": 0.0,
"step": 302,
"text_entropy": 1.3407695293426514,
"text_kl": 0.0,
"total_entropy": 2.0895495414733887
},
{
"combined_loss": 0.6096498966217041,
"completion_length": 303.9375,
"epoch": 0.09637404580152671,
"grad_norm": 2.869903087615967,
"kl": 0.0,
"learning_rate": 9.55391724103416e-07,
"loss": 0.6096,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 0.0,
"sft_loss": 2.0321662425994873,
"speech_entropy": 2.8348755836486816,
"speech_kl": 0.0,
"step": 303,
"text_entropy": 1.151402235031128,
"text_kl": 0.0,
"total_entropy": 2.51552152633667
},
{
"combined_loss": 0.6735842227935791,
"completion_length": 553.4375,
"epoch": 0.09669211195928754,
"grad_norm": 1.5486979484558105,
"kl": 0.0,
"learning_rate": 9.550812758774085e-07,
"loss": 0.6736,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.4788135886192322,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 0.0,
"sft_loss": 2.2452807426452637,
"speech_entropy": 2.6690659523010254,
"speech_kl": 0.0,
"step": 304,
"text_entropy": 1.2094488143920898,
"text_kl": 0.0,
"total_entropy": 2.3848817348480225
},
{
"combined_loss": 0.6192010641098022,
"completion_length": 465.4375,
"epoch": 0.09701017811704835,
"grad_norm": 1.6173670291900635,
"kl": 0.0,
"learning_rate": 9.547698078597713e-07,
"loss": 0.6192,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.6770563125610352,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0640034675598145,
"speech_entropy": 2.154529094696045,
"speech_kl": 0.0,
"step": 305,
"text_entropy": 1.051210641860962,
"text_kl": 0.0,
"total_entropy": 1.921095848083496
},
{
"combined_loss": 0.6843652725219727,
"completion_length": 401.125,
"epoch": 0.09732824427480916,
"grad_norm": 2.191951036453247,
"kl": 0.0,
"learning_rate": 9.54457320834625e-07,
"loss": 0.6844,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.281217575073242,
"speech_entropy": 2.282792091369629,
"speech_kl": 0.0,
"step": 306,
"text_entropy": 1.1090407371520996,
"text_kl": 0.0,
"total_entropy": 2.0719239711761475
},
{
"combined_loss": 0.6455174684524536,
"completion_length": 423.625,
"epoch": 0.09764631043256998,
"grad_norm": 2.207777261734009,
"kl": 0.0,
"learning_rate": 9.541438155886554e-07,
"loss": 0.6455,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 0.8644567728042603,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.1517248153686523,
"speech_entropy": 2.6509978771209717,
"speech_kl": 0.0,
"step": 307,
"text_entropy": 1.262976884841919,
"text_kl": 0.0,
"total_entropy": 2.38793683052063
},
{
"combined_loss": 0.7142800092697144,
"completion_length": 549.875,
"epoch": 0.09796437659033079,
"grad_norm": 1.5794309377670288,
"kl": 0.0,
"learning_rate": 9.538292929111112e-07,
"loss": 0.7143,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.3809332847595215,
"speech_entropy": 2.319915294647217,
"speech_kl": 0.0,
"step": 308,
"text_entropy": 1.5507756471633911,
"text_kl": 0.0,
"total_entropy": 2.1751065254211426
},
{
"combined_loss": 0.6554994583129883,
"completion_length": 314.3125,
"epoch": 0.0982824427480916,
"grad_norm": 1.8395949602127075,
"kl": 0.0,
"learning_rate": 9.535137535938031e-07,
"loss": 0.6555,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.5646764636039734,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1849982738494873,
"speech_entropy": 2.135890245437622,
"speech_kl": 0.0,
"step": 309,
"text_entropy": 0.8897652626037598,
"text_kl": 0.0,
"total_entropy": 1.9156931638717651
},
{
"combined_loss": 0.738502025604248,
"completion_length": 449.25,
"epoch": 0.09860050890585242,
"grad_norm": 1.810171127319336,
"kl": 0.0,
"learning_rate": 9.531971984311011e-07,
"loss": 0.7385,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.7654882073402405,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4616734981536865,
"speech_entropy": 2.2290310859680176,
"speech_kl": 0.0,
"step": 310,
"text_entropy": 1.629712462425232,
"text_kl": 0.0,
"total_entropy": 2.121129035949707
},
{
"combined_loss": 0.7250782251358032,
"completion_length": 504.125,
"epoch": 0.09891857506361323,
"grad_norm": 1.8087352514266968,
"kl": 0.0,
"learning_rate": 9.528796282199321e-07,
"loss": 0.7251,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4169273376464844,
"speech_entropy": 2.1521968841552734,
"speech_kl": 0.0,
"step": 311,
"text_entropy": 1.3652535676956177,
"text_kl": 0.0,
"total_entropy": 2.007472038269043
},
{
"combined_loss": 0.6740373373031616,
"completion_length": 274.0625,
"epoch": 0.09923664122137404,
"grad_norm": 2.0441370010375977,
"kl": 0.0,
"learning_rate": 9.52561043759779e-07,
"loss": 0.674,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 1.183112621307373,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.246790885925293,
"speech_entropy": 2.477415084838867,
"speech_kl": 0.0,
"step": 312,
"text_entropy": 0.9889883399009705,
"text_kl": 0.0,
"total_entropy": 2.1920857429504395
},
{
"combined_loss": 0.64084392786026,
"completion_length": 396.0625,
"epoch": 0.09955470737913485,
"grad_norm": 1.5913077592849731,
"kl": 0.0,
"learning_rate": 9.522414458526778e-07,
"loss": 0.6408,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1361465454101562,
"speech_entropy": 2.176445484161377,
"speech_kl": 0.0,
"step": 313,
"text_entropy": 1.2821910381317139,
"text_kl": 0.0,
"total_entropy": 2.0197858810424805
},
{
"combined_loss": 0.6571515202522278,
"completion_length": 449.0625,
"epoch": 0.09987277353689568,
"grad_norm": 1.8189066648483276,
"kl": 0.0,
"learning_rate": 9.519208353032158e-07,
"loss": 0.6572,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.4733423590660095,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.190505027770996,
"speech_entropy": 2.1687145233154297,
"speech_kl": 0.0,
"step": 314,
"text_entropy": 1.071582555770874,
"text_kl": 0.0,
"total_entropy": 1.9608268737792969
},
{
"combined_loss": 0.5773699283599854,
"completion_length": 375.3125,
"epoch": 0.10019083969465649,
"grad_norm": 1.6602267026901245,
"kl": 0.0,
"learning_rate": 9.515992129185294e-07,
"loss": 0.5774,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.8274502158164978,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 1.924566388130188,
"speech_entropy": 2.1676058769226074,
"speech_kl": 0.0,
"step": 315,
"text_entropy": 0.8978569507598877,
"text_kl": 0.0,
"total_entropy": 1.9022667407989502
},
{
"combined_loss": 0.591927170753479,
"completion_length": 374.9375,
"epoch": 0.1005089058524173,
"grad_norm": 1.8265076875686646,
"kl": 0.0,
"learning_rate": 9.512765795083029e-07,
"loss": 0.5919,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 0.0,
"sft_loss": 1.973090410232544,
"speech_entropy": 2.2445173263549805,
"speech_kl": 0.0,
"step": 316,
"text_entropy": 0.9660072922706604,
"text_kl": 0.0,
"total_entropy": 1.9937914609909058
},
{
"combined_loss": 0.7159188985824585,
"completion_length": 387.875,
"epoch": 0.10082697201017812,
"grad_norm": 2.4045403003692627,
"kl": 0.0,
"learning_rate": 9.509529358847654e-07,
"loss": 0.7159,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.3863961696624756,
"speech_entropy": 2.341677665710449,
"speech_kl": 0.0,
"step": 317,
"text_entropy": 1.6361427307128906,
"text_kl": 0.0,
"total_entropy": 2.214456796646118
},
{
"combined_loss": 0.6382442712783813,
"completion_length": 391.0625,
"epoch": 0.10114503816793893,
"grad_norm": 1.9414576292037964,
"kl": 0.0,
"learning_rate": 9.506282828626894e-07,
"loss": 0.6382,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.1274807453155518,
"speech_entropy": 2.3970725536346436,
"speech_kl": 0.0,
"step": 318,
"text_entropy": 1.257880449295044,
"text_kl": 0.0,
"total_entropy": 2.180114269256592
},
{
"combined_loss": 0.6456592082977295,
"completion_length": 357.625,
"epoch": 0.10146310432569974,
"grad_norm": 1.8857872486114502,
"kl": 0.0,
"learning_rate": 9.503026212593886e-07,
"loss": 0.6457,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.1521973609924316,
"speech_entropy": 2.1984481811523438,
"speech_kl": 0.0,
"step": 319,
"text_entropy": 1.3871957063674927,
"text_kl": 0.0,
"total_entropy": 2.054154634475708
},
{
"combined_loss": 0.6763216257095337,
"completion_length": 374.75,
"epoch": 0.10178117048346055,
"grad_norm": 1.619836688041687,
"kl": 0.0,
"learning_rate": 9.499759518947154e-07,
"loss": 0.6763,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.2544054985046387,
"speech_entropy": 2.144301414489746,
"speech_kl": 0.0,
"step": 320,
"text_entropy": 0.933074951171875,
"text_kl": 0.0,
"total_entropy": 1.9263477325439453
},
{
"combined_loss": 0.8536103963851929,
"completion_length": 433.5625,
"epoch": 0.10209923664122138,
"grad_norm": 3.0396976470947266,
"kl": 0.0,
"learning_rate": 9.496482755910599e-07,
"loss": 0.8536,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.772705078125,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.8453681468963623,
"speech_entropy": 2.4692482948303223,
"speech_kl": 0.0,
"step": 321,
"text_entropy": 1.5737974643707275,
"text_kl": 0.0,
"total_entropy": 2.3104732036590576
},
{
"combined_loss": 0.6165826320648193,
"completion_length": 455.8125,
"epoch": 0.10241730279898219,
"grad_norm": 1.8582490682601929,
"kl": 0.0,
"learning_rate": 9.493195931733465e-07,
"loss": 0.6166,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0552754402160645,
"speech_entropy": 2.3463516235351562,
"speech_kl": 0.0,
"step": 322,
"text_entropy": 0.7413707971572876,
"text_kl": 0.0,
"total_entropy": 2.0313949584960938
},
{
"combined_loss": 0.7083429098129272,
"completion_length": 455.0625,
"epoch": 0.102735368956743,
"grad_norm": 1.5822250843048096,
"kl": 0.0,
"learning_rate": 9.489899054690329e-07,
"loss": 0.7083,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.3536534011363983,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 2.361143112182617,
"speech_entropy": 2.1958041191101074,
"speech_kl": 0.0,
"step": 323,
"text_entropy": 1.5856623649597168,
"text_kl": 0.0,
"total_entropy": 2.0761876106262207
},
{
"combined_loss": 0.6362742185592651,
"completion_length": 430.3125,
"epoch": 0.10305343511450382,
"grad_norm": 2.0028886795043945,
"kl": 0.0,
"learning_rate": 9.486592133081075e-07,
"loss": 0.6363,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.4478486180305481,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1209139823913574,
"speech_entropy": 2.1673266887664795,
"speech_kl": 0.0,
"step": 324,
"text_entropy": 1.05299711227417,
"text_kl": 0.0,
"total_entropy": 1.9691715240478516
},
{
"combined_loss": 0.7382842302322388,
"completion_length": 476.875,
"epoch": 0.10337150127226463,
"grad_norm": 2.1232645511627197,
"kl": 0.0,
"learning_rate": 9.483275175230874e-07,
"loss": 0.7383,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.0774502754211426,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.460947275161743,
"speech_entropy": 2.690502882003784,
"speech_kl": 0.0,
"step": 325,
"text_entropy": 1.2036354541778564,
"text_kl": 0.0,
"total_entropy": 2.3894941806793213
},
{
"combined_loss": 0.6234292387962341,
"completion_length": 601.4375,
"epoch": 0.10368956743002544,
"grad_norm": 1.537564992904663,
"kl": 0.0,
"learning_rate": 9.479948189490164e-07,
"loss": 0.6234,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.078097343444824,
"speech_entropy": 2.1220028400421143,
"speech_kl": 0.0,
"step": 326,
"text_entropy": 1.139967918395996,
"text_kl": 0.0,
"total_entropy": 1.9373573064804077
},
{
"combined_loss": 0.6593961715698242,
"completion_length": 401.125,
"epoch": 0.10400763358778627,
"grad_norm": 2.1285951137542725,
"kl": 0.0,
"learning_rate": 9.476611184234627e-07,
"loss": 0.6594,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 1.019437551498413,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.1979873180389404,
"speech_entropy": 2.558565616607666,
"speech_kl": 0.0,
"step": 327,
"text_entropy": 1.2850337028503418,
"text_kl": 0.0,
"total_entropy": 2.329627513885498
},
{
"combined_loss": 0.7995979189872742,
"completion_length": 416.5625,
"epoch": 0.10432569974554708,
"grad_norm": 2.2032155990600586,
"kl": 0.0,
"learning_rate": 9.473264167865171e-07,
"loss": 0.7996,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 0.0,
"sft_loss": 2.6653263568878174,
"speech_entropy": 2.289992332458496,
"speech_kl": 0.0,
"step": 328,
"text_entropy": 1.8079229593276978,
"text_kl": 0.0,
"total_entropy": 2.2008442878723145
},
{
"combined_loss": 0.7375897169113159,
"completion_length": 515.0625,
"epoch": 0.10464376590330789,
"grad_norm": 1.9254885911941528,
"kl": 0.0,
"learning_rate": 9.469907148807904e-07,
"loss": 0.7376,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.8515443801879883,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.458632469177246,
"speech_entropy": 2.328657627105713,
"speech_kl": 0.0,
"step": 329,
"text_entropy": 1.6102688312530518,
"text_kl": 0.0,
"total_entropy": 2.207139492034912
},
{
"combined_loss": 0.6494673490524292,
"completion_length": 605.5,
"epoch": 0.1049618320610687,
"grad_norm": 1.5243898630142212,
"kl": 0.0,
"learning_rate": 9.466540135514118e-07,
"loss": 0.6495,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.5581127405166626,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.164891004562378,
"speech_entropy": 2.216064929962158,
"speech_kl": 0.0,
"step": 330,
"text_entropy": 1.050743579864502,
"text_kl": 0.0,
"total_entropy": 1.9922053813934326
},
{
"combined_loss": 0.6644630432128906,
"completion_length": 465.4375,
"epoch": 0.10527989821882952,
"grad_norm": 2.0127413272857666,
"kl": 0.0,
"learning_rate": 9.463163136460267e-07,
"loss": 0.6645,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 1.1372368335723877,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.214876651763916,
"speech_entropy": 2.606668472290039,
"speech_kl": 0.0,
"step": 331,
"text_entropy": 1.4162580966949463,
"text_kl": 0.0,
"total_entropy": 2.3680739402770996
},
{
"combined_loss": 0.6894693374633789,
"completion_length": 457.5,
"epoch": 0.10559796437659033,
"grad_norm": 1.5198652744293213,
"kl": 0.0,
"learning_rate": 9.45977616014794e-07,
"loss": 0.6895,
"num_samples": 1.0,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 4.8125,
"rl_loss": 0.0,
"sft_loss": 2.2982308864593506,
"speech_entropy": 2.0877485275268555,
"speech_kl": 0.0,
"step": 332,
"text_entropy": 1.2472989559173584,
"text_kl": 0.0,
"total_entropy": 1.9227503538131714
},
{
"combined_loss": 0.6039384603500366,
"completion_length": 533.625,
"epoch": 0.10591603053435114,
"grad_norm": 1.8885440826416016,
"kl": 0.0,
"learning_rate": 9.456379215103845e-07,
"loss": 0.6039,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 1.010462999343872,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0131282806396484,
"speech_entropy": 2.623091697692871,
"speech_kl": 0.0,
"step": 333,
"text_entropy": 0.8223081827163696,
"text_kl": 0.0,
"total_entropy": 2.1995439529418945
},
{
"combined_loss": 0.7336439490318298,
"completion_length": 343.3125,
"epoch": 0.10623409669211197,
"grad_norm": 1.7900915145874023,
"kl": 0.0,
"learning_rate": 9.452972309879789e-07,
"loss": 0.7336,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.4454798698425293,
"speech_entropy": 2.2133467197418213,
"speech_kl": 0.0,
"step": 334,
"text_entropy": 1.487365484237671,
"text_kl": 0.0,
"total_entropy": 2.088663339614868
},
{
"combined_loss": 0.6571398377418518,
"completion_length": 472.6875,
"epoch": 0.10655216284987278,
"grad_norm": 1.6357449293136597,
"kl": 0.0,
"learning_rate": 9.44955545305265e-07,
"loss": 0.6571,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.5000999569892883,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 0.0,
"sft_loss": 2.1904659271240234,
"speech_entropy": 2.30568790435791,
"speech_kl": 0.0,
"step": 335,
"text_entropy": 0.9574769139289856,
"text_kl": 0.0,
"total_entropy": 2.014315128326416
},
{
"combined_loss": 0.663806140422821,
"completion_length": 316.8125,
"epoch": 0.10687022900763359,
"grad_norm": 1.8551936149597168,
"kl": 0.0,
"learning_rate": 9.446128653224363e-07,
"loss": 0.6638,
"num_samples": 1.0,
"reward": 2.9375,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 2.9375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2126870155334473,
"speech_entropy": 2.284435749053955,
"speech_kl": 0.0,
"step": 336,
"text_entropy": 1.4010343551635742,
"text_kl": 0.0,
"total_entropy": 2.119978904724121
},
{
"combined_loss": 0.696797788143158,
"completion_length": 450.1875,
"epoch": 0.1071882951653944,
"grad_norm": 1.812302589416504,
"kl": 0.0,
"learning_rate": 9.442691919021891e-07,
"loss": 0.6968,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.7887751460075378,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.3226590156555176,
"speech_entropy": 2.48870587348938,
"speech_kl": 0.0,
"step": 337,
"text_entropy": 1.0859538316726685,
"text_kl": 0.0,
"total_entropy": 2.2247252464294434
},
{
"combined_loss": 0.6544849276542664,
"completion_length": 362.5,
"epoch": 0.10750636132315522,
"grad_norm": 1.7814265489578247,
"kl": 0.0,
"learning_rate": 9.43924525909721e-07,
"loss": 0.6545,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 0.6831126809120178,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.1816163063049316,
"speech_entropy": 2.2594857215881348,
"speech_kl": 0.0,
"step": 338,
"text_entropy": 1.0299164056777954,
"text_kl": 0.0,
"total_entropy": 2.003577470779419
},
{
"combined_loss": 0.5944312214851379,
"completion_length": 279.625,
"epoch": 0.10782442748091603,
"grad_norm": 1.5945285558700562,
"kl": 0.0,
"learning_rate": 9.43578868212728e-07,
"loss": 0.5944,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.4733423590660095,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 1.9814373254776,
"speech_entropy": 2.3196024894714355,
"speech_kl": 0.0,
"step": 339,
"text_entropy": 0.6264474391937256,
"text_kl": 0.0,
"total_entropy": 1.9987720251083374
},
{
"combined_loss": 0.7080922722816467,
"completion_length": 357.75,
"epoch": 0.10814249363867684,
"grad_norm": 1.8270853757858276,
"kl": 0.0,
"learning_rate": 9.432322196814032e-07,
"loss": 0.7081,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.360307455062866,
"speech_entropy": 2.377963066101074,
"speech_kl": 0.0,
"step": 340,
"text_entropy": 1.37347412109375,
"text_kl": 0.0,
"total_entropy": 2.1954591274261475
},
{
"combined_loss": 0.6495932340621948,
"completion_length": 439.4375,
"epoch": 0.10846055979643766,
"grad_norm": 1.823044776916504,
"kl": 0.0,
"learning_rate": 9.428845811884336e-07,
"loss": 0.6496,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.165310859680176,
"speech_entropy": 2.547529697418213,
"speech_kl": 0.0,
"step": 341,
"text_entropy": 1.3410420417785645,
"text_kl": 0.0,
"total_entropy": 2.3126533031463623
},
{
"combined_loss": 0.744182825088501,
"completion_length": 700.0625,
"epoch": 0.10877862595419847,
"grad_norm": 1.4928951263427734,
"kl": 0.0,
"learning_rate": 9.42535953608999e-07,
"loss": 0.7442,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 0.0,
"sft_loss": 2.48060941696167,
"speech_entropy": 2.0912342071533203,
"speech_kl": 0.0,
"step": 342,
"text_entropy": 1.2619953155517578,
"text_kl": 0.0,
"total_entropy": 1.9290344715118408
},
{
"combined_loss": 0.6683810353279114,
"completion_length": 419.0,
"epoch": 0.10909669211195928,
"grad_norm": 2.006411075592041,
"kl": 0.0,
"learning_rate": 9.421863378207685e-07,
"loss": 0.6684,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.0154881477355957,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.2279367446899414,
"speech_entropy": 2.1261141300201416,
"speech_kl": 0.0,
"step": 343,
"text_entropy": 1.138145923614502,
"text_kl": 0.0,
"total_entropy": 1.941408395767212
},
{
"combined_loss": 0.6885769367218018,
"completion_length": 448.625,
"epoch": 0.10941475826972011,
"grad_norm": 1.6280336380004883,
"kl": 0.0,
"learning_rate": 9.418357347038998e-07,
"loss": 0.6886,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2952563762664795,
"speech_entropy": 2.533572196960449,
"speech_kl": 0.0,
"step": 344,
"text_entropy": 1.452898621559143,
"text_kl": 0.0,
"total_entropy": 2.3209335803985596
},
{
"combined_loss": 0.6933377981185913,
"completion_length": 469.25,
"epoch": 0.10973282442748092,
"grad_norm": 1.7831571102142334,
"kl": 0.0,
"learning_rate": 9.414841451410354e-07,
"loss": 0.6933,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.7217878103256226,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.3111257553100586,
"speech_entropy": 2.4832825660705566,
"speech_kl": 0.0,
"step": 345,
"text_entropy": 1.2831058502197266,
"text_kl": 0.0,
"total_entropy": 2.2077646255493164
},
{
"combined_loss": 0.6893813610076904,
"completion_length": 457.25,
"epoch": 0.11005089058524173,
"grad_norm": 1.5565513372421265,
"kl": 0.0,
"learning_rate": 9.411315700173023e-07,
"loss": 0.6894,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.2979378700256348,
"speech_entropy": 2.1734261512756348,
"speech_kl": 0.0,
"step": 346,
"text_entropy": 1.4588537216186523,
"text_kl": 0.0,
"total_entropy": 2.0491230487823486
},
{
"combined_loss": 0.6448703408241272,
"completion_length": 340.125,
"epoch": 0.11036895674300254,
"grad_norm": 2.0143826007843018,
"kl": 0.0,
"learning_rate": 9.407780102203073e-07,
"loss": 0.6449,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.0313551425933838,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.1495676040649414,
"speech_entropy": 2.1733856201171875,
"speech_kl": 0.0,
"step": 347,
"text_entropy": 1.2910046577453613,
"text_kl": 0.0,
"total_entropy": 2.0091493129730225
},
{
"combined_loss": 0.7653839588165283,
"completion_length": 219.375,
"epoch": 0.11068702290076336,
"grad_norm": 2.6158885955810547,
"kl": 0.0,
"learning_rate": 9.40423466640137e-07,
"loss": 0.7654,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.4478486180305481,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.5512795448303223,
"speech_entropy": 2.3109560012817383,
"speech_kl": 0.0,
"step": 348,
"text_entropy": 1.525007963180542,
"text_kl": 0.0,
"total_entropy": 2.177790641784668
},
{
"combined_loss": 0.7735346555709839,
"completion_length": 485.0,
"epoch": 0.11100508905852417,
"grad_norm": 1.7478300333023071,
"kl": 0.0,
"learning_rate": 9.400679401693546e-07,
"loss": 0.7735,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.578448534011841,
"speech_entropy": 2.2317914962768555,
"speech_kl": 0.0,
"step": 349,
"text_entropy": 1.4859943389892578,
"text_kl": 0.0,
"total_entropy": 2.102978229522705
},
{
"combined_loss": 0.7051453590393066,
"completion_length": 413.4375,
"epoch": 0.11132315521628498,
"grad_norm": 1.8334420919418335,
"kl": 0.0,
"learning_rate": 9.397114317029974e-07,
"loss": 0.7051,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 0.0,
"sft_loss": 2.3504843711853027,
"speech_entropy": 2.3960375785827637,
"speech_kl": 0.0,
"step": 350,
"text_entropy": 1.6391247510910034,
"text_kl": 0.0,
"total_entropy": 2.2544939517974854
},
{
"combined_loss": 0.6969923973083496,
"completion_length": 393.25,
"epoch": 0.11164122137404581,
"grad_norm": 1.7989410161972046,
"kl": 0.0,
"learning_rate": 9.393539421385749e-07,
"loss": 0.697,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.323307514190674,
"speech_entropy": 2.3470993041992188,
"speech_kl": 0.0,
"step": 351,
"text_entropy": 1.3637549877166748,
"text_kl": 0.0,
"total_entropy": 2.1490964889526367
},
{
"combined_loss": 0.6479834318161011,
"completion_length": 286.0625,
"epoch": 0.11195928753180662,
"grad_norm": 1.3780865669250488,
"kl": 0.0,
"learning_rate": 9.38995472376067e-07,
"loss": 0.648,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 0.0,
"sft_loss": 2.159944534301758,
"speech_entropy": 2.2307074069976807,
"speech_kl": 0.0,
"step": 352,
"text_entropy": 1.2372667789459229,
"text_kl": 0.0,
"total_entropy": 2.0377559661865234
},
{
"combined_loss": 0.9208307862281799,
"completion_length": 562.375,
"epoch": 0.11227735368956743,
"grad_norm": 2.392199754714966,
"kl": 0.0,
"learning_rate": 9.386360233179206e-07,
"loss": 0.9208,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.7065354585647583,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 0.0,
"sft_loss": 3.0694358348846436,
"speech_entropy": 2.4065301418304443,
"speech_kl": 0.0,
"step": 353,
"text_entropy": 1.0953893661499023,
"text_kl": 0.0,
"total_entropy": 2.1367077827453613
},
{
"combined_loss": 0.6610080599784851,
"completion_length": 534.125,
"epoch": 0.11259541984732824,
"grad_norm": 1.6925069093704224,
"kl": 0.0,
"learning_rate": 9.382755958690485e-07,
"loss": 0.661,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.7500999569892883,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.203360080718994,
"speech_entropy": 2.2898178100585938,
"speech_kl": 0.0,
"step": 354,
"text_entropy": 1.2608642578125,
"text_kl": 0.0,
"total_entropy": 2.0855863094329834
},
{
"combined_loss": 0.6016900539398193,
"completion_length": 431.875,
"epoch": 0.11291348600508906,
"grad_norm": 2.1892993450164795,
"kl": 0.0,
"learning_rate": 9.379141909368262e-07,
"loss": 0.6017,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.2394567728042603,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.0056333541870117,
"speech_entropy": 2.3564295768737793,
"speech_kl": 0.0,
"step": 355,
"text_entropy": 1.2530461549758911,
"text_kl": 0.0,
"total_entropy": 2.141939640045166
},
{
"combined_loss": 0.6209220290184021,
"completion_length": 342.8125,
"epoch": 0.11323155216284987,
"grad_norm": 2.095818519592285,
"kl": 0.0,
"learning_rate": 9.375518094310902e-07,
"loss": 0.6209,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.7288135886192322,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.069740056991577,
"speech_entropy": 2.1869020462036133,
"speech_kl": 0.0,
"step": 356,
"text_entropy": 1.4376100301742554,
"text_kl": 0.0,
"total_entropy": 2.056838035583496
},
{
"combined_loss": 0.6652117967605591,
"completion_length": 436.75,
"epoch": 0.11354961832061068,
"grad_norm": 1.6426494121551514,
"kl": 0.0,
"learning_rate": 9.371884522641357e-07,
"loss": 0.6652,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.4733423590660095,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.217372417449951,
"speech_entropy": 2.4225001335144043,
"speech_kl": 0.0,
"step": 357,
"text_entropy": 1.0462160110473633,
"text_kl": 0.0,
"total_entropy": 2.1539487838745117
},
{
"combined_loss": 0.6718185544013977,
"completion_length": 358.9375,
"epoch": 0.1138676844783715,
"grad_norm": 2.442697048187256,
"kl": 0.0,
"learning_rate": 9.368241203507136e-07,
"loss": 0.6718,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.6831126809120178,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2393951416015625,
"speech_entropy": 2.331137180328369,
"speech_kl": 0.0,
"step": 358,
"text_entropy": 1.2198386192321777,
"text_kl": 0.0,
"total_entropy": 2.1104815006256104
},
{
"combined_loss": 0.6823822259902954,
"completion_length": 366.125,
"epoch": 0.11418575063613232,
"grad_norm": 2.0511248111724854,
"kl": 0.0,
"learning_rate": 9.364588146080293e-07,
"loss": 0.6824,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.7042241096496582,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2746074199676514,
"speech_entropy": 2.4417941570281982,
"speech_kl": 0.0,
"step": 359,
"text_entropy": 1.3095173835754395,
"text_kl": 0.0,
"total_entropy": 2.220010757446289
},
{
"combined_loss": 0.8011901378631592,
"completion_length": 471.9375,
"epoch": 0.11450381679389313,
"grad_norm": 2.027939558029175,
"kl": 0.0,
"learning_rate": 9.360925359557396e-07,
"loss": 0.8012,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.6444375514984131,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.6706337928771973,
"speech_entropy": 2.198263168334961,
"speech_kl": 0.0,
"step": 360,
"text_entropy": 1.6125402450561523,
"text_kl": 0.0,
"total_entropy": 2.0894393920898438
},
{
"combined_loss": 0.6634936332702637,
"completion_length": 622.0625,
"epoch": 0.11482188295165395,
"grad_norm": 1.804551124572754,
"kl": 0.0,
"learning_rate": 9.357252853159505e-07,
"loss": 0.6635,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.2180101871490479,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.2116456031799316,
"speech_entropy": 2.1450726985931396,
"speech_kl": 0.0,
"step": 361,
"text_entropy": 1.2231552600860596,
"text_kl": 0.0,
"total_entropy": 1.972318410873413
},
{
"combined_loss": 0.7286593317985535,
"completion_length": 344.0,
"epoch": 0.11513994910941476,
"grad_norm": 2.2605528831481934,
"kl": 0.0,
"learning_rate": 9.35357063613215e-07,
"loss": 0.7287,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.933112621307373,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4288644790649414,
"speech_entropy": 2.370051383972168,
"speech_kl": 0.0,
"step": 362,
"text_entropy": 1.7082545757293701,
"text_kl": 0.0,
"total_entropy": 2.255542039871216
},
{
"combined_loss": 0.7230384349822998,
"completion_length": 372.1875,
"epoch": 0.11545801526717557,
"grad_norm": 1.8940509557724,
"kl": 0.0,
"learning_rate": 9.349878717745308e-07,
"loss": 0.723,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.410127878189087,
"speech_entropy": 2.214015245437622,
"speech_kl": 0.0,
"step": 363,
"text_entropy": 1.4911437034606934,
"text_kl": 0.0,
"total_entropy": 2.0867202281951904
},
{
"combined_loss": 0.7651809453964233,
"completion_length": 591.6875,
"epoch": 0.11577608142493638,
"grad_norm": 1.642069697380066,
"kl": 0.0,
"learning_rate": 9.34617710729338e-07,
"loss": 0.7652,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.8538135886192322,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.550603151321411,
"speech_entropy": 2.124917507171631,
"speech_kl": 0.0,
"step": 364,
"text_entropy": 1.3163607120513916,
"text_kl": 0.0,
"total_entropy": 1.9680266380310059
},
{
"combined_loss": 0.6506673693656921,
"completion_length": 625.3125,
"epoch": 0.1160941475826972,
"grad_norm": 2.7186765670776367,
"kl": 0.0,
"learning_rate": 9.342465814095166e-07,
"loss": 0.6507,
"num_samples": 1.0,
"reward": 2.4375,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_holistic_reward": 2.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.168890953063965,
"speech_entropy": 2.8031604290008545,
"speech_kl": 0.0,
"step": 365,
"text_entropy": 1.175290822982788,
"text_kl": 0.0,
"total_entropy": 2.457470655441284
},
{
"combined_loss": 0.6432278156280518,
"completion_length": 494.9375,
"epoch": 0.11641221374045801,
"grad_norm": 1.8091281652450562,
"kl": 0.0,
"learning_rate": 9.338744847493842e-07,
"loss": 0.6432,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.4788135886192322,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.1440927982330322,
"speech_entropy": 2.3679351806640625,
"speech_kl": 0.0,
"step": 366,
"text_entropy": 1.285107135772705,
"text_kl": 0.0,
"total_entropy": 2.1672964096069336
},
{
"combined_loss": 0.6938588619232178,
"completion_length": 392.9375,
"epoch": 0.11673027989821882,
"grad_norm": 2.0217089653015137,
"kl": 0.0,
"learning_rate": 9.335014216856936e-07,
"loss": 0.6939,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.3128628730773926,
"speech_entropy": 2.1415584087371826,
"speech_kl": 0.0,
"step": 367,
"text_entropy": 1.1758991479873657,
"text_kl": 0.0,
"total_entropy": 1.9482624530792236
},
{
"combined_loss": 0.726897120475769,
"completion_length": 487.875,
"epoch": 0.11704834605597965,
"grad_norm": 1.7421088218688965,
"kl": 0.0,
"learning_rate": 9.331273931576306e-07,
"loss": 0.7269,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.8146764636039734,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.422990322113037,
"speech_entropy": 2.3109121322631836,
"speech_kl": 0.0,
"step": 368,
"text_entropy": 1.0041307210922241,
"text_kl": 0.0,
"total_entropy": 2.0410120487213135
},
{
"combined_loss": 0.6220736503601074,
"completion_length": 688.875,
"epoch": 0.11736641221374046,
"grad_norm": 1.5585706233978271,
"kl": 0.0,
"learning_rate": 9.327524001068118e-07,
"loss": 0.6221,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.8644567728042603,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.0735785961151123,
"speech_entropy": 2.154421806335449,
"speech_kl": 0.0,
"step": 369,
"text_entropy": 1.0012127161026,
"text_kl": 0.0,
"total_entropy": 1.9310146570205688
},
{
"combined_loss": 0.6993934512138367,
"completion_length": 423.875,
"epoch": 0.11768447837150127,
"grad_norm": 1.9345406293869019,
"kl": 0.0,
"learning_rate": 9.323764434772815e-07,
"loss": 0.6994,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 0.6231511831283569,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3313114643096924,
"speech_entropy": 2.3823764324188232,
"speech_kl": 0.0,
"step": 370,
"text_entropy": 1.6932473182678223,
"text_kl": 0.0,
"total_entropy": 2.245680332183838
},
{
"combined_loss": 0.6287088394165039,
"completion_length": 436.6875,
"epoch": 0.1180025445292621,
"grad_norm": 2.2497189044952393,
"kl": 0.0,
"learning_rate": 9.319995242155101e-07,
"loss": 0.6287,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 0.0,
"sft_loss": 2.095696210861206,
"speech_entropy": 2.30611515045166,
"speech_kl": 0.0,
"step": 371,
"text_entropy": 0.9617571234703064,
"text_kl": 0.0,
"total_entropy": 2.0462255477905273
},
{
"combined_loss": 0.6533428430557251,
"completion_length": 536.4375,
"epoch": 0.1183206106870229,
"grad_norm": 2.011059045791626,
"kl": 0.0,
"learning_rate": 9.316216432703917e-07,
"loss": 0.6533,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 1.057937741279602,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.177809238433838,
"speech_entropy": 2.581768035888672,
"speech_kl": 0.0,
"step": 372,
"text_entropy": 1.0381070375442505,
"text_kl": 0.0,
"total_entropy": 2.2744696140289307
},
{
"combined_loss": 0.6885940432548523,
"completion_length": 485.1875,
"epoch": 0.11863867684478371,
"grad_norm": 2.2359321117401123,
"kl": 0.0,
"learning_rate": 9.312428015932407e-07,
"loss": 0.6886,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 1.226402759552002,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.2953133583068848,
"speech_entropy": 3.0134053230285645,
"speech_kl": 0.0,
"step": 373,
"text_entropy": 1.2640031576156616,
"text_kl": 0.0,
"total_entropy": 2.5999834537506104
},
{
"combined_loss": 0.6422200202941895,
"completion_length": 451.625,
"epoch": 0.11895674300254452,
"grad_norm": 1.8345330953598022,
"kl": 0.0,
"learning_rate": 9.308630001377909e-07,
"loss": 0.6422,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 0.9565354585647583,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 0.0,
"sft_loss": 2.140733242034912,
"speech_entropy": 2.1300010681152344,
"speech_kl": 0.0,
"step": 374,
"text_entropy": 1.349212408065796,
"text_kl": 0.0,
"total_entropy": 1.9911762475967407
},
{
"combined_loss": 0.6511521339416504,
"completion_length": 440.8125,
"epoch": 0.11927480916030535,
"grad_norm": 2.062229871749878,
"kl": 0.0,
"learning_rate": 9.304822398601919e-07,
"loss": 0.6512,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 0.0,
"sft_loss": 2.1705071926116943,
"speech_entropy": 2.244220495223999,
"speech_kl": 0.0,
"step": 375,
"text_entropy": 1.1989765167236328,
"text_kl": 0.0,
"total_entropy": 2.0404000282287598
},
{
"combined_loss": 0.7069913148880005,
"completion_length": 472.375,
"epoch": 0.11959287531806616,
"grad_norm": 2.3955235481262207,
"kl": 0.0,
"learning_rate": 9.301005217190072e-07,
"loss": 0.707,
"num_samples": 1.0,
"reward": 2.625,
"reward_std": 1.0000998973846436,
"rewards/gpt4o_holistic_reward": 2.625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.356637477874756,
"speech_entropy": 3.064028263092041,
"speech_kl": 0.0,
"step": 376,
"text_entropy": 1.921829104423523,
"text_kl": 0.0,
"total_entropy": 2.78952693939209
},
{
"combined_loss": 0.648313045501709,
"completion_length": 499.5625,
"epoch": 0.11991094147582697,
"grad_norm": 2.091409206390381,
"kl": 0.0,
"learning_rate": 9.297178466752118e-07,
"loss": 0.6483,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.0774502754211426,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.161043405532837,
"speech_entropy": 3.0961804389953613,
"speech_kl": 0.0,
"step": 377,
"text_entropy": 1.5107793807983398,
"text_kl": 0.0,
"total_entropy": 2.7813608646392822
},
{
"combined_loss": 0.713459849357605,
"completion_length": 469.625,
"epoch": 0.12022900763358779,
"grad_norm": 2.0358638763427734,
"kl": 0.0,
"learning_rate": 9.293342156921896e-07,
"loss": 0.7135,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 0.0,
"sft_loss": 2.378199338912964,
"speech_entropy": 2.2007017135620117,
"speech_kl": 0.0,
"step": 378,
"text_entropy": 1.2517070770263672,
"text_kl": 0.0,
"total_entropy": 2.017704963684082
},
{
"combined_loss": 0.6941728591918945,
"completion_length": 430.8125,
"epoch": 0.1205470737913486,
"grad_norm": 1.5506932735443115,
"kl": 0.0,
"learning_rate": 9.289496297357313e-07,
"loss": 0.6942,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.3139095306396484,
"speech_entropy": 2.1886677742004395,
"speech_kl": 0.0,
"step": 379,
"text_entropy": 1.5006790161132812,
"text_kl": 0.0,
"total_entropy": 2.0677237510681152
},
{
"combined_loss": 0.7921469211578369,
"completion_length": 444.0,
"epoch": 0.12086513994910941,
"grad_norm": 1.8955799341201782,
"kl": 0.0,
"learning_rate": 9.285640897740315e-07,
"loss": 0.7921,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 0.0,
"sft_loss": 2.6404895782470703,
"speech_entropy": 2.0556373596191406,
"speech_kl": 0.0,
"step": 380,
"text_entropy": 1.4717328548431396,
"text_kl": 0.0,
"total_entropy": 1.9481353759765625
},
{
"combined_loss": 0.7151652574539185,
"completion_length": 566.5,
"epoch": 0.12118320610687022,
"grad_norm": 1.9992356300354004,
"kl": 0.0,
"learning_rate": 9.281775967776865e-07,
"loss": 0.7152,
"num_samples": 1.0,
"reward": 4.5,
"reward_std": 0.864456832408905,
"rewards/gpt4o_holistic_reward": 4.5,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.3838841915130615,
"speech_entropy": 2.1918318271636963,
"speech_kl": 0.0,
"step": 381,
"text_entropy": 1.32926344871521,
"text_kl": 0.0,
"total_entropy": 2.039583683013916
},
{
"combined_loss": 0.6602721214294434,
"completion_length": 361.8125,
"epoch": 0.12150127226463105,
"grad_norm": 2.4221584796905518,
"kl": 0.0,
"learning_rate": 9.277901517196921e-07,
"loss": 0.6603,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.704224169254303,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2009072303771973,
"speech_entropy": 3.4160265922546387,
"speech_kl": 0.0,
"step": 382,
"text_entropy": 1.843759536743164,
"text_kl": 0.0,
"total_entropy": 3.1454572677612305
},
{
"combined_loss": 0.6596149206161499,
"completion_length": 469.5625,
"epoch": 0.12181933842239186,
"grad_norm": 2.0681304931640625,
"kl": 0.0,
"learning_rate": 9.274017555754407e-07,
"loss": 0.6596,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.8676799535751343,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.198716402053833,
"speech_entropy": 2.4791064262390137,
"speech_kl": 0.0,
"step": 383,
"text_entropy": 1.5029938220977783,
"text_kl": 0.0,
"total_entropy": 2.3004837036132812
},
{
"combined_loss": 0.6301867961883545,
"completion_length": 606.3125,
"epoch": 0.12213740458015267,
"grad_norm": 2.0828793048858643,
"kl": 0.0,
"learning_rate": 9.270124093227192e-07,
"loss": 0.6302,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.8483423590660095,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1006226539611816,
"speech_entropy": 2.148000717163086,
"speech_kl": 0.0,
"step": 384,
"text_entropy": 0.49018028378486633,
"text_kl": 0.0,
"total_entropy": 1.8085635900497437
},
{
"combined_loss": 0.589484453201294,
"completion_length": 417.5,
"epoch": 0.12245547073791349,
"grad_norm": 2.000455856323242,
"kl": 0.0,
"learning_rate": 9.266221139417064e-07,
"loss": 0.5895,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 1.9649479389190674,
"speech_entropy": 2.3916573524475098,
"speech_kl": 0.0,
"step": 385,
"text_entropy": 1.274303913116455,
"text_kl": 0.0,
"total_entropy": 2.180541515350342
},
{
"combined_loss": 0.6605916023254395,
"completion_length": 463.625,
"epoch": 0.1227735368956743,
"grad_norm": 2.3290114402770996,
"kl": 0.0,
"learning_rate": 9.262308704149701e-07,
"loss": 0.6606,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.6176798939704895,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.201972007751465,
"speech_entropy": 3.0757741928100586,
"speech_kl": 0.0,
"step": 386,
"text_entropy": 1.367950439453125,
"text_kl": 0.0,
"total_entropy": 2.818263530731201
},
{
"combined_loss": 0.6665786504745483,
"completion_length": 479.8125,
"epoch": 0.12309160305343511,
"grad_norm": 3.4283993244171143,
"kl": 0.0,
"learning_rate": 9.258386797274658e-07,
"loss": 0.6666,
"num_samples": 1.0,
"reward": 2.8125,
"reward_std": 1.2235616445541382,
"rewards/gpt4o_holistic_reward": 2.8125,
"rl_loss": -5.587935447692871e-09,
"sft_loss": 2.221928596496582,
"speech_entropy": 3.2466955184936523,
"speech_kl": 0.0,
"step": 387,
"text_entropy": 1.366699457168579,
"text_kl": 0.0,
"total_entropy": 2.97598934173584
},
{
"combined_loss": 0.6852359771728516,
"completion_length": 439.5,
"epoch": 0.12340966921119594,
"grad_norm": 2.255603551864624,
"kl": 0.0,
"learning_rate": 9.254455428665329e-07,
"loss": 0.6852,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.8751000165939331,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2841198444366455,
"speech_entropy": 2.7042312622070312,
"speech_kl": 0.0,
"step": 388,
"text_entropy": 1.335016131401062,
"text_kl": 0.0,
"total_entropy": 2.444276809692383
},
{
"combined_loss": 0.6944676041603088,
"completion_length": 545.0,
"epoch": 0.12372773536895675,
"grad_norm": 3.5577309131622314,
"kl": 0.0,
"learning_rate": 9.250514608218928e-07,
"loss": 0.6945,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.314892053604126,
"speech_entropy": 2.9382314682006836,
"speech_kl": 0.0,
"step": 389,
"text_entropy": 1.2093441486358643,
"text_kl": 0.0,
"total_entropy": 2.6713953018188477
},
{
"combined_loss": 0.6847076416015625,
"completion_length": 318.875,
"epoch": 0.12404580152671756,
"grad_norm": 2.288792610168457,
"kl": 0.0,
"learning_rate": 9.24656434585647e-07,
"loss": 0.6847,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.7042241096496582,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2823588848114014,
"speech_entropy": 2.4487361907958984,
"speech_kl": 0.0,
"step": 390,
"text_entropy": 1.1644865274429321,
"text_kl": 0.0,
"total_entropy": 2.222623348236084
},
{
"combined_loss": 0.6812296509742737,
"completion_length": 496.5,
"epoch": 0.12436386768447837,
"grad_norm": 2.5237197875976562,
"kl": 0.0,
"learning_rate": 9.242604651522735e-07,
"loss": 0.6812,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.2178384065628052,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2707653045654297,
"speech_entropy": 3.798156261444092,
"speech_kl": 0.0,
"step": 391,
"text_entropy": 1.453238606452942,
"text_kl": 0.0,
"total_entropy": 3.4767727851867676
},
{
"combined_loss": 0.6517486572265625,
"completion_length": 350.5,
"epoch": 0.12468193384223919,
"grad_norm": 1.931217908859253,
"kl": 0.0,
"learning_rate": 9.238635535186246e-07,
"loss": 0.6517,
"num_samples": 1.0,
"reward": 1.875,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 1.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1724953651428223,
"speech_entropy": 2.189283847808838,
"speech_kl": 0.0,
"step": 392,
"text_entropy": 1.1253941059112549,
"text_kl": 0.0,
"total_entropy": 1.9858797788619995
},
{
"combined_loss": 0.693282961845398,
"completion_length": 286.8125,
"epoch": 0.125,
"grad_norm": 2.019578218460083,
"kl": 0.0,
"learning_rate": 9.234657006839249e-07,
"loss": 0.6933,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.5000999569892883,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.310943126678467,
"speech_entropy": 2.226673126220703,
"speech_kl": 0.0,
"step": 393,
"text_entropy": 1.3821954727172852,
"text_kl": 0.0,
"total_entropy": 2.0710020065307617
},
{
"combined_loss": 0.6477099657058716,
"completion_length": 431.5625,
"epoch": 0.1253180661577608,
"grad_norm": 3.454335927963257,
"kl": 0.0,
"learning_rate": 9.230669076497687e-07,
"loss": 0.6477,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.501086711883545,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1590328216552734,
"speech_entropy": 5.795164108276367,
"speech_kl": 0.0,
"step": 394,
"text_entropy": 1.2117631435394287,
"text_kl": 0.0,
"total_entropy": 5.437085151672363
},
{
"combined_loss": 0.6723122596740723,
"completion_length": 557.5625,
"epoch": 0.12563613231552162,
"grad_norm": 1.878871202468872,
"kl": 0.0,
"learning_rate": 9.226671754201167e-07,
"loss": 0.6723,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.2410407066345215,
"speech_entropy": 3.8088769912719727,
"speech_kl": 0.0,
"step": 395,
"text_entropy": 1.0774390697479248,
"text_kl": 0.0,
"total_entropy": 3.4700815677642822
},
{
"combined_loss": 0.7662006616592407,
"completion_length": 397.5625,
"epoch": 0.12595419847328243,
"grad_norm": 2.458489418029785,
"kl": 0.0,
"learning_rate": 9.222665050012947e-07,
"loss": 0.7662,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.7654881477355957,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.554001808166504,
"speech_entropy": 2.176966667175293,
"speech_kl": 0.0,
"step": 396,
"text_entropy": 1.7679321765899658,
"text_kl": 0.0,
"total_entropy": 2.1059505939483643
},
{
"combined_loss": 0.5795345902442932,
"completion_length": 308.3125,
"epoch": 0.12627226463104327,
"grad_norm": 2.792732000350952,
"kl": 0.0,
"learning_rate": 9.218648974019896e-07,
"loss": 0.5795,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.614456832408905,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 1.9317818880081177,
"speech_entropy": 2.6999130249023438,
"speech_kl": 0.0,
"step": 397,
"text_entropy": 1.0977978706359863,
"text_kl": 0.0,
"total_entropy": 2.432361602783203
},
{
"combined_loss": 0.8004469275474548,
"completion_length": 436.625,
"epoch": 0.12659033078880408,
"grad_norm": 4.002890586853027,
"kl": 0.0,
"learning_rate": 9.214623536332482e-07,
"loss": 0.8004,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 0.9788135290145874,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.668156385421753,
"speech_entropy": 5.438493728637695,
"speech_kl": 0.0,
"step": 398,
"text_entropy": 1.7521681785583496,
"text_kl": 0.0,
"total_entropy": 5.220246315002441
},
{
"combined_loss": 0.7312483787536621,
"completion_length": 681.8125,
"epoch": 0.1269083969465649,
"grad_norm": 2.7238285541534424,
"kl": 0.0,
"learning_rate": 9.21058874708474e-07,
"loss": 0.7312,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.9463939070701599,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.4374947547912598,
"speech_entropy": 3.460540294647217,
"speech_kl": 0.0,
"step": 399,
"text_entropy": 1.679933786392212,
"text_kl": 0.0,
"total_entropy": 3.2443912029266357
},
{
"combined_loss": 0.7706436514854431,
"completion_length": 449.875,
"epoch": 0.1272264631043257,
"grad_norm": 1.9544907808303833,
"kl": 0.0,
"learning_rate": 9.206544616434248e-07,
"loss": 0.7706,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 1.0308762788772583,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.568812131881714,
"speech_entropy": 2.1668813228607178,
"speech_kl": 0.0,
"step": 400,
"text_entropy": 1.5073027610778809,
"text_kl": 0.0,
"total_entropy": 2.0442421436309814
},
{
"combined_loss": 0.6995494365692139,
"completion_length": 447.75,
"epoch": 0.1275445292620865,
"grad_norm": 1.9926646947860718,
"kl": 0.0,
"learning_rate": 9.202491154562097e-07,
"loss": 0.6995,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.331831455230713,
"speech_entropy": 3.2041335105895996,
"speech_kl": 0.0,
"step": 401,
"text_entropy": 1.542539119720459,
"text_kl": 0.0,
"total_entropy": 3.0047965049743652
},
{
"combined_loss": 0.674731433391571,
"completion_length": 414.75,
"epoch": 0.12786259541984732,
"grad_norm": 1.9648370742797852,
"kl": 0.0,
"learning_rate": 9.198428371672874e-07,
"loss": 0.6747,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.6682298183441162,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.2491049766540527,
"speech_entropy": 2.0483105182647705,
"speech_kl": 0.0,
"step": 402,
"text_entropy": 1.0541050434112549,
"text_kl": 0.0,
"total_entropy": 1.8693532943725586
},
{
"combined_loss": 0.7094697952270508,
"completion_length": 490.75,
"epoch": 0.12818066157760813,
"grad_norm": 3.759694814682007,
"kl": 0.0,
"learning_rate": 9.194356277994632e-07,
"loss": 0.7095,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.5713939070701599,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.364899158477783,
"speech_entropy": 2.0986576080322266,
"speech_kl": 0.0,
"step": 403,
"text_entropy": 1.6229678392410278,
"text_kl": 0.0,
"total_entropy": 2.018561363220215
},
{
"combined_loss": 0.6625657081604004,
"completion_length": 677.5,
"epoch": 0.12849872773536897,
"grad_norm": 6.927186012268066,
"kl": 0.0,
"learning_rate": 9.19027488377886e-07,
"loss": 0.6626,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 1.3692017793655396,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.208552122116089,
"speech_entropy": 3.995192527770996,
"speech_kl": 0.0,
"step": 404,
"text_entropy": 1.7138077020645142,
"text_kl": 0.0,
"total_entropy": 3.6808578968048096
},
{
"combined_loss": 0.6429011225700378,
"completion_length": 530.25,
"epoch": 0.12881679389312978,
"grad_norm": 5.672791004180908,
"kl": 0.0,
"learning_rate": 9.186184199300463e-07,
"loss": 0.6429,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 1.3904881477355957,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": 0.0,
"sft_loss": 2.1430037021636963,
"speech_entropy": 4.908637046813965,
"speech_kl": 0.0,
"step": 405,
"text_entropy": 1.521721363067627,
"text_kl": 0.0,
"total_entropy": 4.66417932510376
},
{
"combined_loss": 0.6549615859985352,
"completion_length": 477.375,
"epoch": 0.1291348600508906,
"grad_norm": 2.929534912109375,
"kl": 0.0,
"learning_rate": 9.182084234857735e-07,
"loss": 0.655,
"num_samples": 1.0,
"reward": 2.3125,
"reward_std": 0.8430101871490479,
"rewards/gpt4o_holistic_reward": 2.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.1832053661346436,
"speech_entropy": 2.1303212642669678,
"speech_kl": 0.0,
"step": 406,
"text_entropy": 1.1068902015686035,
"text_kl": 0.0,
"total_entropy": 1.9402127265930176
},
{
"combined_loss": 0.6791283488273621,
"completion_length": 461.6875,
"epoch": 0.1294529262086514,
"grad_norm": 2.371797800064087,
"kl": 0.0,
"learning_rate": 9.17797500077233e-07,
"loss": 0.6791,
"num_samples": 1.0,
"reward": 4.4375,
"reward_std": 0.8538135886192322,
"rewards/gpt4o_holistic_reward": 4.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.263761043548584,
"speech_entropy": 3.0191469192504883,
"speech_kl": 0.0,
"step": 407,
"text_entropy": 2.0163426399230957,
"text_kl": 0.0,
"total_entropy": 2.867436408996582
},
{
"combined_loss": 0.683641791343689,
"completion_length": 407.125,
"epoch": 0.1297709923664122,
"grad_norm": 3.123765468597412,
"kl": 0.0,
"learning_rate": 9.173856507389244e-07,
"loss": 0.6836,
"num_samples": 1.0,
"reward": 3.0,
"reward_std": 1.3274502754211426,
"rewards/gpt4o_holistic_reward": 3.0,
"rl_loss": 0.0,
"sft_loss": 2.278805732727051,
"speech_entropy": 4.666620254516602,
"speech_kl": 0.0,
"step": 408,
"text_entropy": 4.294477462768555,
"text_kl": 0.0,
"total_entropy": 4.622071266174316
},
{
"combined_loss": 0.7313430309295654,
"completion_length": 254.5625,
"epoch": 0.13008905852417302,
"grad_norm": 2.3901264667510986,
"kl": 0.0,
"learning_rate": 9.169728765076774e-07,
"loss": 0.7313,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.437809944152832,
"speech_entropy": 2.1548728942871094,
"speech_kl": 0.0,
"step": 409,
"text_entropy": 1.255692958831787,
"text_kl": 0.0,
"total_entropy": 2.0065855979919434
},
{
"combined_loss": 0.6742551326751709,
"completion_length": 608.0,
"epoch": 0.13040712468193386,
"grad_norm": 2.8765196800231934,
"kl": 0.0,
"learning_rate": 9.165591784226511e-07,
"loss": 0.6743,
"num_samples": 1.0,
"reward": 2.0625,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 2.0625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2475171089172363,
"speech_entropy": 2.9422497749328613,
"speech_kl": 0.0,
"step": 410,
"text_entropy": 2.1728014945983887,
"text_kl": 0.0,
"total_entropy": 2.778034210205078
},
{
"combined_loss": 0.6012893319129944,
"completion_length": 402.5,
"epoch": 0.13072519083969467,
"grad_norm": 4.226046085357666,
"kl": 0.0,
"learning_rate": 9.161445575253295e-07,
"loss": 0.6013,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.9331126809120178,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 0.0,
"sft_loss": 2.0042977333068848,
"speech_entropy": 3.145230293273926,
"speech_kl": 0.0,
"step": 411,
"text_entropy": 2.9247562885284424,
"text_kl": 0.0,
"total_entropy": 3.075840473175049
},
{
"combined_loss": 0.6655627489089966,
"completion_length": 465.5625,
"epoch": 0.13104325699745548,
"grad_norm": 4.012775421142578,
"kl": 0.0,
"learning_rate": 9.157290148595206e-07,
"loss": 0.6656,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.2185425758361816,
"speech_entropy": 2.861677646636963,
"speech_kl": 0.0,
"step": 412,
"text_entropy": 2.6879758834838867,
"text_kl": 0.0,
"total_entropy": 2.890303134918213
},
{
"combined_loss": 0.7086147665977478,
"completion_length": 562.5625,
"epoch": 0.1313613231552163,
"grad_norm": 4.615509510040283,
"kl": 0.0,
"learning_rate": 9.153125514713523e-07,
"loss": 0.7086,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 1.1756925582885742,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.362049102783203,
"speech_entropy": 3.222433090209961,
"speech_kl": 0.0,
"step": 413,
"text_entropy": 2.680953025817871,
"text_kl": 0.0,
"total_entropy": 3.1390838623046875
},
{
"combined_loss": 0.7332242131233215,
"completion_length": 579.3125,
"epoch": 0.1316793893129771,
"grad_norm": 8.893054962158203,
"kl": 0.0,
"learning_rate": 9.148951684092709e-07,
"loss": 0.7332,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 1.2498197555541992,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 0.0,
"sft_loss": 2.4440808296203613,
"speech_entropy": 3.2557754516601562,
"speech_kl": 0.0,
"step": 414,
"text_entropy": 3.8602097034454346,
"text_kl": 0.0,
"total_entropy": 3.7247118949890137
},
{
"combined_loss": 0.6186578273773193,
"completion_length": 346.75,
"epoch": 0.1319974554707379,
"grad_norm": 2.8829197883605957,
"kl": 0.0,
"learning_rate": 9.144768667240375e-07,
"loss": 0.6187,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.4733423590660095,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.062192678451538,
"speech_entropy": 2.5403671264648438,
"speech_kl": 0.0,
"step": 415,
"text_entropy": 2.09316086769104,
"text_kl": 0.0,
"total_entropy": 2.482808828353882
},
{
"combined_loss": 0.6629255414009094,
"completion_length": 498.1875,
"epoch": 0.13231552162849872,
"grad_norm": 1.7470639944076538,
"kl": 0.0,
"learning_rate": 9.140576474687263e-07,
"loss": 0.6629,
"num_samples": 1.0,
"reward": 4.625,
"reward_std": 0.7501000165939331,
"rewards/gpt4o_holistic_reward": 4.625,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.209751605987549,
"speech_entropy": 2.040069103240967,
"speech_kl": 0.0,
"step": 416,
"text_entropy": 1.1820553541183472,
"text_kl": 0.0,
"total_entropy": 1.8843843936920166
},
{
"combined_loss": 0.7431026101112366,
"completion_length": 490.5,
"epoch": 0.13263358778625955,
"grad_norm": 1.9392704963684082,
"kl": 0.0,
"learning_rate": 9.136375116987211e-07,
"loss": 0.7431,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.7394567728042603,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.477008581161499,
"speech_entropy": 2.424851894378662,
"speech_kl": 0.0,
"step": 417,
"text_entropy": 1.5257370471954346,
"text_kl": 0.0,
"total_entropy": 2.2481653690338135
},
{
"combined_loss": 0.6801667809486389,
"completion_length": 366.6875,
"epoch": 0.13295165394402036,
"grad_norm": 2.0551204681396484,
"kl": 0.0,
"learning_rate": 9.132164604717135e-07,
"loss": 0.6802,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2672226428985596,
"speech_entropy": 2.0900187492370605,
"speech_kl": 0.0,
"step": 418,
"text_entropy": 1.2720967531204224,
"text_kl": 0.0,
"total_entropy": 1.9535316228866577
},
{
"combined_loss": 0.7415467500686646,
"completion_length": 447.4375,
"epoch": 0.13326972010178118,
"grad_norm": 3.4998650550842285,
"kl": 0.0,
"learning_rate": 9.127944948476993e-07,
"loss": 0.7415,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 1.3322067260742188,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.471822500228882,
"speech_entropy": 2.6262001991271973,
"speech_kl": 0.0,
"step": 419,
"text_entropy": 2.3739140033721924,
"text_kl": 0.0,
"total_entropy": 2.5941972732543945
},
{
"combined_loss": 0.6638205051422119,
"completion_length": 492.5,
"epoch": 0.13358778625954199,
"grad_norm": 2.7289879322052,
"kl": 0.0,
"learning_rate": 9.123716158889764e-07,
"loss": 0.6638,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.5000999569892883,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.212735176086426,
"speech_entropy": 2.982739210128784,
"speech_kl": 0.0,
"step": 420,
"text_entropy": 2.626387119293213,
"text_kl": 0.0,
"total_entropy": 2.923607110977173
},
{
"combined_loss": 0.801853358745575,
"completion_length": 382.75,
"epoch": 0.1339058524173028,
"grad_norm": 5.198933124542236,
"kl": 0.0,
"learning_rate": 9.11947824660142e-07,
"loss": 0.8019,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.41377514600753784,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.672844409942627,
"speech_entropy": 2.06710147857666,
"speech_kl": 0.0,
"step": 421,
"text_entropy": 1.2483782768249512,
"text_kl": 0.0,
"total_entropy": 1.9093949794769287
},
{
"combined_loss": 0.6525322198867798,
"completion_length": 461.3125,
"epoch": 0.1342239185750636,
"grad_norm": 2.1027040481567383,
"kl": 0.0,
"learning_rate": 9.115231222280901e-07,
"loss": 0.6525,
"num_samples": 1.0,
"reward": 3.0625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.0625,
"rl_loss": 0.0,
"sft_loss": 2.175107479095459,
"speech_entropy": 2.58778715133667,
"speech_kl": 0.0,
"step": 422,
"text_entropy": 1.6190730333328247,
"text_kl": 0.0,
"total_entropy": 2.423537492752075
},
{
"combined_loss": 0.6848458051681519,
"completion_length": 399.25,
"epoch": 0.13454198473282442,
"grad_norm": 3.9089579582214355,
"kl": 0.0,
"learning_rate": 9.110975096620087e-07,
"loss": 0.6848,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 1.2500998973846436,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.2828192710876465,
"speech_entropy": 4.557295799255371,
"speech_kl": 0.0,
"step": 423,
"text_entropy": 4.6551923751831055,
"text_kl": 0.0,
"total_entropy": 4.5974225997924805
},
{
"combined_loss": 0.6781374216079712,
"completion_length": 464.25,
"epoch": 0.13486005089058525,
"grad_norm": 2.1914126873016357,
"kl": 0.0,
"learning_rate": 9.106709880333768e-07,
"loss": 0.6781,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.9031319618225098,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": -1.862645149230957e-08,
"sft_loss": 2.260457992553711,
"speech_entropy": 2.97335147857666,
"speech_kl": 0.0,
"step": 424,
"text_entropy": 3.013314723968506,
"text_kl": 0.0,
"total_entropy": 3.1752607822418213
},
{
"combined_loss": 0.671977698802948,
"completion_length": 338.625,
"epoch": 0.13517811704834606,
"grad_norm": 3.089296579360962,
"kl": 0.0,
"learning_rate": 9.102435584159621e-07,
"loss": 0.672,
"num_samples": 1.0,
"reward": 4.25,
"reward_std": 1.077450156211853,
"rewards/gpt4o_holistic_reward": 4.25,
"rl_loss": 0.0,
"sft_loss": 2.2399253845214844,
"speech_entropy": 3.009519100189209,
"speech_kl": 0.0,
"step": 425,
"text_entropy": 3.133180618286133,
"text_kl": 0.0,
"total_entropy": 3.1568689346313477
},
{
"combined_loss": 0.6367964148521423,
"completion_length": 485.375,
"epoch": 0.13549618320610687,
"grad_norm": 2.3259880542755127,
"kl": 0.0,
"learning_rate": 9.098152218858182e-07,
"loss": 0.6368,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 0.36445680260658264,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.122654676437378,
"speech_entropy": 2.219896078109741,
"speech_kl": 0.0,
"step": 426,
"text_entropy": 2.419771671295166,
"text_kl": 0.0,
"total_entropy": 2.319455862045288
},
{
"combined_loss": 0.7209469079971313,
"completion_length": 481.4375,
"epoch": 0.13581424936386768,
"grad_norm": 3.2517311573028564,
"kl": 0.0,
"learning_rate": 9.093859795212817e-07,
"loss": 0.7209,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.9894567728042603,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 0.0,
"sft_loss": 2.403156280517578,
"speech_entropy": 2.589256525039673,
"speech_kl": 0.0,
"step": 427,
"text_entropy": 3.033482074737549,
"text_kl": 0.0,
"total_entropy": 2.8865761756896973
},
{
"combined_loss": 0.6140339970588684,
"completion_length": 439.125,
"epoch": 0.1361323155216285,
"grad_norm": 2.3876121044158936,
"kl": 0.0,
"learning_rate": 9.089558324029699e-07,
"loss": 0.614,
"num_samples": 1.0,
"reward": 2.75,
"reward_std": 1.2350690364837646,
"rewards/gpt4o_holistic_reward": 2.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.0467801094055176,
"speech_entropy": 2.7281534671783447,
"speech_kl": 0.0,
"step": 428,
"text_entropy": 2.7204747200012207,
"text_kl": 0.0,
"total_entropy": 2.784191846847534
},
{
"combined_loss": 0.6906248331069946,
"completion_length": 602.125,
"epoch": 0.1364503816793893,
"grad_norm": 1.9201291799545288,
"kl": 0.0,
"learning_rate": 9.085247816137775e-07,
"loss": 0.6906,
"num_samples": 1.0,
"reward": 2.6875,
"reward_std": 0.45901402831077576,
"rewards/gpt4o_holistic_reward": 2.6875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.3020825386047363,
"speech_entropy": 2.1891660690307617,
"speech_kl": 0.0,
"step": 429,
"text_entropy": 1.7951006889343262,
"text_kl": 0.0,
"total_entropy": 2.125535488128662
},
{
"combined_loss": 0.8007920980453491,
"completion_length": 364.9375,
"epoch": 0.13676844783715011,
"grad_norm": 3.1121604442596436,
"kl": 0.0,
"learning_rate": 9.080928282388745e-07,
"loss": 0.8008,
"num_samples": 1.0,
"reward": 3.5,
"reward_std": 0.8024665117263794,
"rewards/gpt4o_holistic_reward": 3.5,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.669306755065918,
"speech_entropy": 3.015037775039673,
"speech_kl": 0.0,
"step": 430,
"text_entropy": 3.2195496559143066,
"text_kl": 0.0,
"total_entropy": 3.1852550506591797
},
{
"combined_loss": 0.627656102180481,
"completion_length": 145.8125,
"epoch": 0.13708651399491095,
"grad_norm": 1.940796971321106,
"kl": 0.0,
"learning_rate": 9.076599733657027e-07,
"loss": 0.6277,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 0.0,
"sft_loss": 2.09218692779541,
"speech_entropy": 2.180631637573242,
"speech_kl": 0.0,
"step": 431,
"text_entropy": 0.7923494577407837,
"text_kl": 0.0,
"total_entropy": 1.9149651527404785
},
{
"combined_loss": 0.6692330241203308,
"completion_length": 400.875,
"epoch": 0.13740458015267176,
"grad_norm": 2.384009599685669,
"kl": 0.0,
"learning_rate": 9.072262180839741e-07,
"loss": 0.6692,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.230776786804199,
"speech_entropy": 2.6541309356689453,
"speech_kl": 0.0,
"step": 432,
"text_entropy": 2.617983102798462,
"text_kl": 0.0,
"total_entropy": 2.7352967262268066
},
{
"combined_loss": 0.619999885559082,
"completion_length": 364.625,
"epoch": 0.13772264631043257,
"grad_norm": 1.7166526317596436,
"kl": 0.0,
"learning_rate": 9.06791563485667e-07,
"loss": 0.62,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.0666658878326416,
"speech_entropy": 2.0609240531921387,
"speech_kl": 0.0,
"step": 433,
"text_entropy": 1.1358726024627686,
"text_kl": 0.0,
"total_entropy": 1.8963937759399414
},
{
"combined_loss": 0.6619538068771362,
"completion_length": 455.8125,
"epoch": 0.13804071246819338,
"grad_norm": 2.4818007946014404,
"kl": 0.0,
"learning_rate": 9.063560106650238e-07,
"loss": 0.662,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 1.363730549812317,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -1.862645149230957e-09,
"sft_loss": 2.206512451171875,
"speech_entropy": 3.38385272026062,
"speech_kl": 0.0,
"step": 434,
"text_entropy": 3.1482739448547363,
"text_kl": 0.0,
"total_entropy": 3.4985315799713135
},
{
"combined_loss": 0.6583099365234375,
"completion_length": 388.0,
"epoch": 0.1383587786259542,
"grad_norm": 1.6376653909683228,
"kl": 0.0,
"learning_rate": 9.059195607185481e-07,
"loss": 0.6583,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.194366455078125,
"speech_entropy": 2.094740152359009,
"speech_kl": 0.0,
"step": 435,
"text_entropy": 1.0836068391799927,
"text_kl": 0.0,
"total_entropy": 1.9306923151016235
},
{
"combined_loss": 0.6782636642456055,
"completion_length": 487.8125,
"epoch": 0.138676844783715,
"grad_norm": 2.002826452255249,
"kl": 0.0,
"learning_rate": 9.054822147450022e-07,
"loss": 0.6783,
"num_samples": 1.0,
"reward": 4.375,
"reward_std": 0.8904882073402405,
"rewards/gpt4o_holistic_reward": 4.375,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.260878562927246,
"speech_entropy": 1.9968137741088867,
"speech_kl": 0.0,
"step": 436,
"text_entropy": 1.1932947635650635,
"text_kl": 0.0,
"total_entropy": 1.8503947257995605
},
{
"combined_loss": 0.7755292654037476,
"completion_length": 452.0625,
"epoch": 0.1389949109414758,
"grad_norm": 1.9055498838424683,
"kl": 0.0,
"learning_rate": 9.050439738454042e-07,
"loss": 0.7755,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 1.058112621307373,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.585097551345825,
"speech_entropy": 1.9971908330917358,
"speech_kl": 0.0,
"step": 437,
"text_entropy": 1.1110544204711914,
"text_kl": 0.0,
"total_entropy": 1.8316848278045654
},
{
"combined_loss": 0.6731459498405457,
"completion_length": 323.3125,
"epoch": 0.13931297709923665,
"grad_norm": 2.3526909351348877,
"kl": 0.0,
"learning_rate": 9.046048391230247e-07,
"loss": 0.6731,
"num_samples": 1.0,
"reward": 4.6875,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 4.6875,
"rl_loss": 2.2351741790771484e-08,
"sft_loss": 2.2438197135925293,
"speech_entropy": 2.0683212280273438,
"speech_kl": 0.0,
"step": 438,
"text_entropy": 1.1782288551330566,
"text_kl": 0.0,
"total_entropy": 1.9051158428192139
},
{
"combined_loss": 0.7724200487136841,
"completion_length": 532.0,
"epoch": 0.13963104325699746,
"grad_norm": 1.7427912950515747,
"kl": 0.0,
"learning_rate": 9.041648116833853e-07,
"loss": 0.7724,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.20422415435314178,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 0.0,
"sft_loss": 2.5747334957122803,
"speech_entropy": 2.0349836349487305,
"speech_kl": 0.0,
"step": 439,
"text_entropy": 1.427042841911316,
"text_kl": 0.0,
"total_entropy": 1.928051233291626
},
{
"combined_loss": 0.5910226106643677,
"completion_length": 380.6875,
"epoch": 0.13994910941475827,
"grad_norm": 1.7229056358337402,
"kl": 0.0,
"learning_rate": 9.037238926342543e-07,
"loss": 0.591,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 1.970075249671936,
"speech_entropy": 1.9323487281799316,
"speech_kl": 0.0,
"step": 440,
"text_entropy": 0.8262702226638794,
"text_kl": 0.0,
"total_entropy": 1.7402493953704834
},
{
"combined_loss": 0.6713898777961731,
"completion_length": 546.75,
"epoch": 0.14026717557251908,
"grad_norm": 1.5187904834747314,
"kl": 0.0,
"learning_rate": 9.032820830856449e-07,
"loss": 0.6714,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.2379660606384277,
"speech_entropy": 2.4945101737976074,
"speech_kl": 0.0,
"step": 441,
"text_entropy": 2.5544350147247314,
"text_kl": 0.0,
"total_entropy": 2.6159682273864746
},
{
"combined_loss": 0.6225212812423706,
"completion_length": 336.75,
"epoch": 0.1405852417302799,
"grad_norm": 1.7801567316055298,
"kl": 0.0,
"learning_rate": 9.028393841498121e-07,
"loss": 0.6225,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.075070858001709,
"speech_entropy": 2.058767080307007,
"speech_kl": 0.0,
"step": 442,
"text_entropy": 0.9935916662216187,
"text_kl": 0.0,
"total_entropy": 1.8773467540740967
},
{
"combined_loss": 0.6718326210975647,
"completion_length": 393.75,
"epoch": 0.1409033078880407,
"grad_norm": 1.7748103141784668,
"kl": 0.0,
"learning_rate": 9.023957969412499e-07,
"loss": 0.6718,
"num_samples": 1.0,
"reward": 2.375,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 2.375,
"rl_loss": 0.0,
"sft_loss": 2.2394418716430664,
"speech_entropy": 2.0696630477905273,
"speech_kl": 0.0,
"step": 443,
"text_entropy": 1.2485952377319336,
"text_kl": 0.0,
"total_entropy": 1.9180147647857666
},
{
"combined_loss": 0.6381258964538574,
"completion_length": 492.1875,
"epoch": 0.14122137404580154,
"grad_norm": 3.0544703006744385,
"kl": 0.0,
"learning_rate": 9.019513225766888e-07,
"loss": 0.6381,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 0.8483423590660095,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 0.0,
"sft_loss": 2.1270861625671387,
"speech_entropy": 2.6119399070739746,
"speech_kl": 0.0,
"step": 444,
"text_entropy": 1.2080113887786865,
"text_kl": 0.0,
"total_entropy": 2.211930990219116
},
{
"combined_loss": 0.6671527624130249,
"completion_length": 339.375,
"epoch": 0.14153944020356235,
"grad_norm": 3.0724539756774902,
"kl": 0.0,
"learning_rate": 9.01505962175092e-07,
"loss": 0.6672,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 1.125100016593933,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.2238426208496094,
"speech_entropy": 2.0776891708374023,
"speech_kl": 0.0,
"step": 445,
"text_entropy": 1.0866782665252686,
"text_kl": 0.0,
"total_entropy": 1.9007325172424316
},
{
"combined_loss": 0.6805820465087891,
"completion_length": 565.625,
"epoch": 0.14185750636132316,
"grad_norm": 2.9971096515655518,
"kl": 0.0,
"learning_rate": 9.010597168576542e-07,
"loss": 0.6806,
"num_samples": 1.0,
"reward": 3.125,
"reward_std": 0.14443756639957428,
"rewards/gpt4o_holistic_reward": 3.125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.268606662750244,
"speech_entropy": 3.4692184925079346,
"speech_kl": 0.0,
"step": 446,
"text_entropy": 4.159717082977295,
"text_kl": 0.0,
"total_entropy": 3.8382232189178467
},
{
"combined_loss": 0.6768923401832581,
"completion_length": 378.5,
"epoch": 0.14217557251908397,
"grad_norm": 1.5186785459518433,
"kl": 0.0,
"learning_rate": 9.006125877477975e-07,
"loss": 0.6769,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.256307601928711,
"speech_entropy": 2.0326013565063477,
"speech_kl": 0.0,
"step": 447,
"text_entropy": 1.1156654357910156,
"text_kl": 0.0,
"total_entropy": 1.8572213649749756
},
{
"combined_loss": 0.6424182653427124,
"completion_length": 378.5625,
"epoch": 0.14249363867684478,
"grad_norm": 1.8854279518127441,
"kl": 0.0,
"learning_rate": 9.001645759711687e-07,
"loss": 0.6424,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.6008730530738831,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.1413941383361816,
"speech_entropy": 2.0020644664764404,
"speech_kl": 0.0,
"step": 448,
"text_entropy": 0.8331085443496704,
"text_kl": 0.0,
"total_entropy": 1.8029673099517822
},
{
"combined_loss": 0.6341080665588379,
"completion_length": 501.1875,
"epoch": 0.1428117048346056,
"grad_norm": 1.8343334197998047,
"kl": 0.0,
"learning_rate": 8.997156826556369e-07,
"loss": 0.6341,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.8600690364837646,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": -1.862645149230957e-09,
"sft_loss": 2.1136932373046875,
"speech_entropy": 2.640713691711426,
"speech_kl": 0.0,
"step": 449,
"text_entropy": 2.6889724731445312,
"text_kl": 0.0,
"total_entropy": 2.807018280029297
},
{
"combined_loss": 0.6706026792526245,
"completion_length": 411.875,
"epoch": 0.1431297709923664,
"grad_norm": 2.0064008235931396,
"kl": 0.0,
"learning_rate": 8.992659089312905e-07,
"loss": 0.6706,
"num_samples": 1.0,
"reward": 4.5625,
"reward_std": 0.6251000165939331,
"rewards/gpt4o_holistic_reward": 4.5625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.235342025756836,
"speech_entropy": 2.0443549156188965,
"speech_kl": 0.0,
"step": 450,
"text_entropy": 1.1791572570800781,
"text_kl": 0.0,
"total_entropy": 1.8973615169525146
},
{
"combined_loss": 0.7649298906326294,
"completion_length": 356.4375,
"epoch": 0.14344783715012724,
"grad_norm": 2.9334285259246826,
"kl": 0.0,
"learning_rate": 8.988152559304345e-07,
"loss": 0.7649,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.4256925582885742,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 0.0,
"sft_loss": 2.5497660636901855,
"speech_entropy": 2.2932775020599365,
"speech_kl": 0.0,
"step": 451,
"text_entropy": 1.6037144660949707,
"text_kl": 0.0,
"total_entropy": 2.1606814861297607
},
{
"combined_loss": 0.8326526880264282,
"completion_length": 575.9375,
"epoch": 0.14376590330788805,
"grad_norm": 2.880450963973999,
"kl": 0.0,
"learning_rate": 8.983637247875872e-07,
"loss": 0.8327,
"num_samples": 1.0,
"reward": 4.125,
"reward_std": 1.1683900356292725,
"rewards/gpt4o_holistic_reward": 4.125,
"rl_loss": 5.587935447692871e-09,
"sft_loss": 2.7755088806152344,
"speech_entropy": 2.5213093757629395,
"speech_kl": 0.0,
"step": 452,
"text_entropy": 1.762268304824829,
"text_kl": 0.0,
"total_entropy": 2.381089448928833
},
{
"combined_loss": 0.7429494261741638,
"completion_length": 398.0,
"epoch": 0.14408396946564886,
"grad_norm": 2.2405805587768555,
"kl": 0.0,
"learning_rate": 8.979113166394775e-07,
"loss": 0.7429,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5000999569892883,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.4764981269836426,
"speech_entropy": 3.2521719932556152,
"speech_kl": 0.0,
"step": 453,
"text_entropy": 3.165997266769409,
"text_kl": 0.0,
"total_entropy": 3.449254035949707
},
{
"combined_loss": 0.5944318175315857,
"completion_length": 424.1875,
"epoch": 0.14440203562340967,
"grad_norm": 1.6569174528121948,
"kl": 0.0,
"learning_rate": 8.974580326250424e-07,
"loss": 0.5944,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 1.9814393520355225,
"speech_entropy": 1.9895391464233398,
"speech_kl": 0.0,
"step": 454,
"text_entropy": 0.9808429479598999,
"text_kl": 0.0,
"total_entropy": 1.808826208114624
},
{
"combined_loss": 0.6611145734786987,
"completion_length": 466.625,
"epoch": 0.14472010178117048,
"grad_norm": 1.9934300184249878,
"kl": 0.0,
"learning_rate": 8.970038738854244e-07,
"loss": 0.6611,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.5194375514984131,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 0.0,
"sft_loss": 2.2037153244018555,
"speech_entropy": 2.034078359603882,
"speech_kl": 0.0,
"step": 455,
"text_entropy": 1.189087986946106,
"text_kl": 0.0,
"total_entropy": 1.8778494596481323
},
{
"combined_loss": 0.6767491698265076,
"completion_length": 517.75,
"epoch": 0.1450381679389313,
"grad_norm": 1.939832091331482,
"kl": 0.0,
"learning_rate": 8.965488415639671e-07,
"loss": 0.6767,
"num_samples": 1.0,
"reward": 2.875,
"reward_std": 0.7217878103256226,
"rewards/gpt4o_holistic_reward": 2.875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.2558302879333496,
"speech_entropy": 2.4079430103302,
"speech_kl": 0.0,
"step": 456,
"text_entropy": 2.5751752853393555,
"text_kl": 0.0,
"total_entropy": 2.6069483757019043
},
{
"combined_loss": 0.6979454159736633,
"completion_length": 484.125,
"epoch": 0.1453562340966921,
"grad_norm": 1.6610511541366577,
"kl": 0.0,
"learning_rate": 8.960929368062138e-07,
"loss": 0.6979,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 0.5774502754211426,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 0.0,
"sft_loss": 2.3264846801757812,
"speech_entropy": 2.0884976387023926,
"speech_kl": 0.0,
"step": 457,
"text_entropy": 1.078056812286377,
"text_kl": 0.0,
"total_entropy": 1.8959829807281494
},
{
"combined_loss": 0.6162172555923462,
"completion_length": 581.4375,
"epoch": 0.14567430025445294,
"grad_norm": 1.4069401025772095,
"kl": 0.0,
"learning_rate": 8.956361607599043e-07,
"loss": 0.6162,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.4331127107143402,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 0.0,
"sft_loss": 2.0540573596954346,
"speech_entropy": 1.9977447986602783,
"speech_kl": 0.0,
"step": 458,
"text_entropy": 0.7805925607681274,
"text_kl": 0.0,
"total_entropy": 1.7389767169952393
},
{
"combined_loss": 0.677904486656189,
"completion_length": 249.4375,
"epoch": 0.14599236641221375,
"grad_norm": 2.1168832778930664,
"kl": 0.0,
"learning_rate": 8.951785145749719e-07,
"loss": 0.6779,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.7090140581130981,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.259681463241577,
"speech_entropy": 2.0867042541503906,
"speech_kl": 0.0,
"step": 459,
"text_entropy": 0.5684058666229248,
"text_kl": 0.0,
"total_entropy": 1.8018689155578613
},
{
"combined_loss": 0.6343377828598022,
"completion_length": 347.4375,
"epoch": 0.14631043256997456,
"grad_norm": 2.351423501968384,
"kl": 0.0,
"learning_rate": 8.9471999940354e-07,
"loss": 0.6343,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 1.1404881477355957,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 0.0,
"sft_loss": 2.1144590377807617,
"speech_entropy": 2.272289752960205,
"speech_kl": 0.0,
"step": 460,
"text_entropy": 1.7216747999191284,
"text_kl": 0.0,
"total_entropy": 2.1894326210021973
},
{
"combined_loss": 0.7236204147338867,
"completion_length": 413.1875,
"epoch": 0.14662849872773537,
"grad_norm": 2.023482322692871,
"kl": 0.0,
"learning_rate": 8.942606163999204e-07,
"loss": 0.7236,
"num_samples": 1.0,
"reward": 3.6875,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 3.6875,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.4120678901672363,
"speech_entropy": 2.0738351345062256,
"speech_kl": 0.0,
"step": 461,
"text_entropy": 1.424910306930542,
"text_kl": 0.0,
"total_entropy": 1.9624372720718384
},
{
"combined_loss": 0.6593168377876282,
"completion_length": 440.625,
"epoch": 0.14694656488549618,
"grad_norm": 2.023776054382324,
"kl": 0.0,
"learning_rate": 8.93800366720609e-07,
"loss": 0.6593,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.651972770690918,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.197722911834717,
"speech_entropy": 2.1023144721984863,
"speech_kl": 0.0,
"step": 462,
"text_entropy": 1.0320696830749512,
"text_kl": 0.0,
"total_entropy": 1.9018843173980713
},
{
"combined_loss": 0.6318823099136353,
"completion_length": 373.1875,
"epoch": 0.147264631043257,
"grad_norm": 2.802096128463745,
"kl": 0.0,
"learning_rate": 8.933392515242838e-07,
"loss": 0.6319,
"num_samples": 1.0,
"reward": 3.375,
"reward_std": 0.8731511235237122,
"rewards/gpt4o_holistic_reward": 3.375,
"rl_loss": 0.0,
"sft_loss": 2.106274366378784,
"speech_entropy": 3.7126779556274414,
"speech_kl": 0.0,
"step": 463,
"text_entropy": 3.271230697631836,
"text_kl": 0.0,
"total_entropy": 3.7780895233154297
},
{
"combined_loss": 0.6602550745010376,
"completion_length": 463.25,
"epoch": 0.1475826972010178,
"grad_norm": 2.1378464698791504,
"kl": 0.0,
"learning_rate": 8.928772719718018e-07,
"loss": 0.6603,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.6477051377296448,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.200850248336792,
"speech_entropy": 1.9239245653152466,
"speech_kl": 0.0,
"step": 464,
"text_entropy": 1.3505656719207764,
"text_kl": 0.0,
"total_entropy": 1.8138980865478516
},
{
"combined_loss": 0.6675082445144653,
"completion_length": 347.75,
"epoch": 0.14790076335877864,
"grad_norm": 2.2810842990875244,
"kl": 0.0,
"learning_rate": 8.924144292261962e-07,
"loss": 0.6675,
"num_samples": 1.0,
"reward": 3.1875,
"reward_std": 0.8750999569892883,
"rewards/gpt4o_holistic_reward": 3.1875,
"rl_loss": -2.2351741790771484e-08,
"sft_loss": 2.225027561187744,
"speech_entropy": 2.1317272186279297,
"speech_kl": 0.0,
"step": 465,
"text_entropy": 1.1749463081359863,
"text_kl": 0.0,
"total_entropy": 1.9694660902023315
},
{
"combined_loss": 0.7008163928985596,
"completion_length": 294.5625,
"epoch": 0.14821882951653945,
"grad_norm": 2.6175572872161865,
"kl": 0.0,
"learning_rate": 8.919507244526726e-07,
"loss": 0.7008,
"num_samples": 1.0,
"reward": 4.875,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 4.875,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.336054563522339,
"speech_entropy": 2.284971237182617,
"speech_kl": 0.0,
"step": 466,
"text_entropy": 1.1186554431915283,
"text_kl": 0.0,
"total_entropy": 2.061189889907837
},
{
"combined_loss": 0.7040484547615051,
"completion_length": 343.25,
"epoch": 0.14853689567430026,
"grad_norm": 2.2278120517730713,
"kl": 0.0,
"learning_rate": 8.914861588186076e-07,
"loss": 0.704,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.6038135886192322,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 0.0,
"sft_loss": 2.346827983856201,
"speech_entropy": 2.906318426132202,
"speech_kl": 0.0,
"step": 467,
"text_entropy": 3.0286753177642822,
"text_kl": 0.0,
"total_entropy": 3.109015464782715
},
{
"combined_loss": 0.6465004682540894,
"completion_length": 415.1875,
"epoch": 0.14885496183206107,
"grad_norm": 2.1413097381591797,
"kl": 0.0,
"learning_rate": 8.910207334935446e-07,
"loss": 0.6465,
"num_samples": 1.0,
"reward": 4.0,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 4.0,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.155001640319824,
"speech_entropy": 2.1206655502319336,
"speech_kl": 0.0,
"step": 468,
"text_entropy": 1.0818634033203125,
"text_kl": 0.0,
"total_entropy": 1.9281299114227295
},
{
"combined_loss": 0.7584635615348816,
"completion_length": 545.25,
"epoch": 0.14917302798982188,
"grad_norm": 1.5538190603256226,
"kl": 0.0,
"learning_rate": 8.90554449649191e-07,
"loss": 0.7585,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.528211832046509,
"speech_entropy": 2.115117073059082,
"speech_kl": 0.0,
"step": 469,
"text_entropy": 1.4869662523269653,
"text_kl": 0.0,
"total_entropy": 2.0077667236328125
},
{
"combined_loss": 0.7136243581771851,
"completion_length": 401.3125,
"epoch": 0.1494910941475827,
"grad_norm": 1.8857200145721436,
"kl": 0.0,
"learning_rate": 8.900873084594161e-07,
"loss": 0.7136,
"num_samples": 1.0,
"reward": 4.8125,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 4.8125,
"rl_loss": -3.725290298461914e-09,
"sft_loss": 2.3787477016448975,
"speech_entropy": 2.175069808959961,
"speech_kl": 0.0,
"step": 470,
"text_entropy": 1.4863579273223877,
"text_kl": 0.0,
"total_entropy": 2.0467123985290527
},
{
"combined_loss": 0.7089301347732544,
"completion_length": 378.0625,
"epoch": 0.14980916030534353,
"grad_norm": 2.2557709217071533,
"kl": 0.0,
"learning_rate": 8.896193111002475e-07,
"loss": 0.7089,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.739456832408905,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": 0.0,
"sft_loss": 2.363100528717041,
"speech_entropy": 2.1490824222564697,
"speech_kl": 0.0,
"step": 471,
"text_entropy": 0.9428769946098328,
"text_kl": 0.0,
"total_entropy": 1.9167104959487915
},
{
"combined_loss": 0.6535341143608093,
"completion_length": 417.375,
"epoch": 0.15012722646310434,
"grad_norm": 1.727138876914978,
"kl": 0.0,
"learning_rate": 8.891504587498674e-07,
"loss": 0.6535,
"num_samples": 1.0,
"reward": 3.5625,
"reward_std": 0.2694375813007355,
"rewards/gpt4o_holistic_reward": 3.5625,
"rl_loss": 0.0,
"sft_loss": 2.1784467697143555,
"speech_entropy": 2.090775489807129,
"speech_kl": 0.0,
"step": 472,
"text_entropy": 1.3160064220428467,
"text_kl": 0.0,
"total_entropy": 1.9528473615646362
},
{
"combined_loss": 0.7152138948440552,
"completion_length": 432.625,
"epoch": 0.15044529262086515,
"grad_norm": 1.9705350399017334,
"kl": 0.0,
"learning_rate": 8.886807525886113e-07,
"loss": 0.7152,
"num_samples": 1.0,
"reward": 4.75,
"reward_std": 0.5001000165939331,
"rewards/gpt4o_holistic_reward": 4.75,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.3840465545654297,
"speech_entropy": 2.1602611541748047,
"speech_kl": 0.0,
"step": 473,
"text_entropy": 1.172673225402832,
"text_kl": 0.0,
"total_entropy": 1.9916683435440063
},
{
"combined_loss": 0.6396362781524658,
"completion_length": 311.8125,
"epoch": 0.15076335877862596,
"grad_norm": 1.9259790182113647,
"kl": 0.0,
"learning_rate": 8.882101937989642e-07,
"loss": 0.6396,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.5194376111030579,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.1321208477020264,
"speech_entropy": 2.147437334060669,
"speech_kl": 0.0,
"step": 474,
"text_entropy": 0.9479535818099976,
"text_kl": 0.0,
"total_entropy": 1.929750919342041
},
{
"combined_loss": 0.6557704210281372,
"completion_length": 485.6875,
"epoch": 0.15108142493638677,
"grad_norm": 2.2060537338256836,
"kl": 0.0,
"learning_rate": 8.87738783565557e-07,
"loss": 0.6558,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.933112621307373,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 0.0,
"sft_loss": 2.185901165008545,
"speech_entropy": 2.608290672302246,
"speech_kl": 0.0,
"step": 475,
"text_entropy": 2.9066762924194336,
"text_kl": 0.0,
"total_entropy": 2.873114585876465
},
{
"combined_loss": 0.6064234972000122,
"completion_length": 427.6875,
"epoch": 0.15139949109414758,
"grad_norm": 1.4221584796905518,
"kl": 0.0,
"learning_rate": 8.872665230751643e-07,
"loss": 0.6064,
"num_samples": 1.0,
"reward": 4.8125,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 4.8125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.021411657333374,
"speech_entropy": 1.9495731592178345,
"speech_kl": 0.0,
"step": 476,
"text_entropy": 0.6150962114334106,
"text_kl": 0.0,
"total_entropy": 1.6934847831726074
},
{
"combined_loss": 0.6592838168144226,
"completion_length": 535.5,
"epoch": 0.15171755725190839,
"grad_norm": 1.9688835144042969,
"kl": 0.0,
"learning_rate": 8.867934135167016e-07,
"loss": 0.6593,
"num_samples": 1.0,
"reward": 2.5,
"reward_std": 0.8944375514984131,
"rewards/gpt4o_holistic_reward": 2.5,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.197612762451172,
"speech_entropy": 2.0396499633789062,
"speech_kl": 0.0,
"step": 477,
"text_entropy": 1.2544045448303223,
"text_kl": 0.0,
"total_entropy": 1.8939778804779053
},
{
"combined_loss": 0.7103521823883057,
"completion_length": 476.3125,
"epoch": 0.15203562340966922,
"grad_norm": 1.6043519973754883,
"kl": 0.0,
"learning_rate": 8.863194560812214e-07,
"loss": 0.7104,
"num_samples": 1.0,
"reward": 5.0,
"reward_std": 9.999999747378752e-05,
"rewards/gpt4o_holistic_reward": 5.0,
"rl_loss": 0.0,
"sft_loss": 2.367840528488159,
"speech_entropy": 2.1345765590667725,
"speech_kl": 0.0,
"step": 478,
"text_entropy": 1.4203035831451416,
"text_kl": 0.0,
"total_entropy": 2.0092082023620605
},
{
"combined_loss": 0.6551531553268433,
"completion_length": 424.875,
"epoch": 0.15235368956743003,
"grad_norm": 1.9476300477981567,
"kl": 0.0,
"learning_rate": 8.858446519619112e-07,
"loss": 0.6552,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.9063550233840942,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.1838438510894775,
"speech_entropy": 2.67842435836792,
"speech_kl": 0.0,
"step": 479,
"text_entropy": 2.9070305824279785,
"text_kl": 0.0,
"total_entropy": 2.8740692138671875
},
{
"combined_loss": 0.6260417699813843,
"completion_length": 583.6875,
"epoch": 0.15267175572519084,
"grad_norm": 1.565351128578186,
"kl": 0.0,
"learning_rate": 8.853690023540895e-07,
"loss": 0.626,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -1.4901161193847656e-08,
"sft_loss": 2.086805820465088,
"speech_entropy": 1.9153913259506226,
"speech_kl": 0.0,
"step": 480,
"text_entropy": 0.715573251247406,
"text_kl": 0.0,
"total_entropy": 1.687865972518921
},
{
"combined_loss": 0.6601178050041199,
"completion_length": 385.25,
"epoch": 0.15298982188295165,
"grad_norm": 2.2061314582824707,
"kl": 0.0,
"learning_rate": 8.84892508455204e-07,
"loss": 0.6601,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.1313834190368652,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.200392484664917,
"speech_entropy": 2.007784366607666,
"speech_kl": 0.0,
"step": 481,
"text_entropy": 1.371636986732483,
"text_kl": 0.0,
"total_entropy": 1.8983948230743408
},
{
"combined_loss": 0.6691581010818481,
"completion_length": 463.3125,
"epoch": 0.15330788804071246,
"grad_norm": 1.7773209810256958,
"kl": 0.0,
"learning_rate": 8.844151714648274e-07,
"loss": 0.6692,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 0.853813648223877,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.230526924133301,
"speech_entropy": 2.0789544582366943,
"speech_kl": 0.0,
"step": 482,
"text_entropy": 1.1371136903762817,
"text_kl": 0.0,
"total_entropy": 1.8991715908050537
},
{
"combined_loss": 0.6369400024414062,
"completion_length": 410.6875,
"epoch": 0.15362595419847327,
"grad_norm": 1.8571422100067139,
"kl": 0.0,
"learning_rate": 8.839369925846548e-07,
"loss": 0.6369,
"num_samples": 1.0,
"reward": 2.8125,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_holistic_reward": 2.8125,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1231331825256348,
"speech_entropy": 2.2014362812042236,
"speech_kl": 0.0,
"step": 483,
"text_entropy": 1.6307443380355835,
"text_kl": 0.0,
"total_entropy": 2.1009421348571777
},
{
"combined_loss": 0.7150350213050842,
"completion_length": 738.4375,
"epoch": 0.15394402035623408,
"grad_norm": 1.8518329858779907,
"kl": 0.0,
"learning_rate": 8.834579730185012e-07,
"loss": 0.715,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.6935809850692749,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.3834500312805176,
"speech_entropy": 2.07857608795166,
"speech_kl": 0.0,
"step": 484,
"text_entropy": 1.5495516061782837,
"text_kl": 0.0,
"total_entropy": 1.9772846698760986
},
{
"combined_loss": 0.7554680109024048,
"completion_length": 434.4375,
"epoch": 0.15426208651399492,
"grad_norm": 1.8048903942108154,
"kl": 0.0,
"learning_rate": 8.829781139722978e-07,
"loss": 0.7555,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.5182266235351562,
"speech_entropy": 2.2054073810577393,
"speech_kl": 0.0,
"step": 485,
"text_entropy": 1.5983672142028809,
"text_kl": 0.0,
"total_entropy": 2.0975542068481445
},
{
"combined_loss": 0.7759544253349304,
"completion_length": 575.25,
"epoch": 0.15458015267175573,
"grad_norm": 1.7967489957809448,
"kl": 0.0,
"learning_rate": 8.824974166540889e-07,
"loss": 0.776,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.3944375813007355,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": 1.1175870895385742e-08,
"sft_loss": 2.586514711380005,
"speech_entropy": 2.0900139808654785,
"speech_kl": 0.0,
"step": 486,
"text_entropy": 1.4879392385482788,
"text_kl": 0.0,
"total_entropy": 1.9754605293273926
},
{
"combined_loss": 0.7049446105957031,
"completion_length": 516.3125,
"epoch": 0.15489821882951654,
"grad_norm": 1.6339975595474243,
"kl": 0.0,
"learning_rate": 8.820158822740297e-07,
"loss": 0.7049,
"num_samples": 1.0,
"reward": 3.25,
"reward_std": 0.2501000165939331,
"rewards/gpt4o_holistic_reward": 3.25,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.3498153686523438,
"speech_entropy": 2.0781824588775635,
"speech_kl": 0.0,
"step": 487,
"text_entropy": 1.1251006126403809,
"text_kl": 0.0,
"total_entropy": 1.9013476371765137
},
{
"combined_loss": 0.7730693817138672,
"completion_length": 473.25,
"epoch": 0.15521628498727735,
"grad_norm": 1.7758270502090454,
"kl": 0.0,
"learning_rate": 8.81533512044382e-07,
"loss": 0.7731,
"num_samples": 1.0,
"reward": 3.875,
"reward_std": 1.0387752056121826,
"rewards/gpt4o_holistic_reward": 3.875,
"rl_loss": 0.0,
"sft_loss": 2.5768978595733643,
"speech_entropy": 2.2842307090759277,
"speech_kl": 0.0,
"step": 488,
"text_entropy": 1.3962032794952393,
"text_kl": 0.0,
"total_entropy": 2.119879961013794
},
{
"combined_loss": 0.6924104690551758,
"completion_length": 437.25,
"epoch": 0.15553435114503816,
"grad_norm": 2.08705735206604,
"kl": 0.0,
"learning_rate": 8.810503071795131e-07,
"loss": 0.6924,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.9478486180305481,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 0.0,
"sft_loss": 2.308034896850586,
"speech_entropy": 2.0420119762420654,
"speech_kl": 0.0,
"step": 489,
"text_entropy": 1.393028736114502,
"text_kl": 0.0,
"total_entropy": 1.928663730621338
},
{
"combined_loss": 0.7534229159355164,
"completion_length": 616.9375,
"epoch": 0.15585241730279897,
"grad_norm": 1.6156688928604126,
"kl": 0.0,
"learning_rate": 8.805662688958898e-07,
"loss": 0.7534,
"num_samples": 1.0,
"reward": 4.3125,
"reward_std": 0.6978486180305481,
"rewards/gpt4o_holistic_reward": 4.3125,
"rl_loss": -9.313225746154785e-09,
"sft_loss": 2.5114095211029053,
"speech_entropy": 2.1210994720458984,
"speech_kl": 0.0,
"step": 490,
"text_entropy": 1.2488669157028198,
"text_kl": 0.0,
"total_entropy": 1.964477300643921
},
{
"combined_loss": 0.648138165473938,
"completion_length": 361.8125,
"epoch": 0.15617048346055978,
"grad_norm": 1.8364536762237549,
"kl": 0.0,
"learning_rate": 8.800813984120786e-07,
"loss": 0.6481,
"num_samples": 1.0,
"reward": 4.0625,
"reward_std": 0.3751000165939331,
"rewards/gpt4o_holistic_reward": 4.0625,
"rl_loss": 3.725290298461914e-09,
"sft_loss": 2.1604604721069336,
"speech_entropy": 2.1113858222961426,
"speech_kl": 0.0,
"step": 491,
"text_entropy": 1.0654577016830444,
"text_kl": 0.0,
"total_entropy": 1.9055767059326172
},
{
"combined_loss": 0.7758172750473022,
"completion_length": 603.0625,
"epoch": 0.15648854961832062,
"grad_norm": 1.6238081455230713,
"kl": 0.0,
"learning_rate": 8.795956969487398e-07,
"loss": 0.7758,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.8081127405166626,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 1.862645149230957e-08,
"sft_loss": 2.586057662963867,
"speech_entropy": 2.1472387313842773,
"speech_kl": 0.0,
"step": 492,
"text_entropy": 1.605583667755127,
"text_kl": 0.0,
"total_entropy": 2.047147035598755
},
{
"combined_loss": 0.6960165500640869,
"completion_length": 649.0625,
"epoch": 0.15680661577608143,
"grad_norm": 6.360669136047363,
"kl": 0.0,
"learning_rate": 8.791091657286267e-07,
"loss": 0.696,
"num_samples": 1.0,
"reward": 3.3125,
"reward_std": 1.0713938474655151,
"rewards/gpt4o_holistic_reward": 3.3125,
"rl_loss": 1.862645149230957e-09,
"sft_loss": 2.3200550079345703,
"speech_entropy": 3.068434715270996,
"speech_kl": 0.0,
"step": 493,
"text_entropy": 2.149477005004883,
"text_kl": 0.0,
"total_entropy": 2.953958034515381
},
{
"combined_loss": 0.5907741785049438,
"completion_length": 397.0625,
"epoch": 0.15712468193384224,
"grad_norm": 1.611266016960144,
"kl": 0.0,
"learning_rate": 8.786218059765809e-07,
"loss": 0.5908,
"num_samples": 1.0,
"reward": 3.9375,
"reward_std": 0.6250999569892883,
"rewards/gpt4o_holistic_reward": 3.9375,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 1.9692471027374268,
"speech_entropy": 1.9849560260772705,
"speech_kl": 0.0,
"step": 494,
"text_entropy": 0.6464660167694092,
"text_kl": 0.0,
"total_entropy": 1.7163063287734985
},
{
"combined_loss": 0.6750938892364502,
"completion_length": 525.0,
"epoch": 0.15744274809160305,
"grad_norm": 2.001243829727173,
"kl": 0.0,
"learning_rate": 8.781336189195296e-07,
"loss": 0.6751,
"num_samples": 1.0,
"reward": 3.75,
"reward_std": 1.0048449039459229,
"rewards/gpt4o_holistic_reward": 3.75,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.2503128051757812,
"speech_entropy": 2.9428701400756836,
"speech_kl": 0.0,
"step": 495,
"text_entropy": 3.2061309814453125,
"text_kl": 0.0,
"total_entropy": 3.2631325721740723
},
{
"combined_loss": 0.691491961479187,
"completion_length": 531.9375,
"epoch": 0.15776081424936386,
"grad_norm": 1.7672028541564941,
"kl": 0.0,
"learning_rate": 8.776446057864838e-07,
"loss": 0.6915,
"num_samples": 1.0,
"reward": 3.625,
"reward_std": 0.8274502754211426,
"rewards/gpt4o_holistic_reward": 3.625,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.3049731254577637,
"speech_entropy": 2.1160459518432617,
"speech_kl": 0.0,
"step": 496,
"text_entropy": 1.1704938411712646,
"text_kl": 0.0,
"total_entropy": 1.9302992820739746
},
{
"combined_loss": 0.6495949625968933,
"completion_length": 476.375,
"epoch": 0.15807888040712467,
"grad_norm": 2.018303632736206,
"kl": 0.0,
"learning_rate": 8.77154767808533e-07,
"loss": 0.6496,
"num_samples": 1.0,
"reward": 3.8125,
"reward_std": 0.6637751460075378,
"rewards/gpt4o_holistic_reward": 3.8125,
"rl_loss": -1.1175870895385742e-08,
"sft_loss": 2.165316581726074,
"speech_entropy": 2.1160874366760254,
"speech_kl": 0.0,
"step": 497,
"text_entropy": 1.245253086090088,
"text_kl": 0.0,
"total_entropy": 1.9460238218307495
},
{
"combined_loss": 0.6617956161499023,
"completion_length": 528.8125,
"epoch": 0.15839694656488548,
"grad_norm": 1.7395589351654053,
"kl": 0.0,
"learning_rate": 8.766641062188442e-07,
"loss": 0.6618,
"num_samples": 1.0,
"reward": 4.1875,
"reward_std": 0.1251000016927719,
"rewards/gpt4o_holistic_reward": 4.1875,
"rl_loss": 1.4901161193847656e-08,
"sft_loss": 2.2059853076934814,
"speech_entropy": 2.0821304321289062,
"speech_kl": 0.0,
"step": 498,
"text_entropy": 1.037213921546936,
"text_kl": 0.0,
"total_entropy": 1.8841687440872192
},
{
"combined_loss": 0.6126347184181213,
"completion_length": 466.5,
"epoch": 0.15871501272264632,
"grad_norm": 1.8486003875732422,
"kl": 0.0,
"learning_rate": 8.761726222526569e-07,
"loss": 0.6126,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 1.2024502754211426,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": 7.450580596923828e-09,
"sft_loss": 2.0421156883239746,
"speech_entropy": 2.067373752593994,
"speech_kl": 0.0,
"step": 499,
"text_entropy": 0.8997583389282227,
"text_kl": 0.0,
"total_entropy": 1.8452403545379639
},
{
"combined_loss": 0.7227458953857422,
"completion_length": 520.3125,
"epoch": 0.15903307888040713,
"grad_norm": 1.7850871086120605,
"kl": 0.0,
"learning_rate": 8.756803171472816e-07,
"loss": 0.7227,
"num_samples": 1.0,
"reward": 3.4375,
"reward_std": 0.48945680260658264,
"rewards/gpt4o_holistic_reward": 3.4375,
"rl_loss": -7.450580596923828e-09,
"sft_loss": 2.4091529846191406,
"speech_entropy": 2.1188344955444336,
"speech_kl": 0.0,
"step": 500,
"text_entropy": 1.2516241073608398,
"text_kl": 0.0,
"total_entropy": 1.9562331438064575
}
],
"logging_steps": 1,
"max_steps": 2000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}