VAPO-3B / trainer_state.json
RUIH's picture
Upload folder using huggingface_hub
4b04d5a verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.5568,
"eval_steps": 500,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 220.40625,
"epoch": 0.0032,
"grad_norm": 11.881386756896973,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 2.0222461223602295,
"reward_std": 1.2291262745857239,
"rewards/answer_entity_reward": 0.5891842544078827,
"rewards/answer_wer_reward": 0.36776189506053925,
"rewards/format_reward": 0.46875,
"rewards/think_ocr_reward": 0.596549928188324,
"step": 1
},
{
"completion_length": 183.75,
"epoch": 0.0064,
"grad_norm": 14.301155090332031,
"kl": 0.000579833984375,
"learning_rate": 9.9875e-07,
"loss": 0.0,
"reward": 2.1407116651535034,
"reward_std": 0.9154457449913025,
"rewards/answer_entity_reward": 0.7417342960834503,
"rewards/answer_wer_reward": 0.4293617159128189,
"rewards/format_reward": 0.59375,
"rewards/think_ocr_reward": 0.3758656233549118,
"step": 2
},
{
"completion_length": 185.09375,
"epoch": 0.0096,
"grad_norm": 7.90402889251709,
"kl": 0.0025768280029296875,
"learning_rate": 9.975e-07,
"loss": 0.0,
"reward": 2.4301702976226807,
"reward_std": 1.0761558413505554,
"rewards/answer_entity_reward": 0.7529265582561493,
"rewards/answer_wer_reward": 0.45110173523426056,
"rewards/format_reward": 0.6875,
"rewards/think_ocr_reward": 0.5386419892311096,
"step": 3
},
{
"completion_length": 201.46875,
"epoch": 0.0128,
"grad_norm": 2.4371554851531982,
"kl": 0.0039825439453125,
"learning_rate": 9.9625e-07,
"loss": 0.0,
"reward": 2.4960588216781616,
"reward_std": 1.0011246800422668,
"rewards/answer_entity_reward": 0.6945474743843079,
"rewards/answer_wer_reward": 0.626116082072258,
"rewards/format_reward": 0.65625,
"rewards/think_ocr_reward": 0.519145280122757,
"step": 4
},
{
"completion_length": 223.1875,
"epoch": 0.016,
"grad_norm": 3.092437982559204,
"kl": 0.001644134521484375,
"learning_rate": 9.95e-07,
"loss": 0.0,
"reward": 2.6151310205459595,
"reward_std": 1.0057614743709564,
"rewards/answer_entity_reward": 0.6729370057582855,
"rewards/answer_wer_reward": 0.43601465225219727,
"rewards/format_reward": 0.75,
"rewards/think_ocr_reward": 0.7561794817447662,
"step": 5
},
{
"completion_length": 211.09375,
"epoch": 0.0192,
"grad_norm": 3.8149898052215576,
"kl": 0.00344085693359375,
"learning_rate": 9.9375e-07,
"loss": 0.0,
"reward": 2.601198673248291,
"reward_std": 0.8605955541133881,
"rewards/answer_entity_reward": 0.6944940388202667,
"rewards/answer_wer_reward": 0.5194687843322754,
"rewards/format_reward": 0.71875,
"rewards/think_ocr_reward": 0.6684857904911041,
"step": 6
},
{
"completion_length": 210.8125,
"epoch": 0.0224,
"grad_norm": 2.000467300415039,
"kl": 0.0030364990234375,
"learning_rate": 9.925e-07,
"loss": 0.0,
"reward": 3.1113568544387817,
"reward_std": 0.928675651550293,
"rewards/answer_entity_reward": 0.8195368647575378,
"rewards/answer_wer_reward": 0.7422276139259338,
"rewards/format_reward": 0.75,
"rewards/think_ocr_reward": 0.7995923757553101,
"step": 7
},
{
"completion_length": 240.375,
"epoch": 0.0256,
"grad_norm": 2.2319533824920654,
"kl": 0.0052947998046875,
"learning_rate": 9.912499999999998e-07,
"loss": 0.0001,
"reward": 3.217132568359375,
"reward_std": 0.4984496384859085,
"rewards/answer_entity_reward": 0.7789974808692932,
"rewards/answer_wer_reward": 0.6678729355335236,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8015121817588806,
"step": 8
},
{
"completion_length": 217.1875,
"epoch": 0.0288,
"grad_norm": 2.6002566814422607,
"kl": 0.06464385986328125,
"learning_rate": 9.9e-07,
"loss": 0.0006,
"reward": 3.217494249343872,
"reward_std": 0.5446330606937408,
"rewards/answer_entity_reward": 0.8213226199150085,
"rewards/answer_wer_reward": 0.7331169545650482,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.6943045258522034,
"step": 9
},
{
"completion_length": 196.40625,
"epoch": 0.032,
"grad_norm": 2.9925193786621094,
"kl": 0.008941650390625,
"learning_rate": 9.8875e-07,
"loss": 0.0001,
"reward": 3.2711292505264282,
"reward_std": 0.5466351807117462,
"rewards/answer_entity_reward": 0.7905315160751343,
"rewards/answer_wer_reward": 0.7206964790821075,
"rewards/format_reward": 0.90625,
"rewards/think_ocr_reward": 0.853651225566864,
"step": 10
},
{
"completion_length": 146.53125,
"epoch": 0.0352,
"grad_norm": 3.6174111366271973,
"kl": 0.0103912353515625,
"learning_rate": 9.875e-07,
"loss": 0.0001,
"reward": 3.083841323852539,
"reward_std": 0.6508071422576904,
"rewards/answer_entity_reward": 0.7979910671710968,
"rewards/answer_wer_reward": 0.6100275814533234,
"rewards/format_reward": 0.90625,
"rewards/think_ocr_reward": 0.7695727646350861,
"step": 11
},
{
"completion_length": 218.15625,
"epoch": 0.0384,
"grad_norm": 3.2925424575805664,
"kl": 0.00616455078125,
"learning_rate": 9.862499999999999e-07,
"loss": 0.0001,
"reward": 3.2391178607940674,
"reward_std": 0.6323770582675934,
"rewards/answer_entity_reward": 0.781956285238266,
"rewards/answer_wer_reward": 0.6958223879337311,
"rewards/format_reward": 0.90625,
"rewards/think_ocr_reward": 0.8550890386104584,
"step": 12
},
{
"completion_length": 250.53125,
"epoch": 0.0416,
"grad_norm": 2.291048288345337,
"kl": 0.0086669921875,
"learning_rate": 9.849999999999999e-07,
"loss": 0.0001,
"reward": 3.238759756088257,
"reward_std": 0.4200912415981293,
"rewards/answer_entity_reward": 0.8185493648052216,
"rewards/answer_wer_reward": 0.699150562286377,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.7835597395896912,
"step": 13
},
{
"completion_length": 196.6875,
"epoch": 0.0448,
"grad_norm": 2.470576524734497,
"kl": 0.0181884765625,
"learning_rate": 9.8375e-07,
"loss": 0.0002,
"reward": 3.460441470146179,
"reward_std": 0.34273722767829895,
"rewards/answer_entity_reward": 0.9129322171211243,
"rewards/answer_wer_reward": 0.7192246317863464,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8595346808433533,
"step": 14
},
{
"completion_length": 181.78125,
"epoch": 0.048,
"grad_norm": 13.122944831848145,
"kl": 0.0174560546875,
"learning_rate": 9.825e-07,
"loss": 0.0002,
"reward": 3.526148796081543,
"reward_std": 0.2207299917936325,
"rewards/answer_entity_reward": 0.8908324241638184,
"rewards/answer_wer_reward": 0.8109035789966583,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8556627035140991,
"step": 15
},
{
"completion_length": 181.9375,
"epoch": 0.0512,
"grad_norm": 3.1282718181610107,
"kl": 0.0081329345703125,
"learning_rate": 9.8125e-07,
"loss": 0.0001,
"reward": 3.4612035751342773,
"reward_std": 0.2798766866326332,
"rewards/answer_entity_reward": 0.8926167786121368,
"rewards/answer_wer_reward": 0.6810254156589508,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9188113510608673,
"step": 16
},
{
"completion_length": 243.125,
"epoch": 0.0544,
"grad_norm": 1.907029390335083,
"kl": 0.00677490234375,
"learning_rate": 9.8e-07,
"loss": 0.0001,
"reward": 3.375656485557556,
"reward_std": 0.37908758223056793,
"rewards/answer_entity_reward": 0.8232844769954681,
"rewards/answer_wer_reward": 0.6466233134269714,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9369988143444061,
"step": 17
},
{
"completion_length": 236.34375,
"epoch": 0.0576,
"grad_norm": 2.551098108291626,
"kl": 0.0098876953125,
"learning_rate": 9.7875e-07,
"loss": 0.0001,
"reward": 3.637453317642212,
"reward_std": 0.1572738140821457,
"rewards/answer_entity_reward": 0.8815866112709045,
"rewards/answer_wer_reward": 0.8101728856563568,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9456938207149506,
"step": 18
},
{
"completion_length": 242.28125,
"epoch": 0.0608,
"grad_norm": 3.0685667991638184,
"kl": 0.010223388671875,
"learning_rate": 9.775e-07,
"loss": 0.0001,
"reward": 3.3409019708633423,
"reward_std": 0.3057943657040596,
"rewards/answer_entity_reward": 0.7610115706920624,
"rewards/answer_wer_reward": 0.6856433153152466,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8942470550537109,
"step": 19
},
{
"completion_length": 193.46875,
"epoch": 0.064,
"grad_norm": 2.6569221019744873,
"kl": 0.0095977783203125,
"learning_rate": 9.7625e-07,
"loss": 0.0001,
"reward": 3.5098860263824463,
"reward_std": 0.27671176940202713,
"rewards/answer_entity_reward": 0.8399666249752045,
"rewards/answer_wer_reward": 0.7382143139839172,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9317050278186798,
"step": 20
},
{
"completion_length": 199.28125,
"epoch": 0.0672,
"grad_norm": 3.02462100982666,
"kl": 0.0101318359375,
"learning_rate": 9.75e-07,
"loss": 0.0001,
"reward": 3.552868962287903,
"reward_std": 0.24761613458395004,
"rewards/answer_entity_reward": 0.9026052951812744,
"rewards/answer_wer_reward": 0.7746964991092682,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8755670785903931,
"step": 21
},
{
"completion_length": 239.75,
"epoch": 0.0704,
"grad_norm": 5.65736722946167,
"kl": 0.010223388671875,
"learning_rate": 9.7375e-07,
"loss": 0.0001,
"reward": 3.3219141960144043,
"reward_std": 0.32601839303970337,
"rewards/answer_entity_reward": 0.8810833096504211,
"rewards/answer_wer_reward": 0.6434947550296783,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.7973361015319824,
"step": 22
},
{
"completion_length": 216.21875,
"epoch": 0.0736,
"grad_norm": 6.68402099609375,
"kl": 0.009765625,
"learning_rate": 9.725e-07,
"loss": 0.0001,
"reward": 3.67569899559021,
"reward_std": 0.19380945712327957,
"rewards/answer_entity_reward": 0.9180394113063812,
"rewards/answer_wer_reward": 0.8205302953720093,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9371293187141418,
"step": 23
},
{
"completion_length": 200.65625,
"epoch": 0.0768,
"grad_norm": 3.398916006088257,
"kl": 0.0118408203125,
"learning_rate": 9.712499999999998e-07,
"loss": 0.0001,
"reward": 3.575831174850464,
"reward_std": 0.22907962650060654,
"rewards/answer_entity_reward": 0.9015873074531555,
"rewards/answer_wer_reward": 0.8195928931236267,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8546508848667145,
"step": 24
},
{
"completion_length": 144.9375,
"epoch": 0.08,
"grad_norm": 3.852799415588379,
"kl": 0.025146484375,
"learning_rate": 9.7e-07,
"loss": 0.0003,
"reward": 3.596950054168701,
"reward_std": 0.29281121492385864,
"rewards/answer_entity_reward": 0.9606508314609528,
"rewards/answer_wer_reward": 0.7530401945114136,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9457589387893677,
"step": 25
},
{
"completion_length": 201.375,
"epoch": 0.0832,
"grad_norm": 3.684136390686035,
"kl": 0.03955078125,
"learning_rate": 9.6875e-07,
"loss": 0.0004,
"reward": 3.6101993322372437,
"reward_std": 0.22506854683160782,
"rewards/answer_entity_reward": 0.8913510143756866,
"rewards/answer_wer_reward": 0.855983167886734,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8628652393817902,
"step": 26
},
{
"completion_length": 235.25,
"epoch": 0.0864,
"grad_norm": 2.9537627696990967,
"kl": 0.0134124755859375,
"learning_rate": 9.675e-07,
"loss": 0.0001,
"reward": 3.579669713973999,
"reward_std": 0.17270359210669994,
"rewards/answer_entity_reward": 0.8651459515094757,
"rewards/answer_wer_reward": 0.7930598855018616,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9214637279510498,
"step": 27
},
{
"completion_length": 199.96875,
"epoch": 0.0896,
"grad_norm": 2.0981569290161133,
"kl": 0.02239990234375,
"learning_rate": 9.6625e-07,
"loss": 0.0002,
"reward": 3.589198589324951,
"reward_std": 0.2977752536535263,
"rewards/answer_entity_reward": 0.8878033757209778,
"rewards/answer_wer_reward": 0.8114102184772491,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.889985203742981,
"step": 28
},
{
"completion_length": 229.6875,
"epoch": 0.0928,
"grad_norm": 2.406397819519043,
"kl": 0.0191650390625,
"learning_rate": 9.649999999999999e-07,
"loss": 0.0002,
"reward": 3.4348872900009155,
"reward_std": 0.37296992540359497,
"rewards/answer_entity_reward": 0.7681002914905548,
"rewards/answer_wer_reward": 0.724025309085846,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9740117192268372,
"step": 29
},
{
"completion_length": 199.59375,
"epoch": 0.096,
"grad_norm": 4.711977481842041,
"kl": 0.01727294921875,
"learning_rate": 9.637499999999999e-07,
"loss": 0.0002,
"reward": 3.7957680225372314,
"reward_std": 0.10022839158773422,
"rewards/answer_entity_reward": 0.9259244203567505,
"rewards/answer_wer_reward": 0.8810202181339264,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9888232052326202,
"step": 30
},
{
"completion_length": 227.71875,
"epoch": 0.0992,
"grad_norm": 8.605613708496094,
"kl": 0.016021728515625,
"learning_rate": 9.624999999999999e-07,
"loss": 0.0002,
"reward": 3.6433751583099365,
"reward_std": 0.19832589477300644,
"rewards/answer_entity_reward": 0.8819950520992279,
"rewards/answer_wer_reward": 0.832177460193634,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9292027056217194,
"step": 31
},
{
"completion_length": 215.65625,
"epoch": 0.1024,
"grad_norm": 3.5583388805389404,
"kl": 0.0224609375,
"learning_rate": 9.6125e-07,
"loss": 0.0002,
"reward": 3.516916036605835,
"reward_std": 0.29861560463905334,
"rewards/answer_entity_reward": 0.8093456923961639,
"rewards/answer_wer_reward": 0.7672389149665833,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9403314590454102,
"step": 32
},
{
"completion_length": 255.8125,
"epoch": 0.1056,
"grad_norm": 3.647063970565796,
"kl": 0.009185791015625,
"learning_rate": 9.6e-07,
"loss": 0.0001,
"reward": 3.5868738889694214,
"reward_std": 0.2677561491727829,
"rewards/answer_entity_reward": 0.818858414888382,
"rewards/answer_wer_reward": 0.7967112958431244,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9713042676448822,
"step": 33
},
{
"completion_length": 223.25,
"epoch": 0.1088,
"grad_norm": 4.442183017730713,
"kl": 0.02569580078125,
"learning_rate": 9.5875e-07,
"loss": 0.0003,
"reward": 3.6685177087783813,
"reward_std": 0.16033701971173286,
"rewards/answer_entity_reward": 0.8931982815265656,
"rewards/answer_wer_reward": 0.8027337491512299,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.972585529088974,
"step": 34
},
{
"completion_length": 225.09375,
"epoch": 0.112,
"grad_norm": 1.850151538848877,
"kl": 0.0135498046875,
"learning_rate": 9.575e-07,
"loss": 0.0001,
"reward": 3.622478485107422,
"reward_std": 0.15638228505849838,
"rewards/answer_entity_reward": 0.8341188132762909,
"rewards/answer_wer_reward": 0.8296426832675934,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9587167799472809,
"step": 35
},
{
"completion_length": 182.75,
"epoch": 0.1152,
"grad_norm": 3.844250202178955,
"kl": 0.100982666015625,
"learning_rate": 9.5625e-07,
"loss": 0.001,
"reward": 3.575288772583008,
"reward_std": 0.3447410613298416,
"rewards/answer_entity_reward": 0.8734935224056244,
"rewards/answer_wer_reward": 0.8351728320121765,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8978723287582397,
"step": 36
},
{
"completion_length": 170.71875,
"epoch": 0.1184,
"grad_norm": 3.608771800994873,
"kl": 0.0318603515625,
"learning_rate": 9.55e-07,
"loss": 0.0003,
"reward": 3.757541060447693,
"reward_std": 0.16554252058267593,
"rewards/answer_entity_reward": 0.9673819839954376,
"rewards/answer_wer_reward": 0.8668203055858612,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9233386218547821,
"step": 37
},
{
"completion_length": 252.0,
"epoch": 0.1216,
"grad_norm": 2.063748836517334,
"kl": 0.01507568359375,
"learning_rate": 9.5375e-07,
"loss": 0.0001,
"reward": 3.716595768928528,
"reward_std": 0.10926416516304016,
"rewards/answer_entity_reward": 0.8833416402339935,
"rewards/answer_wer_reward": 0.8585604727268219,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9746935665607452,
"step": 38
},
{
"completion_length": 231.21875,
"epoch": 0.1248,
"grad_norm": 2.751699447631836,
"kl": 0.0213623046875,
"learning_rate": 9.525e-07,
"loss": 0.0002,
"reward": 3.539994239807129,
"reward_std": 0.1212783083319664,
"rewards/answer_entity_reward": 0.7954491972923279,
"rewards/answer_wer_reward": 0.7638055980205536,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.980739563703537,
"step": 39
},
{
"completion_length": 216.09375,
"epoch": 0.128,
"grad_norm": 2.074568033218384,
"kl": 0.0379638671875,
"learning_rate": 9.5125e-07,
"loss": 0.0004,
"reward": 3.6039533615112305,
"reward_std": 0.26473698019981384,
"rewards/answer_entity_reward": 0.8746186196804047,
"rewards/answer_wer_reward": 0.8307992517948151,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9297854900360107,
"step": 40
},
{
"completion_length": 203.5,
"epoch": 0.1312,
"grad_norm": 3.2622625827789307,
"kl": 0.05419921875,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0005,
"reward": 3.4951852560043335,
"reward_std": 0.18541007116436958,
"rewards/answer_entity_reward": 0.8705199360847473,
"rewards/answer_wer_reward": 0.8321611285209656,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.7925041615962982,
"step": 41
},
{
"completion_length": 195.84375,
"epoch": 0.1344,
"grad_norm": 2.3474910259246826,
"kl": 0.0272216796875,
"learning_rate": 9.487499999999999e-07,
"loss": 0.0003,
"reward": 3.556153178215027,
"reward_std": 0.22145777754485607,
"rewards/answer_entity_reward": 0.9313356876373291,
"rewards/answer_wer_reward": 0.7051927447319031,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9508746266365051,
"step": 42
},
{
"completion_length": 213.25,
"epoch": 0.1376,
"grad_norm": 2.805851697921753,
"kl": 0.039794921875,
"learning_rate": 9.474999999999999e-07,
"loss": 0.0004,
"reward": 3.4438276290893555,
"reward_std": 0.306783527135849,
"rewards/answer_entity_reward": 0.9020311534404755,
"rewards/answer_wer_reward": 0.7658404111862183,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.838456004858017,
"step": 43
},
{
"completion_length": 237.6875,
"epoch": 0.1408,
"grad_norm": 1.9424443244934082,
"kl": 0.04632568359375,
"learning_rate": 9.462499999999999e-07,
"loss": 0.0005,
"reward": 3.6309977769851685,
"reward_std": 0.2500930577516556,
"rewards/answer_entity_reward": 0.8781489729881287,
"rewards/answer_wer_reward": 0.8788634538650513,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9364852905273438,
"step": 44
},
{
"completion_length": 239.375,
"epoch": 0.144,
"grad_norm": 46.16355895996094,
"kl": 0.0579833984375,
"learning_rate": 9.45e-07,
"loss": 0.0006,
"reward": 3.5368932485580444,
"reward_std": 0.43694401532411575,
"rewards/answer_entity_reward": 0.919220894575119,
"rewards/answer_wer_reward": 0.8205748200416565,
"rewards/format_reward": 0.84375,
"rewards/think_ocr_reward": 0.9533475041389465,
"step": 45
},
{
"completion_length": 173.84375,
"epoch": 0.1472,
"grad_norm": 3.7639763355255127,
"kl": 0.0450439453125,
"learning_rate": 9.4375e-07,
"loss": 0.0004,
"reward": 3.7322875261306763,
"reward_std": 0.1945570409297943,
"rewards/answer_entity_reward": 0.9228407144546509,
"rewards/answer_wer_reward": 0.8905497789382935,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9813971817493439,
"step": 46
},
{
"completion_length": 147.09375,
"epoch": 0.1504,
"grad_norm": 4.257631301879883,
"kl": 0.0538330078125,
"learning_rate": 9.425e-07,
"loss": 0.0005,
"reward": 3.478027820587158,
"reward_std": 0.2542489320039749,
"rewards/answer_entity_reward": 0.8890827894210815,
"rewards/answer_wer_reward": 0.7596322894096375,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8605626821517944,
"step": 47
},
{
"completion_length": 223.9375,
"epoch": 0.1536,
"grad_norm": 1.5165725946426392,
"kl": 0.0335693359375,
"learning_rate": 9.4125e-07,
"loss": 0.0003,
"reward": 3.695801019668579,
"reward_std": 0.21276018023490906,
"rewards/answer_entity_reward": 0.9133437275886536,
"rewards/answer_wer_reward": 0.8821894526481628,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9315177500247955,
"step": 48
},
{
"completion_length": 196.15625,
"epoch": 0.1568,
"grad_norm": 2.7737293243408203,
"kl": 0.04931640625,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0005,
"reward": 3.7317415475845337,
"reward_std": 0.11913972720503807,
"rewards/answer_entity_reward": 0.9534916281700134,
"rewards/answer_wer_reward": 0.8561010956764221,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9533988535404205,
"step": 49
},
{
"completion_length": 192.21875,
"epoch": 0.16,
"grad_norm": 3.4223740100860596,
"kl": 0.04052734375,
"learning_rate": 9.387499999999999e-07,
"loss": 0.0004,
"reward": 3.65939998626709,
"reward_std": 0.1464347057044506,
"rewards/answer_entity_reward": 0.9478480219841003,
"rewards/answer_wer_reward": 0.8836140036582947,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8279379308223724,
"step": 50
},
{
"completion_length": 170.75,
"epoch": 0.1632,
"grad_norm": 3.389747381210327,
"kl": 0.0406494140625,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0004,
"reward": 3.6742804050445557,
"reward_std": 0.21486516296863556,
"rewards/answer_entity_reward": 0.9492871761322021,
"rewards/answer_wer_reward": 0.8503031730651855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8746900260448456,
"step": 51
},
{
"completion_length": 249.78125,
"epoch": 0.1664,
"grad_norm": 1.3398560285568237,
"kl": 0.0609130859375,
"learning_rate": 9.3625e-07,
"loss": 0.0006,
"reward": 3.7340474128723145,
"reward_std": 0.16536326706409454,
"rewards/answer_entity_reward": 0.9178049564361572,
"rewards/answer_wer_reward": 0.8599284589290619,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9563138782978058,
"step": 52
},
{
"completion_length": 243.53125,
"epoch": 0.1696,
"grad_norm": 2.292407512664795,
"kl": 0.035400390625,
"learning_rate": 9.35e-07,
"loss": 0.0004,
"reward": 3.6057465076446533,
"reward_std": 0.1650264859199524,
"rewards/answer_entity_reward": 0.942800760269165,
"rewards/answer_wer_reward": 0.743953675031662,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9189921319484711,
"step": 53
},
{
"completion_length": 224.4375,
"epoch": 0.1728,
"grad_norm": 25.665359497070312,
"kl": 0.03118896484375,
"learning_rate": 9.3375e-07,
"loss": 0.0003,
"reward": 3.6430656909942627,
"reward_std": 0.14360623061656952,
"rewards/answer_entity_reward": 0.907882422208786,
"rewards/answer_wer_reward": 0.7998041808605194,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9353790581226349,
"step": 54
},
{
"completion_length": 173.0625,
"epoch": 0.176,
"grad_norm": 4.687534809112549,
"kl": 0.03643798828125,
"learning_rate": 9.325e-07,
"loss": 0.0004,
"reward": 3.776802897453308,
"reward_std": 0.10255010426044464,
"rewards/answer_entity_reward": 0.9577985405921936,
"rewards/answer_wer_reward": 0.8955680429935455,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9234363734722137,
"step": 55
},
{
"completion_length": 241.84375,
"epoch": 0.1792,
"grad_norm": 2.1417253017425537,
"kl": 0.02978515625,
"learning_rate": 9.3125e-07,
"loss": 0.0003,
"reward": 3.7508766651153564,
"reward_std": 0.12244473025202751,
"rewards/answer_entity_reward": 0.9196350574493408,
"rewards/answer_wer_reward": 0.8353821933269501,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.995859295129776,
"step": 56
},
{
"completion_length": 214.15625,
"epoch": 0.1824,
"grad_norm": 2.977281332015991,
"kl": 0.03302001953125,
"learning_rate": 9.3e-07,
"loss": 0.0003,
"reward": 3.77036452293396,
"reward_std": 0.18844036478549242,
"rewards/answer_entity_reward": 0.9284944236278534,
"rewards/answer_wer_reward": 0.8541653454303741,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9877048432826996,
"step": 57
},
{
"completion_length": 245.9375,
"epoch": 0.1856,
"grad_norm": 1.5624388456344604,
"kl": 0.0296630859375,
"learning_rate": 9.287499999999999e-07,
"loss": 0.0003,
"reward": 3.7977479696273804,
"reward_std": 0.08727182075381279,
"rewards/answer_entity_reward": 0.9509085714817047,
"rewards/answer_wer_reward": 0.8592260181903839,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9876134395599365,
"step": 58
},
{
"completion_length": 232.65625,
"epoch": 0.1888,
"grad_norm": 55.87119674682617,
"kl": 0.047607421875,
"learning_rate": 9.274999999999999e-07,
"loss": 0.0005,
"reward": 3.6933377981185913,
"reward_std": 0.24168139696121216,
"rewards/answer_entity_reward": 0.9402236640453339,
"rewards/answer_wer_reward": 0.8164783418178558,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9366357624530792,
"step": 59
},
{
"completion_length": 221.65625,
"epoch": 0.192,
"grad_norm": 1.8363709449768066,
"kl": 0.04156494140625,
"learning_rate": 9.2625e-07,
"loss": 0.0004,
"reward": 3.8290294408798218,
"reward_std": 0.08228548988699913,
"rewards/answer_entity_reward": 0.9317659735679626,
"rewards/answer_wer_reward": 0.9017607867717743,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.995502769947052,
"step": 60
},
{
"completion_length": 207.5625,
"epoch": 0.1952,
"grad_norm": 5.360762119293213,
"kl": 0.03662109375,
"learning_rate": 9.25e-07,
"loss": 0.0004,
"reward": 3.4508965015411377,
"reward_std": 0.24354729056358337,
"rewards/answer_entity_reward": 0.8888726234436035,
"rewards/answer_wer_reward": 0.6527576148509979,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9092662334442139,
"step": 61
},
{
"completion_length": 175.3125,
"epoch": 0.1984,
"grad_norm": 6.900688171386719,
"kl": 0.0562744140625,
"learning_rate": 9.237499999999999e-07,
"loss": 0.0006,
"reward": 3.5809485912323,
"reward_std": 0.27670779824256897,
"rewards/answer_entity_reward": 0.875405490398407,
"rewards/answer_wer_reward": 0.846805214881897,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8587377667427063,
"step": 62
},
{
"completion_length": 167.21875,
"epoch": 0.2016,
"grad_norm": 3.296032667160034,
"kl": 0.03668212890625,
"learning_rate": 9.225e-07,
"loss": 0.0004,
"reward": 3.775553345680237,
"reward_std": 0.1621587909758091,
"rewards/answer_entity_reward": 0.9595959782600403,
"rewards/answer_wer_reward": 0.900894969701767,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9150623679161072,
"step": 63
},
{
"completion_length": 216.0625,
"epoch": 0.2048,
"grad_norm": 3.287728786468506,
"kl": 0.05419921875,
"learning_rate": 9.2125e-07,
"loss": 0.0005,
"reward": 3.580909013748169,
"reward_std": 0.37151331454515457,
"rewards/answer_entity_reward": 0.9558238685131073,
"rewards/answer_wer_reward": 0.816798210144043,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8395369946956635,
"step": 64
},
{
"completion_length": 242.1875,
"epoch": 0.208,
"grad_norm": 4.7966766357421875,
"kl": 0.0389404296875,
"learning_rate": 9.2e-07,
"loss": 0.0004,
"reward": 3.5479079484939575,
"reward_std": 0.34015993028879166,
"rewards/answer_entity_reward": 0.9070779979228973,
"rewards/answer_wer_reward": 0.7606107890605927,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9114691615104675,
"step": 65
},
{
"completion_length": 182.9375,
"epoch": 0.2112,
"grad_norm": 4.85190486907959,
"kl": 0.0411376953125,
"learning_rate": 9.187499999999999e-07,
"loss": 0.0004,
"reward": 3.759209156036377,
"reward_std": 0.030521959997713566,
"rewards/answer_entity_reward": 0.9572916924953461,
"rewards/answer_wer_reward": 0.9216786324977875,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.880238950252533,
"step": 66
},
{
"completion_length": 197.46875,
"epoch": 0.2144,
"grad_norm": 2.888380765914917,
"kl": 0.03271484375,
"learning_rate": 9.174999999999999e-07,
"loss": 0.0003,
"reward": 3.86090886592865,
"reward_std": 0.08941158838570118,
"rewards/answer_entity_reward": 0.974116176366806,
"rewards/answer_wer_reward": 0.9031813442707062,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9836114048957825,
"step": 67
},
{
"completion_length": 212.03125,
"epoch": 0.2176,
"grad_norm": 0.9500738382339478,
"kl": 0.03057861328125,
"learning_rate": 9.1625e-07,
"loss": 0.0003,
"reward": 3.865835189819336,
"reward_std": 0.04183580353856087,
"rewards/answer_entity_reward": 0.9732177555561066,
"rewards/answer_wer_reward": 0.8951224386692047,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974949955940247,
"step": 68
},
{
"completion_length": 168.09375,
"epoch": 0.2208,
"grad_norm": 4.705175876617432,
"kl": 0.03704833984375,
"learning_rate": 9.15e-07,
"loss": 0.0004,
"reward": 3.6963913440704346,
"reward_std": 0.16030436754226685,
"rewards/answer_entity_reward": 0.932018518447876,
"rewards/answer_wer_reward": 0.8480667769908905,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9163061678409576,
"step": 69
},
{
"completion_length": 193.21875,
"epoch": 0.224,
"grad_norm": 2.125580310821533,
"kl": 0.03515625,
"learning_rate": 9.137499999999999e-07,
"loss": 0.0004,
"reward": 3.8550466299057007,
"reward_std": 0.06468157470226288,
"rewards/answer_entity_reward": 0.9734202921390533,
"rewards/answer_wer_reward": 0.8898061215877533,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9918202459812164,
"step": 70
},
{
"completion_length": 235.78125,
"epoch": 0.2272,
"grad_norm": 6.89145040512085,
"kl": 0.042236328125,
"learning_rate": 9.124999999999999e-07,
"loss": 0.0004,
"reward": 3.725824475288391,
"reward_std": 0.05315144546329975,
"rewards/answer_entity_reward": 0.9593958258628845,
"rewards/answer_wer_reward": 0.8827618062496185,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8836667835712433,
"step": 71
},
{
"completion_length": 210.1875,
"epoch": 0.2304,
"grad_norm": 3.6971681118011475,
"kl": 0.0343017578125,
"learning_rate": 9.1125e-07,
"loss": 0.0003,
"reward": 3.719637870788574,
"reward_std": 0.10697400569915771,
"rewards/answer_entity_reward": 0.9880050718784332,
"rewards/answer_wer_reward": 0.7961998879909515,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9354328513145447,
"step": 72
},
{
"completion_length": 216.5625,
"epoch": 0.2336,
"grad_norm": 17.082843780517578,
"kl": 0.0537109375,
"learning_rate": 9.1e-07,
"loss": 0.0005,
"reward": 3.6063274145126343,
"reward_std": 0.2845265045762062,
"rewards/answer_entity_reward": 0.9374077320098877,
"rewards/answer_wer_reward": 0.7878484427928925,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.912321150302887,
"step": 73
},
{
"completion_length": 234.90625,
"epoch": 0.2368,
"grad_norm": 1.9695632457733154,
"kl": 0.031982421875,
"learning_rate": 9.087499999999999e-07,
"loss": 0.0003,
"reward": 3.762009024620056,
"reward_std": 0.06560477986931801,
"rewards/answer_entity_reward": 0.9398341476917267,
"rewards/answer_wer_reward": 0.8473882973194122,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9747865796089172,
"step": 74
},
{
"completion_length": 233.65625,
"epoch": 0.24,
"grad_norm": 1.8333961963653564,
"kl": 0.0479736328125,
"learning_rate": 9.074999999999999e-07,
"loss": 0.0005,
"reward": 3.6872040033340454,
"reward_std": 0.12730678915977478,
"rewards/answer_entity_reward": 0.9421398341655731,
"rewards/answer_wer_reward": 0.8499290347099304,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.895135223865509,
"step": 75
},
{
"completion_length": 138.8125,
"epoch": 0.2432,
"grad_norm": 2.518507719039917,
"kl": 0.0504150390625,
"learning_rate": 9.0625e-07,
"loss": 0.0005,
"reward": 3.751777410507202,
"reward_std": 0.18188580125570297,
"rewards/answer_entity_reward": 0.9270697832107544,
"rewards/answer_wer_reward": 0.9237564206123352,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9009511768817902,
"step": 76
},
{
"completion_length": 261.0,
"epoch": 0.2464,
"grad_norm": 4.395165920257568,
"kl": 0.03662109375,
"learning_rate": 9.05e-07,
"loss": 0.0004,
"reward": 3.602410674095154,
"reward_std": 0.12657387554645538,
"rewards/answer_entity_reward": 0.8546798527240753,
"rewards/answer_wer_reward": 0.794090747833252,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9536401331424713,
"step": 77
},
{
"completion_length": 221.90625,
"epoch": 0.2496,
"grad_norm": 1.2728471755981445,
"kl": 0.0294189453125,
"learning_rate": 9.0375e-07,
"loss": 0.0003,
"reward": 3.788708806037903,
"reward_std": 0.09669506549835205,
"rewards/answer_entity_reward": 0.9447909295558929,
"rewards/answer_wer_reward": 0.8481404483318329,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9957774579524994,
"step": 78
},
{
"completion_length": 254.5625,
"epoch": 0.2528,
"grad_norm": 9.725419998168945,
"kl": 0.07373046875,
"learning_rate": 9.024999999999999e-07,
"loss": 0.0007,
"reward": 3.668743133544922,
"reward_std": 0.1221558079123497,
"rewards/answer_entity_reward": 0.9435009658336639,
"rewards/answer_wer_reward": 0.8254426419734955,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8997994065284729,
"step": 79
},
{
"completion_length": 196.375,
"epoch": 0.256,
"grad_norm": 2.1853079795837402,
"kl": 0.0361328125,
"learning_rate": 9.0125e-07,
"loss": 0.0004,
"reward": 3.6546449661254883,
"reward_std": 0.1971728727221489,
"rewards/answer_entity_reward": 0.9472028017044067,
"rewards/answer_wer_reward": 0.8720800876617432,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8666120767593384,
"step": 80
},
{
"completion_length": 248.25,
"epoch": 0.2592,
"grad_norm": 3.1572227478027344,
"kl": 0.03375244140625,
"learning_rate": 9e-07,
"loss": 0.0003,
"reward": 3.7423981428146362,
"reward_std": 0.10061750188469887,
"rewards/answer_entity_reward": 0.9398939311504364,
"rewards/answer_wer_reward": 0.8211633265018463,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9813408553600311,
"step": 81
},
{
"completion_length": 238.40625,
"epoch": 0.2624,
"grad_norm": 1.5329415798187256,
"kl": 0.03125,
"learning_rate": 8.9875e-07,
"loss": 0.0003,
"reward": 3.874926447868347,
"reward_std": 0.03685523197054863,
"rewards/answer_entity_reward": 0.9718094170093536,
"rewards/answer_wer_reward": 0.9062533378601074,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968636631965637,
"step": 82
},
{
"completion_length": 222.5,
"epoch": 0.2656,
"grad_norm": 2.012899875640869,
"kl": 0.0419921875,
"learning_rate": 8.974999999999999e-07,
"loss": 0.0004,
"reward": 3.8047072887420654,
"reward_std": 0.046287354081869125,
"rewards/answer_entity_reward": 0.9534181356430054,
"rewards/answer_wer_reward": 0.8727244138717651,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9785646796226501,
"step": 83
},
{
"completion_length": 225.21875,
"epoch": 0.2688,
"grad_norm": 1.5400514602661133,
"kl": 0.0380859375,
"learning_rate": 8.9625e-07,
"loss": 0.0004,
"reward": 3.718083620071411,
"reward_std": 0.1703677996993065,
"rewards/answer_entity_reward": 0.9013731181621552,
"rewards/answer_wer_reward": 0.8260438740253448,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9906666278839111,
"step": 84
},
{
"completion_length": 236.125,
"epoch": 0.272,
"grad_norm": 1.6224849224090576,
"kl": 0.0550537109375,
"learning_rate": 8.95e-07,
"loss": 0.0005,
"reward": 3.8032166957855225,
"reward_std": 0.0796846654266119,
"rewards/answer_entity_reward": 0.9553452134132385,
"rewards/answer_wer_reward": 0.8544089794158936,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9934625327587128,
"step": 85
},
{
"completion_length": 214.34375,
"epoch": 0.2752,
"grad_norm": 3.1244239807128906,
"kl": 0.032470703125,
"learning_rate": 8.9375e-07,
"loss": 0.0003,
"reward": 3.803860068321228,
"reward_std": 0.06684968620538712,
"rewards/answer_entity_reward": 0.9671759307384491,
"rewards/answer_wer_reward": 0.9067878127098083,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9298965036869049,
"step": 86
},
{
"completion_length": 216.9375,
"epoch": 0.2784,
"grad_norm": 1.8527048826217651,
"kl": 0.02996826171875,
"learning_rate": 8.924999999999999e-07,
"loss": 0.0003,
"reward": 3.813448429107666,
"reward_std": 0.05041965842247009,
"rewards/answer_entity_reward": 0.9224496483802795,
"rewards/answer_wer_reward": 0.8932149708271027,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977837204933167,
"step": 87
},
{
"completion_length": 211.8125,
"epoch": 0.2816,
"grad_norm": 2.733228921890259,
"kl": 0.05126953125,
"learning_rate": 8.912499999999999e-07,
"loss": 0.0005,
"reward": 3.8481240272521973,
"reward_std": 0.0621240958571434,
"rewards/answer_entity_reward": 0.9627074301242828,
"rewards/answer_wer_reward": 0.9041622579097748,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9812542796134949,
"step": 88
},
{
"completion_length": 202.5,
"epoch": 0.2848,
"grad_norm": 4.8413496017456055,
"kl": 0.0433349609375,
"learning_rate": 8.9e-07,
"loss": 0.0004,
"reward": 3.668493866920471,
"reward_std": 0.08999799937009811,
"rewards/answer_entity_reward": 0.96169114112854,
"rewards/answer_wer_reward": 0.7791127562522888,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.92768993973732,
"step": 89
},
{
"completion_length": 214.6875,
"epoch": 0.288,
"grad_norm": 4.111961841583252,
"kl": 0.04638671875,
"learning_rate": 8.8875e-07,
"loss": 0.0005,
"reward": 3.7720965147018433,
"reward_std": 0.18014637380838394,
"rewards/answer_entity_reward": 0.9866696000099182,
"rewards/answer_wer_reward": 0.8934727013111115,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9232043027877808,
"step": 90
},
{
"completion_length": 241.5625,
"epoch": 0.2912,
"grad_norm": 1.4061272144317627,
"kl": 0.0460205078125,
"learning_rate": 8.874999999999999e-07,
"loss": 0.0005,
"reward": 3.828965663909912,
"reward_std": 0.04340291768312454,
"rewards/answer_entity_reward": 0.9683369398117065,
"rewards/answer_wer_reward": 0.8706588447093964,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9899699091911316,
"step": 91
},
{
"completion_length": 247.9375,
"epoch": 0.2944,
"grad_norm": 1.6669530868530273,
"kl": 0.0611572265625,
"learning_rate": 8.8625e-07,
"loss": 0.0006,
"reward": 3.7649370431900024,
"reward_std": 0.1087912805378437,
"rewards/answer_entity_reward": 0.9332223832607269,
"rewards/answer_wer_reward": 0.8357318043708801,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9959828853607178,
"step": 92
},
{
"completion_length": 162.625,
"epoch": 0.2976,
"grad_norm": 5.615991115570068,
"kl": 0.058837890625,
"learning_rate": 8.85e-07,
"loss": 0.0006,
"reward": 3.8870660066604614,
"reward_std": 0.09454158693552017,
"rewards/answer_entity_reward": 0.9939196705818176,
"rewards/answer_wer_reward": 0.9443124830722809,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9488338530063629,
"step": 93
},
{
"completion_length": 256.625,
"epoch": 0.3008,
"grad_norm": 2.879868984222412,
"kl": 0.2406005859375,
"learning_rate": 8.8375e-07,
"loss": 0.0024,
"reward": 3.6465322971343994,
"reward_std": 0.23435086756944656,
"rewards/answer_entity_reward": 0.9395784735679626,
"rewards/answer_wer_reward": 0.7715516090393066,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9666522741317749,
"step": 94
},
{
"completion_length": 254.15625,
"epoch": 0.304,
"grad_norm": 6.1645121574401855,
"kl": 0.262939453125,
"learning_rate": 8.824999999999999e-07,
"loss": 0.0026,
"reward": 3.728961229324341,
"reward_std": 0.11308889091014862,
"rewards/answer_entity_reward": 0.9466511011123657,
"rewards/answer_wer_reward": 0.8248744010925293,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9574357271194458,
"step": 95
},
{
"completion_length": 202.3125,
"epoch": 0.3072,
"grad_norm": 2.2792811393737793,
"kl": 0.0501708984375,
"learning_rate": 8.812499999999999e-07,
"loss": 0.0005,
"reward": 3.856202244758606,
"reward_std": 0.05682223103940487,
"rewards/answer_entity_reward": 0.9909722208976746,
"rewards/answer_wer_reward": 0.9035914540290833,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9616385698318481,
"step": 96
},
{
"completion_length": 222.53125,
"epoch": 0.3104,
"grad_norm": 2.4435033798217773,
"kl": 0.051513671875,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0005,
"reward": 3.8195481300354004,
"reward_std": 0.08100517094135284,
"rewards/answer_entity_reward": 0.979785680770874,
"rewards/answer_wer_reward": 0.8738153576850891,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.965947151184082,
"step": 97
},
{
"completion_length": 202.375,
"epoch": 0.3136,
"grad_norm": 1.7632919549942017,
"kl": 0.0357666015625,
"learning_rate": 8.7875e-07,
"loss": 0.0004,
"reward": 3.7597837448120117,
"reward_std": 0.061054665595293045,
"rewards/answer_entity_reward": 0.9468090534210205,
"rewards/answer_wer_reward": 0.8723107874393463,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9406639635562897,
"step": 98
},
{
"completion_length": 207.53125,
"epoch": 0.3168,
"grad_norm": 7.402034282684326,
"kl": 0.04736328125,
"learning_rate": 8.774999999999999e-07,
"loss": 0.0005,
"reward": 3.7576065063476562,
"reward_std": 0.04146904498338699,
"rewards/answer_entity_reward": 0.9389799237251282,
"rewards/answer_wer_reward": 0.8201505243778229,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984759986400604,
"step": 99
},
{
"completion_length": 210.15625,
"epoch": 0.32,
"grad_norm": 1.5828880071640015,
"kl": 0.0450439453125,
"learning_rate": 8.7625e-07,
"loss": 0.0004,
"reward": 3.835609197616577,
"reward_std": 0.12980258837342262,
"rewards/answer_entity_reward": 0.9350627064704895,
"rewards/answer_wer_reward": 0.9053294062614441,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9952171444892883,
"step": 100
},
{
"completion_length": 214.34375,
"epoch": 0.3232,
"grad_norm": 5.768563270568848,
"kl": 0.055908203125,
"learning_rate": 8.75e-07,
"loss": 0.0006,
"reward": 3.611391305923462,
"reward_std": 0.2522353269159794,
"rewards/answer_entity_reward": 0.9709455966949463,
"rewards/answer_wer_reward": 0.7870493829250336,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8846463263034821,
"step": 101
},
{
"completion_length": 223.0,
"epoch": 0.3264,
"grad_norm": 7.32905387878418,
"kl": 0.0755615234375,
"learning_rate": 8.7375e-07,
"loss": 0.0008,
"reward": 3.7200475931167603,
"reward_std": 0.18947013467550278,
"rewards/answer_entity_reward": 0.9668727219104767,
"rewards/answer_wer_reward": 0.8209056556224823,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9635193049907684,
"step": 102
},
{
"completion_length": 229.96875,
"epoch": 0.3296,
"grad_norm": 0.9038276672363281,
"kl": 0.0411376953125,
"learning_rate": 8.725e-07,
"loss": 0.0004,
"reward": 3.862263560295105,
"reward_std": 0.03179450985044241,
"rewards/answer_entity_reward": 0.9754428863525391,
"rewards/answer_wer_reward": 0.8931463062763214,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9936743974685669,
"step": 103
},
{
"completion_length": 265.3125,
"epoch": 0.3328,
"grad_norm": 1.3424818515777588,
"kl": 0.0357666015625,
"learning_rate": 8.712499999999999e-07,
"loss": 0.0004,
"reward": 3.7375279664993286,
"reward_std": 0.07805093377828598,
"rewards/answer_entity_reward": 0.9291824698448181,
"rewards/answer_wer_reward": 0.8219992816448212,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9863462746143341,
"step": 104
},
{
"completion_length": 252.78125,
"epoch": 0.336,
"grad_norm": 1.2035622596740723,
"kl": 0.033203125,
"learning_rate": 8.699999999999999e-07,
"loss": 0.0003,
"reward": 3.8339978456497192,
"reward_std": 0.05473129637539387,
"rewards/answer_entity_reward": 0.9795939922332764,
"rewards/answer_wer_reward": 0.8663320243358612,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9880718290805817,
"step": 105
},
{
"completion_length": 162.96875,
"epoch": 0.3392,
"grad_norm": 7.1932783126831055,
"kl": 0.06005859375,
"learning_rate": 8.687499999999999e-07,
"loss": 0.0006,
"reward": 3.8799617290496826,
"reward_std": 0.0983762014657259,
"rewards/answer_entity_reward": 0.9810606241226196,
"rewards/answer_wer_reward": 0.9314018487930298,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9674993753433228,
"step": 106
},
{
"completion_length": 225.625,
"epoch": 0.3424,
"grad_norm": 5.7709455490112305,
"kl": 0.05291748046875,
"learning_rate": 8.675000000000001e-07,
"loss": 0.0005,
"reward": 3.7411450147628784,
"reward_std": 0.2322532683610916,
"rewards/answer_entity_reward": 0.9225597083568573,
"rewards/answer_wer_reward": 0.8556761145591736,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9941591918468475,
"step": 107
},
{
"completion_length": 194.78125,
"epoch": 0.3456,
"grad_norm": 5.741571426391602,
"kl": 0.0556640625,
"learning_rate": 8.6625e-07,
"loss": 0.0006,
"reward": 3.867335319519043,
"reward_std": 0.03972470294684172,
"rewards/answer_entity_reward": 0.9573142230510712,
"rewards/answer_wer_reward": 0.9156463444232941,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9943746328353882,
"step": 108
},
{
"completion_length": 228.5,
"epoch": 0.3488,
"grad_norm": 6.1572089195251465,
"kl": 0.05859375,
"learning_rate": 8.65e-07,
"loss": 0.0006,
"reward": 3.673606753349304,
"reward_std": 0.08745867013931274,
"rewards/answer_entity_reward": 0.9391757845878601,
"rewards/answer_wer_reward": 0.8806695938110352,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8537613451480865,
"step": 109
},
{
"completion_length": 216.5,
"epoch": 0.352,
"grad_norm": 2.032820701599121,
"kl": 0.0518798828125,
"learning_rate": 8.6375e-07,
"loss": 0.0005,
"reward": 3.6381773948669434,
"reward_std": 0.11543435975909233,
"rewards/answer_entity_reward": 0.9743416607379913,
"rewards/answer_wer_reward": 0.7632936537265778,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9005421102046967,
"step": 110
},
{
"completion_length": 220.96875,
"epoch": 0.3552,
"grad_norm": 4.737320423126221,
"kl": 0.0751953125,
"learning_rate": 8.625e-07,
"loss": 0.0008,
"reward": 3.823172926902771,
"reward_std": 0.04683285113424063,
"rewards/answer_entity_reward": 0.9827152192592621,
"rewards/answer_wer_reward": 0.8539322018623352,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9865254759788513,
"step": 111
},
{
"completion_length": 248.75,
"epoch": 0.3584,
"grad_norm": 3.7395012378692627,
"kl": 0.0484619140625,
"learning_rate": 8.612499999999999e-07,
"loss": 0.0005,
"reward": 3.835617423057556,
"reward_std": 0.039440929889678955,
"rewards/answer_entity_reward": 0.9718195497989655,
"rewards/answer_wer_reward": 0.8654404282569885,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983574748039246,
"step": 112
},
{
"completion_length": 218.75,
"epoch": 0.3616,
"grad_norm": 3.5470447540283203,
"kl": 0.10302734375,
"learning_rate": 8.599999999999999e-07,
"loss": 0.001,
"reward": 3.766317844390869,
"reward_std": 0.0799998790025711,
"rewards/answer_entity_reward": 0.9724812507629395,
"rewards/answer_wer_reward": 0.8530462384223938,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9407903254032135,
"step": 113
},
{
"completion_length": 231.4375,
"epoch": 0.3648,
"grad_norm": 4.614479064941406,
"kl": 0.060546875,
"learning_rate": 8.587499999999999e-07,
"loss": 0.0006,
"reward": 3.828564405441284,
"reward_std": 0.030111415311694145,
"rewards/answer_entity_reward": 0.9710638523101807,
"rewards/answer_wer_reward": 0.866461992263794,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.991038590669632,
"step": 114
},
{
"completion_length": 237.875,
"epoch": 0.368,
"grad_norm": 1.1590646505355835,
"kl": 0.046142578125,
"learning_rate": 8.575e-07,
"loss": 0.0005,
"reward": 3.870112419128418,
"reward_std": 0.051995884627103806,
"rewards/answer_entity_reward": 0.9711016416549683,
"rewards/answer_wer_reward": 0.9016274213790894,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973834156990051,
"step": 115
},
{
"completion_length": 234.59375,
"epoch": 0.3712,
"grad_norm": 1.4525243043899536,
"kl": 0.113525390625,
"learning_rate": 8.5625e-07,
"loss": 0.0011,
"reward": 3.755509376525879,
"reward_std": 0.10925759375095367,
"rewards/answer_entity_reward": 0.9556345045566559,
"rewards/answer_wer_reward": 0.8394978046417236,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9603771269321442,
"step": 116
},
{
"completion_length": 220.4375,
"epoch": 0.3744,
"grad_norm": 1.6397019624710083,
"kl": 0.15234375,
"learning_rate": 8.55e-07,
"loss": 0.0015,
"reward": 3.829906702041626,
"reward_std": 0.03734264615923166,
"rewards/answer_entity_reward": 0.9839539229869843,
"rewards/answer_wer_reward": 0.8527026474475861,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9932501018047333,
"step": 117
},
{
"completion_length": 209.53125,
"epoch": 0.3776,
"grad_norm": 3.598604440689087,
"kl": 0.07861328125,
"learning_rate": 8.5375e-07,
"loss": 0.0008,
"reward": 3.7239131927490234,
"reward_std": 0.07304626516997814,
"rewards/answer_entity_reward": 0.9540751278400421,
"rewards/answer_wer_reward": 0.8128292262554169,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9570088684558868,
"step": 118
},
{
"completion_length": 207.71875,
"epoch": 0.3808,
"grad_norm": 1.9592057466506958,
"kl": 0.0435791015625,
"learning_rate": 8.525e-07,
"loss": 0.0004,
"reward": 3.8095905780792236,
"reward_std": 0.15753451362252235,
"rewards/answer_entity_reward": 0.9857954680919647,
"rewards/answer_wer_reward": 0.9040109515190125,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9510340690612793,
"step": 119
},
{
"completion_length": 245.90625,
"epoch": 0.384,
"grad_norm": 1.7574220895767212,
"kl": 0.0609130859375,
"learning_rate": 8.512499999999999e-07,
"loss": 0.0006,
"reward": 3.854837656021118,
"reward_std": 0.0384799987077713,
"rewards/answer_entity_reward": 0.9729723632335663,
"rewards/answer_wer_reward": 0.8832501769065857,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998615026473999,
"step": 120
},
{
"completion_length": 187.15625,
"epoch": 0.3872,
"grad_norm": 8.7343168258667,
"kl": 0.0494384765625,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0005,
"reward": 3.7950538396835327,
"reward_std": 0.09329042956233025,
"rewards/answer_entity_reward": 0.9599206745624542,
"rewards/answer_wer_reward": 0.9000534117221832,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.93507981300354,
"step": 121
},
{
"completion_length": 203.84375,
"epoch": 0.3904,
"grad_norm": 3.7000162601470947,
"kl": 0.062744140625,
"learning_rate": 8.487499999999999e-07,
"loss": 0.0006,
"reward": 3.8687225580215454,
"reward_std": 0.03621992561966181,
"rewards/answer_entity_reward": 0.9873106181621552,
"rewards/answer_wer_reward": 0.8855177164077759,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9958942830562592,
"step": 122
},
{
"completion_length": 205.84375,
"epoch": 0.3936,
"grad_norm": 9.27507209777832,
"kl": 0.0570068359375,
"learning_rate": 8.475e-07,
"loss": 0.0006,
"reward": 3.7104525566101074,
"reward_std": 0.05549425818026066,
"rewards/answer_entity_reward": 0.955735981464386,
"rewards/answer_wer_reward": 0.8933148980140686,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.86140176653862,
"step": 123
},
{
"completion_length": 246.46875,
"epoch": 0.3968,
"grad_norm": 2.3181021213531494,
"kl": 0.0404052734375,
"learning_rate": 8.462499999999999e-07,
"loss": 0.0004,
"reward": 3.821496605873108,
"reward_std": 0.09581143222749233,
"rewards/answer_entity_reward": 0.9655607342720032,
"rewards/answer_wer_reward": 0.8666167855262756,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9893191456794739,
"step": 124
},
{
"completion_length": 206.3125,
"epoch": 0.4,
"grad_norm": 1.5352882146835327,
"kl": 0.055419921875,
"learning_rate": 8.45e-07,
"loss": 0.0006,
"reward": 3.831603527069092,
"reward_std": 0.08168897591531277,
"rewards/answer_entity_reward": 0.9702457189559937,
"rewards/answer_wer_reward": 0.9070821702480316,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9542755484580994,
"step": 125
},
{
"completion_length": 240.28125,
"epoch": 0.4032,
"grad_norm": 1.380315899848938,
"kl": 0.05908203125,
"learning_rate": 8.4375e-07,
"loss": 0.0006,
"reward": 3.7971588373184204,
"reward_std": 0.10537005960941315,
"rewards/answer_entity_reward": 0.9396995604038239,
"rewards/answer_wer_reward": 0.8588653802871704,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985939264297485,
"step": 126
},
{
"completion_length": 206.84375,
"epoch": 0.4064,
"grad_norm": 1.5937124490737915,
"kl": 0.056884765625,
"learning_rate": 8.425e-07,
"loss": 0.0006,
"reward": 3.8375606536865234,
"reward_std": 0.047878991812467575,
"rewards/answer_entity_reward": 0.9553684592247009,
"rewards/answer_wer_reward": 0.8867217302322388,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9954704642295837,
"step": 127
},
{
"completion_length": 214.1875,
"epoch": 0.4096,
"grad_norm": 1.3648440837860107,
"kl": 0.0687255859375,
"learning_rate": 8.4125e-07,
"loss": 0.0007,
"reward": 3.8555803298950195,
"reward_std": 0.05176056548953056,
"rewards/answer_entity_reward": 0.9823863804340363,
"rewards/answer_wer_reward": 0.8972643911838531,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9759295582771301,
"step": 128
},
{
"completion_length": 215.90625,
"epoch": 0.4128,
"grad_norm": 1.4308183193206787,
"kl": 0.0390625,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0004,
"reward": 3.8976725339889526,
"reward_std": 0.016966319642961025,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9021182060241699,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997209906578064,
"step": 129
},
{
"completion_length": 189.1875,
"epoch": 0.416,
"grad_norm": 7.785026550292969,
"kl": 0.0506591796875,
"learning_rate": 8.387499999999999e-07,
"loss": 0.0005,
"reward": 3.7563494443893433,
"reward_std": 0.12806903570890427,
"rewards/answer_entity_reward": 0.9905131459236145,
"rewards/answer_wer_reward": 0.8918424248695374,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8739938735961914,
"step": 130
},
{
"completion_length": 211.21875,
"epoch": 0.4192,
"grad_norm": 6.029291152954102,
"kl": 0.0860595703125,
"learning_rate": 8.375e-07,
"loss": 0.0009,
"reward": 3.7876737117767334,
"reward_std": 0.07924951426684856,
"rewards/answer_entity_reward": 0.9788058996200562,
"rewards/answer_wer_reward": 0.903822124004364,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9050455689430237,
"step": 131
},
{
"completion_length": 197.9375,
"epoch": 0.4224,
"grad_norm": 1.5226598978042603,
"kl": 0.0865478515625,
"learning_rate": 8.3625e-07,
"loss": 0.0009,
"reward": 3.8618096113204956,
"reward_std": 0.024674754589796066,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.9131532609462738,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9549693167209625,
"step": 132
},
{
"completion_length": 178.28125,
"epoch": 0.4256,
"grad_norm": 4.81843376159668,
"kl": 0.1806640625,
"learning_rate": 8.349999999999999e-07,
"loss": 0.0018,
"reward": 3.8692500591278076,
"reward_std": 0.0898860078305006,
"rewards/answer_entity_reward": 0.9539262652397156,
"rewards/answer_wer_reward": 0.9164533317089081,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988704919815063,
"step": 133
},
{
"completion_length": 214.3125,
"epoch": 0.4288,
"grad_norm": 3.702409267425537,
"kl": 0.100341796875,
"learning_rate": 8.3375e-07,
"loss": 0.001,
"reward": 3.7666897773742676,
"reward_std": 0.05854834243655205,
"rewards/answer_entity_reward": 0.9739753007888794,
"rewards/answer_wer_reward": 0.8456098437309265,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9471046626567841,
"step": 134
},
{
"completion_length": 230.3125,
"epoch": 0.432,
"grad_norm": 4.869428634643555,
"kl": 0.109619140625,
"learning_rate": 8.325e-07,
"loss": 0.0011,
"reward": 3.837371587753296,
"reward_std": 0.07383839413523674,
"rewards/answer_entity_reward": 0.9623282849788666,
"rewards/answer_wer_reward": 0.8914425075054169,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9836008548736572,
"step": 135
},
{
"completion_length": 222.84375,
"epoch": 0.4352,
"grad_norm": 1.1195542812347412,
"kl": 0.0875244140625,
"learning_rate": 8.3125e-07,
"loss": 0.0009,
"reward": 3.800593137741089,
"reward_std": 0.05516563355922699,
"rewards/answer_entity_reward": 0.977182537317276,
"rewards/answer_wer_reward": 0.8409056067466736,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9825051128864288,
"step": 136
},
{
"completion_length": 156.28125,
"epoch": 0.4384,
"grad_norm": 2.307365655899048,
"kl": 0.0631103515625,
"learning_rate": 8.299999999999999e-07,
"loss": 0.0006,
"reward": 3.803721785545349,
"reward_std": 0.1857592761516571,
"rewards/answer_entity_reward": 0.9582379460334778,
"rewards/answer_wer_reward": 0.9269835352897644,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9185003936290741,
"step": 137
},
{
"completion_length": 230.3125,
"epoch": 0.4416,
"grad_norm": 1.0649584531784058,
"kl": 0.0577392578125,
"learning_rate": 8.287499999999999e-07,
"loss": 0.0006,
"reward": 3.8693535327911377,
"reward_std": 0.10830429336056113,
"rewards/answer_entity_reward": 0.9776785671710968,
"rewards/answer_wer_reward": 0.8930677771568298,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986072480678558,
"step": 138
},
{
"completion_length": 220.5,
"epoch": 0.4448,
"grad_norm": 3.627920150756836,
"kl": 0.0648193359375,
"learning_rate": 8.275e-07,
"loss": 0.0006,
"reward": 3.779549479484558,
"reward_std": 0.04976406879723072,
"rewards/answer_entity_reward": 0.9892225861549377,
"rewards/answer_wer_reward": 0.8991544246673584,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.891172468662262,
"step": 139
},
{
"completion_length": 214.375,
"epoch": 0.448,
"grad_norm": 1.0832712650299072,
"kl": 0.0511474609375,
"learning_rate": 8.2625e-07,
"loss": 0.0005,
"reward": 3.866790771484375,
"reward_std": 0.03637353144586086,
"rewards/answer_entity_reward": 0.9854166805744171,
"rewards/answer_wer_reward": 0.8826328217983246,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987412095069885,
"step": 140
},
{
"completion_length": 215.0,
"epoch": 0.4512,
"grad_norm": 4.865916728973389,
"kl": 0.080810546875,
"learning_rate": 8.249999999999999e-07,
"loss": 0.0008,
"reward": 3.782729744911194,
"reward_std": 0.05014876648783684,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9396264553070068,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8483479917049408,
"step": 141
},
{
"completion_length": 235.1875,
"epoch": 0.4544,
"grad_norm": 3.832350730895996,
"kl": 0.0489501953125,
"learning_rate": 8.2375e-07,
"loss": 0.0005,
"reward": 3.8454935550689697,
"reward_std": 0.02625620225444436,
"rewards/answer_entity_reward": 0.9856643378734589,
"rewards/answer_wer_reward": 0.9073578715324402,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9524714052677155,
"step": 142
},
{
"completion_length": 193.5625,
"epoch": 0.4576,
"grad_norm": 1.5562162399291992,
"kl": 0.0836181640625,
"learning_rate": 8.225e-07,
"loss": 0.0008,
"reward": 3.8764915466308594,
"reward_std": 0.02105938969179988,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9281685054302216,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.952489823102951,
"step": 143
},
{
"completion_length": 192.5,
"epoch": 0.4608,
"grad_norm": 4.00892448425293,
"kl": 0.065185546875,
"learning_rate": 8.2125e-07,
"loss": 0.0007,
"reward": 3.9131712913513184,
"reward_std": 0.025579220615327358,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9231057167053223,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9945298135280609,
"step": 144
},
{
"completion_length": 222.5625,
"epoch": 0.464,
"grad_norm": 6.250589370727539,
"kl": 0.0546875,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0005,
"reward": 3.8917945623397827,
"reward_std": 0.04113447107374668,
"rewards/answer_entity_reward": 0.9717775583267212,
"rewards/answer_wer_reward": 0.926241010427475,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9937759339809418,
"step": 145
},
{
"completion_length": 183.90625,
"epoch": 0.4672,
"grad_norm": 2.7752954959869385,
"kl": 0.0670166015625,
"learning_rate": 8.187499999999999e-07,
"loss": 0.0007,
"reward": 3.860864043235779,
"reward_std": 0.06173134222626686,
"rewards/answer_entity_reward": 0.9583333432674408,
"rewards/answer_wer_reward": 0.9120890200138092,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9904417395591736,
"step": 146
},
{
"completion_length": 220.78125,
"epoch": 0.4704,
"grad_norm": 3.0674679279327393,
"kl": 0.09912109375,
"learning_rate": 8.175e-07,
"loss": 0.001,
"reward": 3.84165620803833,
"reward_std": 0.03327286522835493,
"rewards/answer_entity_reward": 0.9452651739120483,
"rewards/answer_wer_reward": 0.8996314704418182,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996759682893753,
"step": 147
},
{
"completion_length": 162.8125,
"epoch": 0.4736,
"grad_norm": 4.559942245483398,
"kl": 0.116455078125,
"learning_rate": 8.1625e-07,
"loss": 0.0012,
"reward": 3.833083748817444,
"reward_std": 0.06737112812697887,
"rewards/answer_entity_reward": 0.9923878014087677,
"rewards/answer_wer_reward": 0.902847170829773,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9378487467765808,
"step": 148
},
{
"completion_length": 221.53125,
"epoch": 0.4768,
"grad_norm": 1.3157752752304077,
"kl": 0.052978515625,
"learning_rate": 8.149999999999999e-07,
"loss": 0.0005,
"reward": 3.8545873165130615,
"reward_std": 0.019355260767042637,
"rewards/answer_entity_reward": 0.9938696324825287,
"rewards/answer_wer_reward": 0.8627510368824005,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979668259620667,
"step": 149
},
{
"completion_length": 233.1875,
"epoch": 0.48,
"grad_norm": 4.352514743804932,
"kl": 0.053955078125,
"learning_rate": 8.137499999999999e-07,
"loss": 0.0005,
"reward": 3.8025535345077515,
"reward_std": 0.0806161779910326,
"rewards/answer_entity_reward": 0.9906516969203949,
"rewards/answer_wer_reward": 0.8636212348937988,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9482806921005249,
"step": 150
},
{
"completion_length": 210.03125,
"epoch": 0.4832,
"grad_norm": 1.3691778182983398,
"kl": 0.05615234375,
"learning_rate": 8.125e-07,
"loss": 0.0006,
"reward": 3.860105037689209,
"reward_std": 0.034908443689346313,
"rewards/answer_entity_reward": 0.9873737394809723,
"rewards/answer_wer_reward": 0.9285348653793335,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9441964328289032,
"step": 151
},
{
"completion_length": 244.625,
"epoch": 0.4864,
"grad_norm": 1.9329304695129395,
"kl": 0.058837890625,
"learning_rate": 8.1125e-07,
"loss": 0.0006,
"reward": 3.849783182144165,
"reward_std": 0.029241922311484814,
"rewards/answer_entity_reward": 0.9856617450714111,
"rewards/answer_wer_reward": 0.8688266575336456,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9952948093414307,
"step": 152
},
{
"completion_length": 174.375,
"epoch": 0.4896,
"grad_norm": 5.655167579650879,
"kl": 0.067138671875,
"learning_rate": 8.1e-07,
"loss": 0.0007,
"reward": 3.85835599899292,
"reward_std": 0.1141166789457202,
"rewards/answer_entity_reward": 0.9663461446762085,
"rewards/answer_wer_reward": 0.9284006357192993,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9636091589927673,
"step": 153
},
{
"completion_length": 185.875,
"epoch": 0.4928,
"grad_norm": 4.543191432952881,
"kl": 0.084716796875,
"learning_rate": 8.087499999999999e-07,
"loss": 0.0008,
"reward": 3.851526975631714,
"reward_std": 0.0990656241774559,
"rewards/answer_entity_reward": 0.9646950364112854,
"rewards/answer_wer_reward": 0.9213105142116547,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.965521514415741,
"step": 154
},
{
"completion_length": 209.0625,
"epoch": 0.496,
"grad_norm": 2.554072380065918,
"kl": 0.0572509765625,
"learning_rate": 8.075e-07,
"loss": 0.0006,
"reward": 3.790269613265991,
"reward_std": 0.048579949885606766,
"rewards/answer_entity_reward": 0.9870130121707916,
"rewards/answer_wer_reward": 0.8052773177623749,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979791641235352,
"step": 155
},
{
"completion_length": 211.96875,
"epoch": 0.4992,
"grad_norm": 2.762598991394043,
"kl": 0.0498046875,
"learning_rate": 8.0625e-07,
"loss": 0.0005,
"reward": 3.9201120138168335,
"reward_std": 0.014579844661056995,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9228614568710327,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9972506165504456,
"step": 156
},
{
"completion_length": 206.59375,
"epoch": 0.5024,
"grad_norm": 1.9372365474700928,
"kl": 0.0621337890625,
"learning_rate": 8.05e-07,
"loss": 0.0006,
"reward": 3.5673259496688843,
"reward_std": 0.028257974423468113,
"rewards/answer_entity_reward": 0.9902146458625793,
"rewards/answer_wer_reward": 0.758561909198761,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.818549245595932,
"step": 157
},
{
"completion_length": 213.15625,
"epoch": 0.5056,
"grad_norm": 2.594701051712036,
"kl": 0.08203125,
"learning_rate": 8.037499999999999e-07,
"loss": 0.0008,
"reward": 3.8647842407226562,
"reward_std": 0.029484061524271965,
"rewards/answer_entity_reward": 0.9847756624221802,
"rewards/answer_wer_reward": 0.8839923739433289,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9960162043571472,
"step": 158
},
{
"completion_length": 196.34375,
"epoch": 0.5088,
"grad_norm": 3.0164191722869873,
"kl": 0.0526123046875,
"learning_rate": 8.024999999999999e-07,
"loss": 0.0005,
"reward": 3.8759838342666626,
"reward_std": 0.04202751815319061,
"rewards/answer_entity_reward": 0.9789772629737854,
"rewards/answer_wer_reward": 0.9108568131923676,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9861496686935425,
"step": 159
},
{
"completion_length": 198.0,
"epoch": 0.512,
"grad_norm": 5.223659515380859,
"kl": 0.07177734375,
"learning_rate": 8.0125e-07,
"loss": 0.0007,
"reward": 3.8265939950942993,
"reward_std": 0.04291579592972994,
"rewards/answer_entity_reward": 0.9890734255313873,
"rewards/answer_wer_reward": 0.8892558217048645,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9482647478580475,
"step": 160
},
{
"completion_length": 184.0,
"epoch": 0.5152,
"grad_norm": 2.4279987812042236,
"kl": 0.0914306640625,
"learning_rate": 8e-07,
"loss": 0.0009,
"reward": 3.8738738298416138,
"reward_std": 0.049739884212613106,
"rewards/answer_entity_reward": 0.9671474397182465,
"rewards/answer_wer_reward": 0.9240702688694,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9826560020446777,
"step": 161
},
{
"completion_length": 250.96875,
"epoch": 0.5184,
"grad_norm": 1.4533754587173462,
"kl": 0.047607421875,
"learning_rate": 7.9875e-07,
"loss": 0.0005,
"reward": 3.9009437561035156,
"reward_std": 0.03131024446338415,
"rewards/answer_entity_reward": 0.9899475276470184,
"rewards/answer_wer_reward": 0.9109963178634644,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 162
},
{
"completion_length": 206.5,
"epoch": 0.5216,
"grad_norm": 10.05416202545166,
"kl": 0.1258544921875,
"learning_rate": 7.975e-07,
"loss": 0.0013,
"reward": 3.6952139139175415,
"reward_std": 0.08068067952990532,
"rewards/answer_entity_reward": 0.9906517267227173,
"rewards/answer_wer_reward": 0.9191368222236633,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.785425454378128,
"step": 163
},
{
"completion_length": 207.71875,
"epoch": 0.5248,
"grad_norm": 5.6498823165893555,
"kl": 0.0572509765625,
"learning_rate": 7.9625e-07,
"loss": 0.0006,
"reward": 3.862972855567932,
"reward_std": 0.05051150266081095,
"rewards/answer_entity_reward": 0.9871794879436493,
"rewards/answer_wer_reward": 0.8966725766658783,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9791209697723389,
"step": 164
},
{
"completion_length": 231.8125,
"epoch": 0.528,
"grad_norm": 2.2680246829986572,
"kl": 0.0731201171875,
"learning_rate": 7.95e-07,
"loss": 0.0007,
"reward": 3.845450758934021,
"reward_std": 0.04592973738908768,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.8566094040870667,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984567761421204,
"step": 165
},
{
"completion_length": 218.0,
"epoch": 0.5312,
"grad_norm": 1.194057583808899,
"kl": 0.046630859375,
"learning_rate": 7.937499999999999e-07,
"loss": 0.0005,
"reward": 3.900430679321289,
"reward_std": 0.01787347625941038,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.907353401184082,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9930772483348846,
"step": 166
},
{
"completion_length": 212.25,
"epoch": 0.5344,
"grad_norm": 1.999778389930725,
"kl": 0.07568359375,
"learning_rate": 7.924999999999999e-07,
"loss": 0.0008,
"reward": 3.885169267654419,
"reward_std": 0.01909848116338253,
"rewards/answer_entity_reward": 0.9869123697280884,
"rewards/answer_wer_reward": 0.8992542028427124,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990026652812958,
"step": 167
},
{
"completion_length": 222.65625,
"epoch": 0.5376,
"grad_norm": 1.8001956939697266,
"kl": 0.03814697265625,
"learning_rate": 7.912499999999999e-07,
"loss": 0.0004,
"reward": 3.8382192850112915,
"reward_std": 0.12780769122764468,
"rewards/answer_entity_reward": 0.9684826135635376,
"rewards/answer_wer_reward": 0.8702490329742432,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994876980781555,
"step": 168
},
{
"completion_length": 181.28125,
"epoch": 0.5408,
"grad_norm": 1.3718982934951782,
"kl": 0.072509765625,
"learning_rate": 7.9e-07,
"loss": 0.0007,
"reward": 3.743025064468384,
"reward_std": 0.02209018263965845,
"rewards/answer_entity_reward": 0.9875437021255493,
"rewards/answer_wer_reward": 0.8102038502693176,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9452773928642273,
"step": 169
},
{
"completion_length": 231.3125,
"epoch": 0.544,
"grad_norm": 3.8252196311950684,
"kl": 0.087890625,
"learning_rate": 7.8875e-07,
"loss": 0.0009,
"reward": 3.855069398880005,
"reward_std": 0.12723926454782486,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8895151615142822,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9655542373657227,
"step": 170
},
{
"completion_length": 246.6875,
"epoch": 0.5472,
"grad_norm": 1.4238818883895874,
"kl": 0.089599609375,
"learning_rate": 7.875e-07,
"loss": 0.0009,
"reward": 3.8392333984375,
"reward_std": 0.055684901773929596,
"rewards/answer_entity_reward": 0.9753443002700806,
"rewards/answer_wer_reward": 0.866324782371521,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975642561912537,
"step": 171
},
{
"completion_length": 239.09375,
"epoch": 0.5504,
"grad_norm": 2.5418362617492676,
"kl": 0.07421875,
"learning_rate": 7.8625e-07,
"loss": 0.0007,
"reward": 3.7556768655776978,
"reward_std": 0.026184914633631706,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.8859277367591858,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8721528947353363,
"step": 172
},
{
"completion_length": 197.53125,
"epoch": 0.5536,
"grad_norm": 2.2901041507720947,
"kl": 0.0523681640625,
"learning_rate": 7.85e-07,
"loss": 0.0005,
"reward": 3.7119585275650024,
"reward_std": 0.14428242854773998,
"rewards/answer_entity_reward": 0.8789682686328888,
"rewards/answer_wer_reward": 0.8524789810180664,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9805113673210144,
"step": 173
},
{
"completion_length": 271.65625,
"epoch": 0.5568,
"grad_norm": 1.5335708856582642,
"kl": 0.048095703125,
"learning_rate": 7.837499999999999e-07,
"loss": 0.0005,
"reward": 3.8789494037628174,
"reward_std": 0.03688232973217964,
"rewards/answer_entity_reward": 0.9724817276000977,
"rewards/answer_wer_reward": 0.9107584953308105,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9957091808319092,
"step": 174
},
{
"completion_length": 197.40625,
"epoch": 0.56,
"grad_norm": 2.6528756618499756,
"kl": 0.074462890625,
"learning_rate": 7.824999999999999e-07,
"loss": 0.0007,
"reward": 3.857820510864258,
"reward_std": 0.03826703131198883,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.8975639641284943,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9665697515010834,
"step": 175
},
{
"completion_length": 200.15625,
"epoch": 0.5632,
"grad_norm": 5.963916301727295,
"kl": 0.054443359375,
"learning_rate": 7.812499999999999e-07,
"loss": 0.0005,
"reward": 3.864750027656555,
"reward_std": 0.028456556610763073,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9234411716461182,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9447809159755707,
"step": 176
},
{
"completion_length": 220.25,
"epoch": 0.5664,
"grad_norm": 1.086248517036438,
"kl": 0.07421875,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0007,
"reward": 3.85122811794281,
"reward_std": 0.02548269461840391,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.9126598238945007,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9444444477558136,
"step": 177
},
{
"completion_length": 235.6875,
"epoch": 0.5696,
"grad_norm": 3.8478362560272217,
"kl": 0.080810546875,
"learning_rate": 7.787500000000001e-07,
"loss": 0.0008,
"reward": 3.8555444478988647,
"reward_std": 0.03297184593975544,
"rewards/answer_entity_reward": 0.991346150636673,
"rewards/answer_wer_reward": 0.8777507543563843,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9864475131034851,
"step": 178
},
{
"completion_length": 210.6875,
"epoch": 0.5728,
"grad_norm": 2.200871706008911,
"kl": 0.096923828125,
"learning_rate": 7.775e-07,
"loss": 0.001,
"reward": 3.8970987796783447,
"reward_std": 0.029029657132923603,
"rewards/answer_entity_reward": 0.9676088094711304,
"rewards/answer_wer_reward": 0.9392231702804565,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9902668297290802,
"step": 179
},
{
"completion_length": 202.375,
"epoch": 0.576,
"grad_norm": 3.42965030670166,
"kl": 0.080078125,
"learning_rate": 7.7625e-07,
"loss": 0.0008,
"reward": 3.7469061613082886,
"reward_std": 0.08900729566812515,
"rewards/answer_entity_reward": 0.9832702279090881,
"rewards/answer_wer_reward": 0.8798384070396423,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8837975263595581,
"step": 180
},
{
"completion_length": 215.875,
"epoch": 0.5792,
"grad_norm": 2.5457639694213867,
"kl": 0.0595703125,
"learning_rate": 7.75e-07,
"loss": 0.0006,
"reward": 3.8780597448349,
"reward_std": 0.04192608781158924,
"rewards/answer_entity_reward": 0.9845328330993652,
"rewards/answer_wer_reward": 0.89576256275177,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977642893791199,
"step": 181
},
{
"completion_length": 203.875,
"epoch": 0.5824,
"grad_norm": 1.3624567985534668,
"kl": 0.07177734375,
"learning_rate": 7.7375e-07,
"loss": 0.0007,
"reward": 3.8805158138275146,
"reward_std": 0.016396815422922373,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9136685729026794,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9692510068416595,
"step": 182
},
{
"completion_length": 215.90625,
"epoch": 0.5856,
"grad_norm": 1.270873785018921,
"kl": 0.0543212890625,
"learning_rate": 7.724999999999999e-07,
"loss": 0.0005,
"reward": 3.8749226331710815,
"reward_std": 0.020629468373954296,
"rewards/answer_entity_reward": 0.985921710729599,
"rewards/answer_wer_reward": 0.8920559883117676,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969449043273926,
"step": 183
},
{
"completion_length": 230.34375,
"epoch": 0.5888,
"grad_norm": 5.295412063598633,
"kl": 0.0489501953125,
"learning_rate": 7.712499999999999e-07,
"loss": 0.0005,
"reward": 3.8914437294006348,
"reward_std": 0.053787765093147755,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9137877225875854,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9811282753944397,
"step": 184
},
{
"completion_length": 238.03125,
"epoch": 0.592,
"grad_norm": 3.6382017135620117,
"kl": 0.05126953125,
"learning_rate": 7.699999999999999e-07,
"loss": 0.0005,
"reward": 3.80574893951416,
"reward_std": 0.031003179028630257,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.8504349291324615,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9594806730747223,
"step": 185
},
{
"completion_length": 133.46875,
"epoch": 0.5952,
"grad_norm": 5.556273937225342,
"kl": 0.06884765625,
"learning_rate": 7.6875e-07,
"loss": 0.0007,
"reward": 3.875786066055298,
"reward_std": 0.014059089124202728,
"rewards/answer_entity_reward": 0.9772727489471436,
"rewards/answer_wer_reward": 0.9379938840866089,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9605194628238678,
"step": 186
},
{
"completion_length": 233.96875,
"epoch": 0.5984,
"grad_norm": 1.1566299200057983,
"kl": 0.0654296875,
"learning_rate": 7.675e-07,
"loss": 0.0007,
"reward": 3.8272093534469604,
"reward_std": 0.056231189519166946,
"rewards/answer_entity_reward": 0.9821289777755737,
"rewards/answer_wer_reward": 0.87700355052948,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9680766761302948,
"step": 187
},
{
"completion_length": 223.21875,
"epoch": 0.6016,
"grad_norm": 1.125300407409668,
"kl": 0.0433349609375,
"learning_rate": 7.6625e-07,
"loss": 0.0004,
"reward": 3.9091583490371704,
"reward_std": 0.019687645137310028,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.917988508939743,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9946419894695282,
"step": 188
},
{
"completion_length": 213.03125,
"epoch": 0.6048,
"grad_norm": 1.806405782699585,
"kl": 0.05859375,
"learning_rate": 7.65e-07,
"loss": 0.0006,
"reward": 3.9139894247055054,
"reward_std": 0.01741368416696787,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.916355162858963,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9976341724395752,
"step": 189
},
{
"completion_length": 246.6875,
"epoch": 0.608,
"grad_norm": 2.158470630645752,
"kl": 0.05224609375,
"learning_rate": 7.6375e-07,
"loss": 0.0005,
"reward": 3.9092923402786255,
"reward_std": 0.019907254725694656,
"rewards/answer_entity_reward": 0.9944444596767426,
"rewards/answer_wer_reward": 0.9189584851264954,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9958893954753876,
"step": 190
},
{
"completion_length": 197.71875,
"epoch": 0.6112,
"grad_norm": 0.8463873863220215,
"kl": 0.0526123046875,
"learning_rate": 7.624999999999999e-07,
"loss": 0.0005,
"reward": 3.7934869527816772,
"reward_std": 0.010684152133762836,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.8017330169677734,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989654421806335,
"step": 191
},
{
"completion_length": 253.03125,
"epoch": 0.6144,
"grad_norm": 0.95602947473526,
"kl": 0.0577392578125,
"learning_rate": 7.612499999999999e-07,
"loss": 0.0006,
"reward": 3.8714359998703003,
"reward_std": 0.03730391897261143,
"rewards/answer_entity_reward": 0.9679293036460876,
"rewards/answer_wer_reward": 0.9067506790161133,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9967561364173889,
"step": 192
},
{
"completion_length": 260.875,
"epoch": 0.6176,
"grad_norm": 1.752991795539856,
"kl": 0.1259765625,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0013,
"reward": 3.847132444381714,
"reward_std": 0.03724599629640579,
"rewards/answer_entity_reward": 0.9814560413360596,
"rewards/answer_wer_reward": 0.877534031867981,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9881424605846405,
"step": 193
},
{
"completion_length": 224.8125,
"epoch": 0.6208,
"grad_norm": 5.3836283683776855,
"kl": 0.0616455078125,
"learning_rate": 7.5875e-07,
"loss": 0.0006,
"reward": 3.838170886039734,
"reward_std": 0.043032409623265266,
"rewards/answer_entity_reward": 0.9778589308261871,
"rewards/answer_wer_reward": 0.8835411667823792,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9767708480358124,
"step": 194
},
{
"completion_length": 234.0,
"epoch": 0.624,
"grad_norm": 1.4531170129776,
"kl": 0.082763671875,
"learning_rate": 7.575e-07,
"loss": 0.0008,
"reward": 3.8195607662200928,
"reward_std": 0.06634793058037758,
"rewards/answer_entity_reward": 0.9759862422943115,
"rewards/answer_wer_reward": 0.8854676187038422,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9581069052219391,
"step": 195
},
{
"completion_length": 228.875,
"epoch": 0.6272,
"grad_norm": 1.215409278869629,
"kl": 0.0653076171875,
"learning_rate": 7.5625e-07,
"loss": 0.0006,
"reward": 3.869178295135498,
"reward_std": 0.018243765458464622,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9173910617828369,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9603100121021271,
"step": 196
},
{
"completion_length": 233.40625,
"epoch": 0.6304,
"grad_norm": 1.5224462747573853,
"kl": 0.0479736328125,
"learning_rate": 7.55e-07,
"loss": 0.0005,
"reward": 3.880965232849121,
"reward_std": 0.030376747716218233,
"rewards/answer_entity_reward": 0.9812500178813934,
"rewards/answer_wer_reward": 0.903846025466919,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9958691298961639,
"step": 197
},
{
"completion_length": 159.75,
"epoch": 0.6336,
"grad_norm": 2.0013957023620605,
"kl": 0.072021484375,
"learning_rate": 7.5375e-07,
"loss": 0.0007,
"reward": 3.8514485359191895,
"reward_std": 0.021021784283220768,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9317480027675629,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9282233119010925,
"step": 198
},
{
"completion_length": 200.125,
"epoch": 0.6368,
"grad_norm": 7.399294853210449,
"kl": 0.0662841796875,
"learning_rate": 7.524999999999999e-07,
"loss": 0.0007,
"reward": 3.9170095920562744,
"reward_std": 0.03030287381261587,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.955333948135376,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9616756439208984,
"step": 199
},
{
"completion_length": 228.75,
"epoch": 0.64,
"grad_norm": 1.6671867370605469,
"kl": 0.13623046875,
"learning_rate": 7.512499999999999e-07,
"loss": 0.0014,
"reward": 3.848036050796509,
"reward_std": 0.14389772480353713,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9240660667419434,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9665836989879608,
"step": 200
},
{
"completion_length": 209.90625,
"epoch": 0.6432,
"grad_norm": 1.2796622514724731,
"kl": 0.05029296875,
"learning_rate": 7.5e-07,
"loss": 0.0005,
"reward": 3.856316566467285,
"reward_std": 0.025415225885808468,
"rewards/answer_entity_reward": 0.9718458652496338,
"rewards/answer_wer_reward": 0.8857261538505554,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987446367740631,
"step": 201
},
{
"completion_length": 203.1875,
"epoch": 0.6464,
"grad_norm": 6.9469380378723145,
"kl": 0.05810546875,
"learning_rate": 7.4875e-07,
"loss": 0.0006,
"reward": 3.7580385208129883,
"reward_std": 0.0333370678126812,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.8357867002487183,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9279336631298065,
"step": 202
},
{
"completion_length": 211.5,
"epoch": 0.6496,
"grad_norm": 2.437093496322632,
"kl": 0.0400390625,
"learning_rate": 7.475e-07,
"loss": 0.0004,
"reward": 3.888434052467346,
"reward_std": 0.04942548694089055,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.901074230670929,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977766275405884,
"step": 203
},
{
"completion_length": 220.3125,
"epoch": 0.6528,
"grad_norm": 9.914649963378906,
"kl": 0.054443359375,
"learning_rate": 7.4625e-07,
"loss": 0.0005,
"reward": 3.9074004888534546,
"reward_std": 0.022341615986078978,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.924115002155304,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9861262738704681,
"step": 204
},
{
"completion_length": 190.28125,
"epoch": 0.656,
"grad_norm": 10.771315574645996,
"kl": 0.0731201171875,
"learning_rate": 7.45e-07,
"loss": 0.0007,
"reward": 3.8562848567962646,
"reward_std": 0.05522243678569794,
"rewards/answer_entity_reward": 0.9873949587345123,
"rewards/answer_wer_reward": 0.9283336997032166,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9405562579631805,
"step": 205
},
{
"completion_length": 254.375,
"epoch": 0.6592,
"grad_norm": 1.2101417779922485,
"kl": 0.054443359375,
"learning_rate": 7.4375e-07,
"loss": 0.0005,
"reward": 3.9058661460876465,
"reward_std": 0.015844878274947405,
"rewards/answer_entity_reward": 0.9788995683193207,
"rewards/answer_wer_reward": 0.9304846525192261,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9964818954467773,
"step": 206
},
{
"completion_length": 202.96875,
"epoch": 0.6624,
"grad_norm": 3.355869770050049,
"kl": 0.0572509765625,
"learning_rate": 7.425e-07,
"loss": 0.0006,
"reward": 3.8065719604492188,
"reward_std": 0.19051394425332546,
"rewards/answer_entity_reward": 0.9650735259056091,
"rewards/answer_wer_reward": 0.8801510035991669,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9925974309444427,
"step": 207
},
{
"completion_length": 226.1875,
"epoch": 0.6656,
"grad_norm": 1.7292360067367554,
"kl": 0.104248046875,
"learning_rate": 7.412499999999999e-07,
"loss": 0.001,
"reward": 3.8113776445388794,
"reward_std": 0.02462965715676546,
"rewards/answer_entity_reward": 0.9770916700363159,
"rewards/answer_wer_reward": 0.864607959985733,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9696778357028961,
"step": 208
},
{
"completion_length": 198.75,
"epoch": 0.6688,
"grad_norm": 4.215091705322266,
"kl": 0.06640625,
"learning_rate": 7.4e-07,
"loss": 0.0007,
"reward": 3.8144696950912476,
"reward_std": 0.025187399238348007,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9298737645149231,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8845959007740021,
"step": 209
},
{
"completion_length": 200.1875,
"epoch": 0.672,
"grad_norm": 1.537361979484558,
"kl": 0.049560546875,
"learning_rate": 7.3875e-07,
"loss": 0.0005,
"reward": 3.9332594871520996,
"reward_std": 0.011271146591752768,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.951434314250946,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9818252325057983,
"step": 210
},
{
"completion_length": 190.78125,
"epoch": 0.6752,
"grad_norm": 2.9701907634735107,
"kl": 0.0654296875,
"learning_rate": 7.375e-07,
"loss": 0.0007,
"reward": 3.8168801069259644,
"reward_std": 0.024646650068461895,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9553571939468384,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8700454831123352,
"step": 211
},
{
"completion_length": 157.59375,
"epoch": 0.6784,
"grad_norm": 3.1656010150909424,
"kl": 0.0611572265625,
"learning_rate": 7.362499999999999e-07,
"loss": 0.0006,
"reward": 3.8838521242141724,
"reward_std": 0.0407260712236166,
"rewards/answer_entity_reward": 0.9767543971538544,
"rewards/answer_wer_reward": 0.944227010011673,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9628707766532898,
"step": 212
},
{
"completion_length": 238.1875,
"epoch": 0.6816,
"grad_norm": 2.614816665649414,
"kl": 0.0947265625,
"learning_rate": 7.35e-07,
"loss": 0.0009,
"reward": 3.8542829751968384,
"reward_std": 0.03231436479836702,
"rewards/answer_entity_reward": 0.974577009677887,
"rewards/answer_wer_reward": 0.8831658661365509,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9965401291847229,
"step": 213
},
{
"completion_length": 255.0,
"epoch": 0.6848,
"grad_norm": 1.8072490692138672,
"kl": 0.048828125,
"learning_rate": 7.3375e-07,
"loss": 0.0005,
"reward": 3.9139556884765625,
"reward_std": 0.013969901017844677,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9155895113945007,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983660876750946,
"step": 214
},
{
"completion_length": 163.96875,
"epoch": 0.688,
"grad_norm": 3.6364543437957764,
"kl": 0.082763671875,
"learning_rate": 7.325e-07,
"loss": 0.0008,
"reward": 3.8950713872909546,
"reward_std": 0.030674483627080917,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9427915513515472,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.959224134683609,
"step": 215
},
{
"completion_length": 211.90625,
"epoch": 0.6912,
"grad_norm": 1.4036628007888794,
"kl": 0.0504150390625,
"learning_rate": 7.312499999999999e-07,
"loss": 0.0005,
"reward": 3.90190052986145,
"reward_std": 0.028614184819161892,
"rewards/answer_entity_reward": 0.9636363685131073,
"rewards/answer_wer_reward": 0.9445142149925232,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9937500059604645,
"step": 216
},
{
"completion_length": 226.875,
"epoch": 0.6944,
"grad_norm": 1.5664644241333008,
"kl": 0.051025390625,
"learning_rate": 7.3e-07,
"loss": 0.0005,
"reward": 3.9051342010498047,
"reward_std": 0.023595476523041725,
"rewards/answer_entity_reward": 0.994463324546814,
"rewards/answer_wer_reward": 0.9128024578094482,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978685975074768,
"step": 217
},
{
"completion_length": 211.96875,
"epoch": 0.6976,
"grad_norm": 3.6565327644348145,
"kl": 0.0567626953125,
"learning_rate": 7.2875e-07,
"loss": 0.0006,
"reward": 3.920815348625183,
"reward_std": 0.026728018186986446,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.9297977983951569,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973307251930237,
"step": 218
},
{
"completion_length": 226.90625,
"epoch": 0.7008,
"grad_norm": 5.147249221801758,
"kl": 0.142333984375,
"learning_rate": 7.275e-07,
"loss": 0.0014,
"reward": 3.887997627258301,
"reward_std": 0.017563311383128166,
"rewards/answer_entity_reward": 0.9923513829708099,
"rewards/answer_wer_reward": 0.8966234028339386,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999022901058197,
"step": 219
},
{
"completion_length": 196.75,
"epoch": 0.704,
"grad_norm": 4.334951400756836,
"kl": 0.07958984375,
"learning_rate": 7.262499999999999e-07,
"loss": 0.0008,
"reward": 3.919954776763916,
"reward_std": 0.020561310462653637,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.922648161649704,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997106492519379,
"step": 220
},
{
"completion_length": 208.15625,
"epoch": 0.7072,
"grad_norm": 4.896883964538574,
"kl": 0.072509765625,
"learning_rate": 7.249999999999999e-07,
"loss": 0.0007,
"reward": 3.8171916007995605,
"reward_std": 0.044522007927298546,
"rewards/answer_entity_reward": 0.9767857491970062,
"rewards/answer_wer_reward": 0.9031675159931183,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9372382760047913,
"step": 221
},
{
"completion_length": 197.71875,
"epoch": 0.7104,
"grad_norm": 1.9743766784667969,
"kl": 0.041259765625,
"learning_rate": 7.2375e-07,
"loss": 0.0004,
"reward": 3.9599783420562744,
"reward_std": 0.008235257118940353,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9602223634719849,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999755859375,
"step": 222
},
{
"completion_length": 181.84375,
"epoch": 0.7136,
"grad_norm": 6.57908296585083,
"kl": 0.07421875,
"learning_rate": 7.225e-07,
"loss": 0.0007,
"reward": 3.826643943786621,
"reward_std": 0.06298277154564857,
"rewards/answer_entity_reward": 0.9833333194255829,
"rewards/answer_wer_reward": 0.9450017511844635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8983088135719299,
"step": 223
},
{
"completion_length": 177.84375,
"epoch": 0.7168,
"grad_norm": 13.744032859802246,
"kl": 0.078369140625,
"learning_rate": 7.212499999999999e-07,
"loss": 0.0008,
"reward": 3.852834939956665,
"reward_std": 0.044052885845303535,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9374657571315765,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9177731275558472,
"step": 224
},
{
"completion_length": 249.3125,
"epoch": 0.72,
"grad_norm": 1.7395777702331543,
"kl": 0.05712890625,
"learning_rate": 7.2e-07,
"loss": 0.0006,
"reward": 3.8659743070602417,
"reward_std": 0.03202287387102842,
"rewards/answer_entity_reward": 0.9767628312110901,
"rewards/answer_wer_reward": 0.8964802920818329,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9927313327789307,
"step": 225
},
{
"completion_length": 246.75,
"epoch": 0.7232,
"grad_norm": 1.1522554159164429,
"kl": 0.05419921875,
"learning_rate": 7.1875e-07,
"loss": 0.0005,
"reward": 3.868378758430481,
"reward_std": 0.02125831786543131,
"rewards/answer_entity_reward": 0.9791666567325592,
"rewards/answer_wer_reward": 0.8927575647830963,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9964545369148254,
"step": 226
},
{
"completion_length": 213.6875,
"epoch": 0.7264,
"grad_norm": 1.6328908205032349,
"kl": 0.0452880859375,
"learning_rate": 7.175e-07,
"loss": 0.0004,
"reward": 3.9461253881454468,
"reward_std": 0.017373798182234168,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9516011476516724,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989885687828064,
"step": 227
},
{
"completion_length": 180.25,
"epoch": 0.7296,
"grad_norm": 1.6245373487472534,
"kl": 0.0810546875,
"learning_rate": 7.1625e-07,
"loss": 0.0008,
"reward": 3.92253839969635,
"reward_std": 0.009518959443084896,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9421058893203735,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9832733571529388,
"step": 228
},
{
"completion_length": 211.46875,
"epoch": 0.7328,
"grad_norm": 2.3507907390594482,
"kl": 0.080078125,
"learning_rate": 7.149999999999999e-07,
"loss": 0.0008,
"reward": 3.9085057973861694,
"reward_std": 0.011625304818153381,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.91986945271492,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 229
},
{
"completion_length": 189.78125,
"epoch": 0.736,
"grad_norm": 2.801975965499878,
"kl": 0.068603515625,
"learning_rate": 7.137499999999999e-07,
"loss": 0.0007,
"reward": 3.849338173866272,
"reward_std": 0.04476720932871103,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9499310851097107,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.90181103348732,
"step": 230
},
{
"completion_length": 232.46875,
"epoch": 0.7392,
"grad_norm": 18.121028900146484,
"kl": 0.065673828125,
"learning_rate": 7.125e-07,
"loss": 0.0007,
"reward": 3.8422099351882935,
"reward_std": 0.05234749615192413,
"rewards/answer_entity_reward": 0.9829545617103577,
"rewards/answer_wer_reward": 0.8842452466487885,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9750101864337921,
"step": 231
},
{
"completion_length": 230.90625,
"epoch": 0.7424,
"grad_norm": 1.374346375465393,
"kl": 0.0440673828125,
"learning_rate": 7.1125e-07,
"loss": 0.0004,
"reward": 3.9123170375823975,
"reward_std": 0.025476250797510147,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9220384955406189,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9972230195999146,
"step": 232
},
{
"completion_length": 197.5625,
"epoch": 0.7456,
"grad_norm": 3.1081960201263428,
"kl": 0.067138671875,
"learning_rate": 7.1e-07,
"loss": 0.0007,
"reward": 3.921274781227112,
"reward_std": 0.04291347204707563,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9490483999252319,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9722263813018799,
"step": 233
},
{
"completion_length": 198.90625,
"epoch": 0.7488,
"grad_norm": 2.3603627681732178,
"kl": 0.0550537109375,
"learning_rate": 7.0875e-07,
"loss": 0.0005,
"reward": 3.9125137329101562,
"reward_std": 0.03855661302804947,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9429784715175629,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9747799634933472,
"step": 234
},
{
"completion_length": 220.71875,
"epoch": 0.752,
"grad_norm": 3.3247504234313965,
"kl": 0.070068359375,
"learning_rate": 7.075e-07,
"loss": 0.0007,
"reward": 3.877889394760132,
"reward_std": 0.03429079055786133,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9119226932525635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9659668207168579,
"step": 235
},
{
"completion_length": 194.21875,
"epoch": 0.7552,
"grad_norm": 5.20084810256958,
"kl": 0.067626953125,
"learning_rate": 7.0625e-07,
"loss": 0.0007,
"reward": 3.918747305870056,
"reward_std": 0.03475894033908844,
"rewards/answer_entity_reward": 0.9929924309253693,
"rewards/answer_wer_reward": 0.9448626041412354,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.980892151594162,
"step": 236
},
{
"completion_length": 222.53125,
"epoch": 0.7584,
"grad_norm": 3.0105435848236084,
"kl": 0.07421875,
"learning_rate": 7.049999999999999e-07,
"loss": 0.0007,
"reward": 3.9244236946105957,
"reward_std": 0.010058181826025248,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9545913934707642,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9783551096916199,
"step": 237
},
{
"completion_length": 222.5625,
"epoch": 0.7616,
"grad_norm": 4.065408229827881,
"kl": 0.1181640625,
"learning_rate": 7.037499999999999e-07,
"loss": 0.0012,
"reward": 3.873254418373108,
"reward_std": 0.0757724829018116,
"rewards/answer_entity_reward": 0.9845328330993652,
"rewards/answer_wer_reward": 0.936627209186554,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9520943462848663,
"step": 238
},
{
"completion_length": 184.21875,
"epoch": 0.7648,
"grad_norm": 1.1628284454345703,
"kl": 0.0579833984375,
"learning_rate": 7.024999999999999e-07,
"loss": 0.0006,
"reward": 3.9432320594787598,
"reward_std": 0.010221295058727264,
"rewards/answer_entity_reward": 0.9905790388584137,
"rewards/answer_wer_reward": 0.953954666852951,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998698353767395,
"step": 239
},
{
"completion_length": 233.90625,
"epoch": 0.768,
"grad_norm": 1.4767858982086182,
"kl": 0.079345703125,
"learning_rate": 7.0125e-07,
"loss": 0.0008,
"reward": 3.8955001831054688,
"reward_std": 0.03214742988348007,
"rewards/answer_entity_reward": 0.9854603707790375,
"rewards/answer_wer_reward": 0.9112924933433533,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987473487854004,
"step": 240
},
{
"completion_length": 176.53125,
"epoch": 0.7712,
"grad_norm": 5.655521869659424,
"kl": 0.0872802734375,
"learning_rate": 7e-07,
"loss": 0.0009,
"reward": 3.8957866430282593,
"reward_std": 0.02847579075023532,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9654708206653595,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9399311542510986,
"step": 241
},
{
"completion_length": 252.71875,
"epoch": 0.7744,
"grad_norm": 3.268174886703491,
"kl": 0.073486328125,
"learning_rate": 6.9875e-07,
"loss": 0.0007,
"reward": 3.8414435386657715,
"reward_std": 0.08019998762756586,
"rewards/answer_entity_reward": 0.9822989404201508,
"rewards/answer_wer_reward": 0.8909429609775543,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.968201756477356,
"step": 242
},
{
"completion_length": 234.875,
"epoch": 0.7776,
"grad_norm": 3.445681571960449,
"kl": 0.15869140625,
"learning_rate": 6.975e-07,
"loss": 0.0016,
"reward": 3.856196165084839,
"reward_std": 0.0546736940741539,
"rewards/answer_entity_reward": 0.9822468161582947,
"rewards/answer_wer_reward": 0.9020899534225464,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9718593955039978,
"step": 243
},
{
"completion_length": 186.375,
"epoch": 0.7808,
"grad_norm": 3.4756290912628174,
"kl": 0.109130859375,
"learning_rate": 6.9625e-07,
"loss": 0.0011,
"reward": 3.878678798675537,
"reward_std": 0.014406855218112469,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9121991693973541,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9693206548690796,
"step": 244
},
{
"completion_length": 224.46875,
"epoch": 0.784,
"grad_norm": 2.4778082370758057,
"kl": 0.0618896484375,
"learning_rate": 6.949999999999999e-07,
"loss": 0.0006,
"reward": 3.890427350997925,
"reward_std": 0.013088527135550976,
"rewards/answer_entity_reward": 0.9849699139595032,
"rewards/answer_wer_reward": 0.9565823972225189,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9488749206066132,
"step": 245
},
{
"completion_length": 220.0625,
"epoch": 0.7872,
"grad_norm": 1.7784525156021118,
"kl": 0.0592041015625,
"learning_rate": 6.937499999999999e-07,
"loss": 0.0006,
"reward": 3.9208799600601196,
"reward_std": 0.013537504710257053,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9380317628383636,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9924636483192444,
"step": 246
},
{
"completion_length": 215.03125,
"epoch": 0.7904,
"grad_norm": 1.7845004796981812,
"kl": 0.087158203125,
"learning_rate": 6.924999999999999e-07,
"loss": 0.0009,
"reward": 3.874635100364685,
"reward_std": 0.047601671889424324,
"rewards/answer_entity_reward": 0.9777146875858307,
"rewards/answer_wer_reward": 0.9114454984664917,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9854750335216522,
"step": 247
},
{
"completion_length": 237.0,
"epoch": 0.7936,
"grad_norm": 1.9031370878219604,
"kl": 0.0665283203125,
"learning_rate": 6.9125e-07,
"loss": 0.0007,
"reward": 3.8799991607666016,
"reward_std": 0.040791427716612816,
"rewards/answer_entity_reward": 0.9725233018398285,
"rewards/answer_wer_reward": 0.9113976061344147,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9960784018039703,
"step": 248
},
{
"completion_length": 247.625,
"epoch": 0.7968,
"grad_norm": 6.799812316894531,
"kl": 0.5244140625,
"learning_rate": 6.9e-07,
"loss": 0.0052,
"reward": 3.9148751497268677,
"reward_std": 0.012524784076958895,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9155747294425964,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993003606796265,
"step": 249
},
{
"completion_length": 202.78125,
"epoch": 0.8,
"grad_norm": 2.9497642517089844,
"kl": 0.108642578125,
"learning_rate": 6.8875e-07,
"loss": 0.0011,
"reward": 3.88541841506958,
"reward_std": 0.05846460163593292,
"rewards/answer_entity_reward": 0.9898538887500763,
"rewards/answer_wer_reward": 0.9265855848789215,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.968979001045227,
"step": 250
},
{
"completion_length": 230.03125,
"epoch": 0.8032,
"grad_norm": 3.021209478378296,
"kl": 0.064453125,
"learning_rate": 6.875e-07,
"loss": 0.0006,
"reward": 3.9006909132003784,
"reward_std": 0.02151984628289938,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9085462689399719,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9956169128417969,
"step": 251
},
{
"completion_length": 202.0625,
"epoch": 0.8064,
"grad_norm": 3.288858413696289,
"kl": 0.0810546875,
"learning_rate": 6.8625e-07,
"loss": 0.0008,
"reward": 3.9228957891464233,
"reward_std": 0.012390648480504751,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9330424964427948,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9898532032966614,
"step": 252
},
{
"completion_length": 202.84375,
"epoch": 0.8096,
"grad_norm": 2.384650468826294,
"kl": 0.084228515625,
"learning_rate": 6.85e-07,
"loss": 0.0009,
"reward": 3.8598722219467163,
"reward_std": 0.03435686323791742,
"rewards/answer_entity_reward": 0.9775519669055939,
"rewards/answer_wer_reward": 0.9159774780273438,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9663428068161011,
"step": 253
},
{
"completion_length": 235.15625,
"epoch": 0.8128,
"grad_norm": 3.9519598484039307,
"kl": 0.061767578125,
"learning_rate": 6.837499999999999e-07,
"loss": 0.0006,
"reward": 3.8161985874176025,
"reward_std": 0.06573762744665146,
"rewards/answer_entity_reward": 0.9905131459236145,
"rewards/answer_wer_reward": 0.8475149571895599,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9781705141067505,
"step": 254
},
{
"completion_length": 241.125,
"epoch": 0.816,
"grad_norm": 3.464174509048462,
"kl": 0.077392578125,
"learning_rate": 6.824999999999999e-07,
"loss": 0.0008,
"reward": 3.894362449645996,
"reward_std": 0.025215300731360912,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.9064654111862183,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998313695192337,
"step": 255
},
{
"completion_length": 177.59375,
"epoch": 0.8192,
"grad_norm": 1.5625709295272827,
"kl": 0.0986328125,
"learning_rate": 6.8125e-07,
"loss": 0.001,
"reward": 3.9517083168029785,
"reward_std": 0.01383261731825769,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9637933671474457,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9927226006984711,
"step": 256
},
{
"completion_length": 191.625,
"epoch": 0.8224,
"grad_norm": 1.4757704734802246,
"kl": 0.0791015625,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0008,
"reward": 3.8987783193588257,
"reward_std": 0.016407988965511322,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9312387406826019,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9699434041976929,
"step": 257
},
{
"completion_length": 191.65625,
"epoch": 0.8256,
"grad_norm": 3.355372428894043,
"kl": 0.09033203125,
"learning_rate": 6.7875e-07,
"loss": 0.0009,
"reward": 3.9129350185394287,
"reward_std": 0.015536424703896046,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9205312728881836,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979183673858643,
"step": 258
},
{
"completion_length": 188.65625,
"epoch": 0.8288,
"grad_norm": 1.917312741279602,
"kl": 0.086669921875,
"learning_rate": 6.775e-07,
"loss": 0.0009,
"reward": 3.918121814727783,
"reward_std": 0.0268348827958107,
"rewards/answer_entity_reward": 0.9890183508396149,
"rewards/answer_wer_reward": 0.9294547438621521,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996488690376282,
"step": 259
},
{
"completion_length": 234.0,
"epoch": 0.832,
"grad_norm": 1.334208369255066,
"kl": 0.0635986328125,
"learning_rate": 6.7625e-07,
"loss": 0.0006,
"reward": 3.924370527267456,
"reward_std": 0.02556901052594185,
"rewards/answer_entity_reward": 0.980710506439209,
"rewards/answer_wer_reward": 0.9436598718166351,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 260
},
{
"completion_length": 157.09375,
"epoch": 0.8352,
"grad_norm": 3.0484063625335693,
"kl": 0.093017578125,
"learning_rate": 6.75e-07,
"loss": 0.0009,
"reward": 3.928007483482361,
"reward_std": 0.01636551646515727,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9610774517059326,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.969333827495575,
"step": 261
},
{
"completion_length": 222.6875,
"epoch": 0.8384,
"grad_norm": 1.5266326665878296,
"kl": 0.110595703125,
"learning_rate": 6.737499999999999e-07,
"loss": 0.0011,
"reward": 3.826764225959778,
"reward_std": 0.014424358261749148,
"rewards/answer_entity_reward": 0.875,
"rewards/answer_wer_reward": 0.9528080821037292,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989560842514038,
"step": 262
},
{
"completion_length": 245.96875,
"epoch": 0.8416,
"grad_norm": 2.332728624343872,
"kl": 0.0777587890625,
"learning_rate": 6.724999999999999e-07,
"loss": 0.0008,
"reward": 3.84222412109375,
"reward_std": 0.018232629168778658,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.8854961693286896,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9591319561004639,
"step": 263
},
{
"completion_length": 155.6875,
"epoch": 0.8448,
"grad_norm": 7.505854606628418,
"kl": 0.101806640625,
"learning_rate": 6.7125e-07,
"loss": 0.001,
"reward": 3.875036120414734,
"reward_std": 0.07785245403647423,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9335145354270935,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9436048865318298,
"step": 264
},
{
"completion_length": 249.75,
"epoch": 0.848,
"grad_norm": 2.8738133907318115,
"kl": 0.0516357421875,
"learning_rate": 6.7e-07,
"loss": 0.0005,
"reward": 3.903374195098877,
"reward_std": 0.014860059600323439,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9094418883323669,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987399280071259,
"step": 265
},
{
"completion_length": 199.34375,
"epoch": 0.8512,
"grad_norm": 8.186075210571289,
"kl": 0.074462890625,
"learning_rate": 6.6875e-07,
"loss": 0.0007,
"reward": 3.8564417362213135,
"reward_std": 0.06331180594861507,
"rewards/answer_entity_reward": 0.9917200803756714,
"rewards/answer_wer_reward": 0.9368169605731964,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9279046654701233,
"step": 266
},
{
"completion_length": 207.75,
"epoch": 0.8544,
"grad_norm": 1.7668160200119019,
"kl": 0.191650390625,
"learning_rate": 6.675e-07,
"loss": 0.0019,
"reward": 3.791893243789673,
"reward_std": 0.21384014189243317,
"rewards/answer_entity_reward": 0.9642857313156128,
"rewards/answer_wer_reward": 0.8976732790470123,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9611842036247253,
"step": 267
},
{
"completion_length": 232.78125,
"epoch": 0.8576,
"grad_norm": 3.357858180999756,
"kl": 0.0655517578125,
"learning_rate": 6.6625e-07,
"loss": 0.0006,
"reward": 3.849023461341858,
"reward_std": 0.07564813643693924,
"rewards/answer_entity_reward": 0.981249988079071,
"rewards/answer_wer_reward": 0.9172319173812866,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9505417346954346,
"step": 268
},
{
"completion_length": 159.90625,
"epoch": 0.8608,
"grad_norm": 8.665388107299805,
"kl": 0.083740234375,
"learning_rate": 6.65e-07,
"loss": 0.0008,
"reward": 3.8619388341903687,
"reward_std": 0.03842100687325001,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9294092357158661,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9369938969612122,
"step": 269
},
{
"completion_length": 199.09375,
"epoch": 0.864,
"grad_norm": 2.6412887573242188,
"kl": 0.24951171875,
"learning_rate": 6.637499999999999e-07,
"loss": 0.0025,
"reward": 3.92287015914917,
"reward_std": 0.04514491464942694,
"rewards/answer_entity_reward": 0.9867424070835114,
"rewards/answer_wer_reward": 0.948787659406662,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9873401820659637,
"step": 270
},
{
"completion_length": 138.625,
"epoch": 0.8672,
"grad_norm": 5.494461536407471,
"kl": 0.1064453125,
"learning_rate": 6.624999999999999e-07,
"loss": 0.0011,
"reward": 3.80997896194458,
"reward_std": 0.10453111864626408,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9460954964160919,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8638834655284882,
"step": 271
},
{
"completion_length": 207.0625,
"epoch": 0.8704,
"grad_norm": 6.705058574676514,
"kl": 0.0904541015625,
"learning_rate": 6.6125e-07,
"loss": 0.0009,
"reward": 3.918370246887207,
"reward_std": 0.016086122021079063,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.94427290558815,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9765011072158813,
"step": 272
},
{
"completion_length": 193.46875,
"epoch": 0.8736,
"grad_norm": 3.6274845600128174,
"kl": 0.16259765625,
"learning_rate": 6.6e-07,
"loss": 0.0016,
"reward": 3.8420186042785645,
"reward_std": 0.042743777856230736,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.88405841588974,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9579601883888245,
"step": 273
},
{
"completion_length": 238.03125,
"epoch": 0.8768,
"grad_norm": 39.40747833251953,
"kl": 0.064453125,
"learning_rate": 6.587499999999999e-07,
"loss": 0.0006,
"reward": 3.8922038078308105,
"reward_std": 0.08438011445105076,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8997087776660919,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9924951493740082,
"step": 274
},
{
"completion_length": 215.03125,
"epoch": 0.88,
"grad_norm": 3.786466360092163,
"kl": 0.073974609375,
"learning_rate": 6.575e-07,
"loss": 0.0007,
"reward": 3.936691641807556,
"reward_std": 0.013240452855825424,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9393938779830933,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993811845779419,
"step": 275
},
{
"completion_length": 171.84375,
"epoch": 0.8832,
"grad_norm": 6.402861595153809,
"kl": 0.09619140625,
"learning_rate": 6.5625e-07,
"loss": 0.001,
"reward": 3.8171043395996094,
"reward_std": 0.07490862905979156,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9142147600650787,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.902889609336853,
"step": 276
},
{
"completion_length": 202.90625,
"epoch": 0.8864,
"grad_norm": 1.9027079343795776,
"kl": 0.07958984375,
"learning_rate": 6.55e-07,
"loss": 0.0008,
"reward": 3.910063624382019,
"reward_std": 0.014503994956612587,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9100635945796967,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 277
},
{
"completion_length": 194.90625,
"epoch": 0.8896,
"grad_norm": 3.430772304534912,
"kl": 0.10107421875,
"learning_rate": 6.5375e-07,
"loss": 0.001,
"reward": 3.9086241722106934,
"reward_std": 0.011167994700372219,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9395906329154968,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9690335094928741,
"step": 278
},
{
"completion_length": 214.21875,
"epoch": 0.8928,
"grad_norm": 1.209375262260437,
"kl": 0.07763671875,
"learning_rate": 6.524999999999999e-07,
"loss": 0.0008,
"reward": 3.934818387031555,
"reward_std": 0.013630851171910763,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9348185062408447,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 279
},
{
"completion_length": 210.375,
"epoch": 0.896,
"grad_norm": 3.4542951583862305,
"kl": 0.09619140625,
"learning_rate": 6.5125e-07,
"loss": 0.001,
"reward": 3.8483023643493652,
"reward_std": 0.022013184614479542,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9127626419067383,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.935539722442627,
"step": 280
},
{
"completion_length": 171.96875,
"epoch": 0.8992,
"grad_norm": 5.6723761558532715,
"kl": 0.138671875,
"learning_rate": 6.5e-07,
"loss": 0.0014,
"reward": 3.894706964492798,
"reward_std": 0.01279338588938117,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9379555583000183,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9567514657974243,
"step": 281
},
{
"completion_length": 111.78125,
"epoch": 0.9024,
"grad_norm": 4.6447954177856445,
"kl": 0.1376953125,
"learning_rate": 6.4875e-07,
"loss": 0.0014,
"reward": 3.901338577270508,
"reward_std": 0.019952512811869383,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.978780597448349,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9253990054130554,
"step": 282
},
{
"completion_length": 244.96875,
"epoch": 0.9056,
"grad_norm": 2.825244665145874,
"kl": 0.0611572265625,
"learning_rate": 6.474999999999999e-07,
"loss": 0.0006,
"reward": 3.9182543754577637,
"reward_std": 0.02383749559521675,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9259287714958191,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995370507240295,
"step": 283
},
{
"completion_length": 218.09375,
"epoch": 0.9088,
"grad_norm": 2.9246108531951904,
"kl": 0.0736083984375,
"learning_rate": 6.4625e-07,
"loss": 0.0007,
"reward": 3.9247629642486572,
"reward_std": 0.019582282286137342,
"rewards/answer_entity_reward": 0.9866071343421936,
"rewards/answer_wer_reward": 0.9388971030712128,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992588460445404,
"step": 284
},
{
"completion_length": 174.8125,
"epoch": 0.912,
"grad_norm": 1.4176238775253296,
"kl": 0.115478515625,
"learning_rate": 6.45e-07,
"loss": 0.0012,
"reward": 3.9359350204467773,
"reward_std": 0.01886278996244073,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9649160206317902,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9731022417545319,
"step": 285
},
{
"completion_length": 152.40625,
"epoch": 0.9152,
"grad_norm": 4.273341178894043,
"kl": 0.176025390625,
"learning_rate": 6.4375e-07,
"loss": 0.0018,
"reward": 3.850113034248352,
"reward_std": 0.07313014380633831,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9544805884361267,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8991046845912933,
"step": 286
},
{
"completion_length": 221.90625,
"epoch": 0.9184,
"grad_norm": 3.1975696086883545,
"kl": 0.083984375,
"learning_rate": 6.424999999999999e-07,
"loss": 0.0008,
"reward": 3.8276385068893433,
"reward_std": 0.019742398988455534,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.8953758776187897,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9379445612430573,
"step": 287
},
{
"completion_length": 203.1875,
"epoch": 0.9216,
"grad_norm": 4.396200180053711,
"kl": 0.1318359375,
"learning_rate": 6.4125e-07,
"loss": 0.0013,
"reward": 3.9295929670333862,
"reward_std": 0.022352089174091816,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9394927024841309,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973118305206299,
"step": 288
},
{
"completion_length": 188.34375,
"epoch": 0.9248,
"grad_norm": 23.72756004333496,
"kl": 0.098388671875,
"learning_rate": 6.4e-07,
"loss": 0.001,
"reward": 3.7452211380004883,
"reward_std": 0.12425664439797401,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8480645418167114,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8971565663814545,
"step": 289
},
{
"completion_length": 233.5625,
"epoch": 0.928,
"grad_norm": 1.2391304969787598,
"kl": 0.068603515625,
"learning_rate": 6.3875e-07,
"loss": 0.0007,
"reward": 3.8707345724105835,
"reward_std": 0.03127638017758727,
"rewards/answer_entity_reward": 0.989980161190033,
"rewards/answer_wer_reward": 0.8822586238384247,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984957575798035,
"step": 290
},
{
"completion_length": 176.8125,
"epoch": 0.9312,
"grad_norm": 3.8803555965423584,
"kl": 0.14697265625,
"learning_rate": 6.374999999999999e-07,
"loss": 0.0015,
"reward": 3.890373468399048,
"reward_std": 0.01580220554023981,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9229053854942322,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9674679934978485,
"step": 291
},
{
"completion_length": 249.9375,
"epoch": 0.9344,
"grad_norm": 1.001364827156067,
"kl": 0.08447265625,
"learning_rate": 6.362499999999999e-07,
"loss": 0.0008,
"reward": 3.8967798948287964,
"reward_std": 0.015075822360813618,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.8994384407997131,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994248449802399,
"step": 292
},
{
"completion_length": 192.125,
"epoch": 0.9376,
"grad_norm": 7.706722736358643,
"kl": 0.12255859375,
"learning_rate": 6.35e-07,
"loss": 0.0012,
"reward": 3.92827308177948,
"reward_std": 0.02050976036116481,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9391875863075256,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9890855252742767,
"step": 293
},
{
"completion_length": 235.125,
"epoch": 0.9408,
"grad_norm": 1.723900556564331,
"kl": 0.0587158203125,
"learning_rate": 6.3375e-07,
"loss": 0.0006,
"reward": 3.9498140811920166,
"reward_std": 0.012220169650390744,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9532225430011749,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99842968583107,
"step": 294
},
{
"completion_length": 205.65625,
"epoch": 0.944,
"grad_norm": 5.019091606140137,
"kl": 0.092041015625,
"learning_rate": 6.324999999999999e-07,
"loss": 0.0009,
"reward": 3.72371768951416,
"reward_std": 0.03362658293917775,
"rewards/answer_entity_reward": 0.988194465637207,
"rewards/answer_wer_reward": 0.8201212882995605,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9154019355773926,
"step": 295
},
{
"completion_length": 229.78125,
"epoch": 0.9472,
"grad_norm": 2.4262614250183105,
"kl": 0.07763671875,
"learning_rate": 6.3125e-07,
"loss": 0.0008,
"reward": 3.9112552404403687,
"reward_std": 0.02215595170855522,
"rewards/answer_entity_reward": 0.9932383000850677,
"rewards/answer_wer_reward": 0.9202675223350525,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977494478225708,
"step": 296
},
{
"completion_length": 201.4375,
"epoch": 0.9504,
"grad_norm": 15.131966590881348,
"kl": 1.363037109375,
"learning_rate": 6.3e-07,
"loss": 0.0136,
"reward": 3.8845863342285156,
"reward_std": 0.025053692050278187,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9146546125411987,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9747393429279327,
"step": 297
},
{
"completion_length": 172.9375,
"epoch": 0.9536,
"grad_norm": 0.7034117579460144,
"kl": 0.114501953125,
"learning_rate": 6.2875e-07,
"loss": 0.0011,
"reward": 3.9505850076675415,
"reward_std": 0.004406077787280083,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9516552090644836,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989297986030579,
"step": 298
},
{
"completion_length": 226.03125,
"epoch": 0.9568,
"grad_norm": 10.005863189697266,
"kl": 0.099853515625,
"learning_rate": 6.274999999999999e-07,
"loss": 0.001,
"reward": 3.78713595867157,
"reward_std": 0.118343286216259,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.8108388781547546,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9807842969894409,
"step": 299
},
{
"completion_length": 183.0,
"epoch": 0.96,
"grad_norm": 12.267927169799805,
"kl": 0.142578125,
"learning_rate": 6.262499999999999e-07,
"loss": 0.0014,
"reward": 3.7959177494049072,
"reward_std": 0.09426255617290735,
"rewards/answer_entity_reward": 0.9763257503509521,
"rewards/answer_wer_reward": 0.963774561882019,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8558174073696136,
"step": 300
},
{
"completion_length": 255.28125,
"epoch": 0.9632,
"grad_norm": 1.5198532342910767,
"kl": 0.0638427734375,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0006,
"reward": 3.8590621948242188,
"reward_std": 0.05621089227497578,
"rewards/answer_entity_reward": 0.9652777910232544,
"rewards/answer_wer_reward": 0.8950084447860718,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987758696079254,
"step": 301
},
{
"completion_length": 231.09375,
"epoch": 0.9664,
"grad_norm": 2.063969135284424,
"kl": 0.0770263671875,
"learning_rate": 6.2375e-07,
"loss": 0.0008,
"reward": 3.8598477840423584,
"reward_std": 0.04335158132016659,
"rewards/answer_entity_reward": 0.9843385815620422,
"rewards/answer_wer_reward": 0.8991816341876984,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.976327657699585,
"step": 302
},
{
"completion_length": 214.1875,
"epoch": 0.9696,
"grad_norm": 4.762388706207275,
"kl": 0.09765625,
"learning_rate": 6.225000000000001e-07,
"loss": 0.001,
"reward": 3.86174213886261,
"reward_std": 0.03313549840822816,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9338361918926239,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9313782155513763,
"step": 303
},
{
"completion_length": 232.5625,
"epoch": 0.9728,
"grad_norm": 2.811995506286621,
"kl": 0.10595703125,
"learning_rate": 6.2125e-07,
"loss": 0.0011,
"reward": 3.732570767402649,
"reward_std": 0.14181919861584902,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9169972240924835,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.846823513507843,
"step": 304
},
{
"completion_length": 221.96875,
"epoch": 0.976,
"grad_norm": 2.424633741378784,
"kl": 0.0677490234375,
"learning_rate": 6.2e-07,
"loss": 0.0007,
"reward": 3.9095277786254883,
"reward_std": 0.047814636724069715,
"rewards/answer_entity_reward": 0.9927884638309479,
"rewards/answer_wer_reward": 0.93398517370224,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9827540516853333,
"step": 305
},
{
"completion_length": 273.8125,
"epoch": 0.9792,
"grad_norm": 1.3363338708877563,
"kl": 0.0654296875,
"learning_rate": 6.1875e-07,
"loss": 0.0007,
"reward": 3.8615630865097046,
"reward_std": 0.029406235553324223,
"rewards/answer_entity_reward": 0.9869123697280884,
"rewards/answer_wer_reward": 0.8774734139442444,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9971773624420166,
"step": 306
},
{
"completion_length": 243.34375,
"epoch": 0.9824,
"grad_norm": 3.1950275897979736,
"kl": 0.05810546875,
"learning_rate": 6.175e-07,
"loss": 0.0006,
"reward": 3.898465633392334,
"reward_std": 0.022021150682121515,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.903068333864212,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998238205909729,
"step": 307
},
{
"completion_length": 230.375,
"epoch": 0.9856,
"grad_norm": 1.1819887161254883,
"kl": 0.075927734375,
"learning_rate": 6.162499999999999e-07,
"loss": 0.0008,
"reward": 3.9233819246292114,
"reward_std": 0.01652457471936941,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9294087886810303,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968140125274658,
"step": 308
},
{
"completion_length": 193.78125,
"epoch": 0.9888,
"grad_norm": 3.613255739212036,
"kl": 0.089111328125,
"learning_rate": 6.149999999999999e-07,
"loss": 0.0009,
"reward": 3.9530293941497803,
"reward_std": 0.013143055606633425,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9536189138889313,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994103908538818,
"step": 309
},
{
"completion_length": 223.1875,
"epoch": 0.992,
"grad_norm": 2.9832558631896973,
"kl": 0.076904296875,
"learning_rate": 6.1375e-07,
"loss": 0.0008,
"reward": 3.9074047803878784,
"reward_std": 0.03526896797120571,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9134717583656311,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9939330518245697,
"step": 310
},
{
"completion_length": 202.78125,
"epoch": 0.9952,
"grad_norm": 1.6509346961975098,
"kl": 0.100830078125,
"learning_rate": 6.125000000000001e-07,
"loss": 0.001,
"reward": 3.897627115249634,
"reward_std": 0.025366032496094704,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9004680216312408,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 311
},
{
"completion_length": 227.75,
"epoch": 0.9984,
"grad_norm": 2.9892170429229736,
"kl": 0.091064453125,
"learning_rate": 6.1125e-07,
"loss": 0.0009,
"reward": 3.879219174385071,
"reward_std": 0.04558245837688446,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9074902236461639,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9802517294883728,
"step": 312
},
{
"completion_length": 165.125,
"epoch": 1.0,
"grad_norm": 1.1831876039505005,
"kl": 0.09814453125,
"learning_rate": 6.1e-07,
"loss": 0.0005,
"reward": 3.956197738647461,
"reward_std": 0.047231610864400864,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.985044002532959,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9711538553237915,
"step": 313
},
{
"completion_length": 194.875,
"epoch": 1.0032,
"grad_norm": 1.1336063146591187,
"kl": 0.10302734375,
"learning_rate": 6.0875e-07,
"loss": 0.001,
"reward": 3.955459713935852,
"reward_std": 0.010184567421674728,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9638065993785858,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9964607954025269,
"step": 314
},
{
"completion_length": 172.1875,
"epoch": 1.0064,
"grad_norm": 7.745497226715088,
"kl": 0.099609375,
"learning_rate": 6.075e-07,
"loss": 0.001,
"reward": 3.9203338623046875,
"reward_std": 0.005493420176208019,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9503339529037476,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9699999988079071,
"step": 315
},
{
"completion_length": 216.90625,
"epoch": 1.0096,
"grad_norm": 5.326587200164795,
"kl": 0.076904296875,
"learning_rate": 6.062499999999999e-07,
"loss": 0.0008,
"reward": 3.8242450952529907,
"reward_std": 0.04496973566710949,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9261577427387238,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8980873227119446,
"step": 316
},
{
"completion_length": 179.59375,
"epoch": 1.0128,
"grad_norm": 1.887527346611023,
"kl": 0.0675048828125,
"learning_rate": 6.049999999999999e-07,
"loss": 0.0007,
"reward": 3.9317299127578735,
"reward_std": 0.023447751067578793,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9398273527622223,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963668584823608,
"step": 317
},
{
"completion_length": 215.1875,
"epoch": 1.016,
"grad_norm": 2.478510618209839,
"kl": 0.060791015625,
"learning_rate": 6.037499999999999e-07,
"loss": 0.0006,
"reward": 3.8788411617279053,
"reward_std": 0.020661167800426483,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.8995265662670135,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9862589836120605,
"step": 318
},
{
"completion_length": 205.5,
"epoch": 1.0192,
"grad_norm": 1.7058178186416626,
"kl": 0.0830078125,
"learning_rate": 6.025000000000001e-07,
"loss": 0.0008,
"reward": 3.807918906211853,
"reward_std": 0.04822289012372494,
"rewards/answer_entity_reward": 0.9788461625576019,
"rewards/answer_wer_reward": 0.8715765476226807,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9574962258338928,
"step": 319
},
{
"completion_length": 242.46875,
"epoch": 1.0224,
"grad_norm": 1.7695921659469604,
"kl": 0.0859375,
"learning_rate": 6.0125e-07,
"loss": 0.0009,
"reward": 3.9255610704421997,
"reward_std": 0.019923360086977482,
"rewards/answer_entity_reward": 0.9906516969203949,
"rewards/answer_wer_reward": 0.9401695132255554,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9947398900985718,
"step": 320
},
{
"completion_length": 175.40625,
"epoch": 1.0256,
"grad_norm": 2.60329270362854,
"kl": 0.085693359375,
"learning_rate": 6e-07,
"loss": 0.0009,
"reward": 3.9218677282333374,
"reward_std": 0.008750536944717169,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9223886132240295,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994791746139526,
"step": 321
},
{
"completion_length": 157.875,
"epoch": 1.0288,
"grad_norm": 5.270680904388428,
"kl": 0.120361328125,
"learning_rate": 5.9875e-07,
"loss": 0.0012,
"reward": 3.8664562702178955,
"reward_std": 0.03370736539363861,
"rewards/answer_entity_reward": 0.9868055582046509,
"rewards/answer_wer_reward": 0.9486467838287354,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9310039281845093,
"step": 322
},
{
"completion_length": 202.15625,
"epoch": 1.032,
"grad_norm": 0.9677954316139221,
"kl": 0.072998046875,
"learning_rate": 5.975e-07,
"loss": 0.0007,
"reward": 3.9512887001037598,
"reward_std": 0.008498450508341193,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9516439437866211,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996448755264282,
"step": 323
},
{
"completion_length": 183.21875,
"epoch": 1.0352,
"grad_norm": 8.04370403289795,
"kl": 0.0908203125,
"learning_rate": 5.962499999999999e-07,
"loss": 0.0009,
"reward": 3.810960531234741,
"reward_std": 0.017052859999239445,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9431954920291901,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8816540241241455,
"step": 324
},
{
"completion_length": 209.125,
"epoch": 1.0384,
"grad_norm": 1.1835105419158936,
"kl": 0.09326171875,
"learning_rate": 5.949999999999999e-07,
"loss": 0.0009,
"reward": 3.9159555435180664,
"reward_std": 0.02768123522400856,
"rewards/answer_entity_reward": 0.9866695702075958,
"rewards/answer_wer_reward": 0.930209755897522,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990763068199158,
"step": 325
},
{
"completion_length": 202.15625,
"epoch": 1.0416,
"grad_norm": 1.198609471321106,
"kl": 0.0748291015625,
"learning_rate": 5.937499999999999e-07,
"loss": 0.0007,
"reward": 3.85296094417572,
"reward_std": 0.19228698359802365,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9154608845710754,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.96875,
"step": 326
},
{
"completion_length": 187.9375,
"epoch": 1.0448,
"grad_norm": 3.9246749877929688,
"kl": 0.08740234375,
"learning_rate": 5.925e-07,
"loss": 0.0009,
"reward": 3.8706984519958496,
"reward_std": 0.046023860573768616,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9316051602363586,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9443379342556,
"step": 327
},
{
"completion_length": 206.5625,
"epoch": 1.048,
"grad_norm": 2.1665873527526855,
"kl": 0.111083984375,
"learning_rate": 5.912500000000001e-07,
"loss": 0.0011,
"reward": 3.8563778400421143,
"reward_std": 0.02296618465334177,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9032285511493683,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9531491994857788,
"step": 328
},
{
"completion_length": 202.71875,
"epoch": 1.0512,
"grad_norm": 2.493177890777588,
"kl": 0.087646484375,
"learning_rate": 5.9e-07,
"loss": 0.0009,
"reward": 3.8221092224121094,
"reward_std": 0.13764610793441534,
"rewards/answer_entity_reward": 0.9418402910232544,
"rewards/answer_wer_reward": 0.8825558722019196,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977130889892578,
"step": 329
},
{
"completion_length": 200.0,
"epoch": 1.0544,
"grad_norm": 1.2568529844284058,
"kl": 0.114013671875,
"learning_rate": 5.8875e-07,
"loss": 0.0011,
"reward": 3.934491515159607,
"reward_std": 0.012761063873767853,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9390542805194855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9954372346401215,
"step": 330
},
{
"completion_length": 212.71875,
"epoch": 1.0576,
"grad_norm": 1.3623089790344238,
"kl": 0.086669921875,
"learning_rate": 5.875e-07,
"loss": 0.0009,
"reward": 3.8928335905075073,
"reward_std": 0.03161040600389242,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.8996903300285339,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994563460350037,
"step": 331
},
{
"completion_length": 240.3125,
"epoch": 1.0608,
"grad_norm": 1.2754676342010498,
"kl": 0.0615234375,
"learning_rate": 5.8625e-07,
"loss": 0.0006,
"reward": 3.925002932548523,
"reward_std": 0.0067287166602909565,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9267281293869019,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982748329639435,
"step": 332
},
{
"completion_length": 218.6875,
"epoch": 1.064,
"grad_norm": 1.989392638206482,
"kl": 0.073486328125,
"learning_rate": 5.849999999999999e-07,
"loss": 0.0007,
"reward": 3.9305100440979004,
"reward_std": 0.014313624240458012,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9314764738082886,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999033510684967,
"step": 333
},
{
"completion_length": 187.125,
"epoch": 1.0672,
"grad_norm": 4.332698822021484,
"kl": 0.11474609375,
"learning_rate": 5.837499999999999e-07,
"loss": 0.0011,
"reward": 3.9111961126327515,
"reward_std": 0.017924371175467968,
"rewards/answer_entity_reward": 0.9967105388641357,
"rewards/answer_wer_reward": 0.9153991043567657,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990865290164948,
"step": 334
},
{
"completion_length": 240.8125,
"epoch": 1.0704,
"grad_norm": 0.991020143032074,
"kl": 0.0609130859375,
"learning_rate": 5.825e-07,
"loss": 0.0006,
"reward": 3.9502662420272827,
"reward_std": 0.006167408544570208,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9529542922973633,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973118305206299,
"step": 335
},
{
"completion_length": 250.625,
"epoch": 1.0735999999999999,
"grad_norm": 2.3996546268463135,
"kl": 0.06396484375,
"learning_rate": 5.8125e-07,
"loss": 0.0006,
"reward": 3.899760365486145,
"reward_std": 0.02179525839164853,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9130350351333618,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9867254495620728,
"step": 336
},
{
"completion_length": 193.9375,
"epoch": 1.0768,
"grad_norm": 3.6998724937438965,
"kl": 0.090576171875,
"learning_rate": 5.8e-07,
"loss": 0.0009,
"reward": 3.8309794664382935,
"reward_std": 0.01553899934515357,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9471099972724915,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.895233154296875,
"step": 337
},
{
"completion_length": 221.90625,
"epoch": 1.08,
"grad_norm": 1.1334843635559082,
"kl": 0.0587158203125,
"learning_rate": 5.7875e-07,
"loss": 0.0006,
"reward": 3.936136484146118,
"reward_std": 0.012863298412412405,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9376117587089539,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985246956348419,
"step": 338
},
{
"completion_length": 242.125,
"epoch": 1.0832,
"grad_norm": 1.0358681678771973,
"kl": 0.0643310546875,
"learning_rate": 5.775e-07,
"loss": 0.0007,
"reward": 3.887587547302246,
"reward_std": 0.0230812830850482,
"rewards/answer_entity_reward": 0.9798610806465149,
"rewards/answer_wer_reward": 0.9077264070510864,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 339
},
{
"completion_length": 213.9375,
"epoch": 1.0864,
"grad_norm": 24.39422035217285,
"kl": 0.080078125,
"learning_rate": 5.7625e-07,
"loss": 0.0008,
"reward": 3.887939691543579,
"reward_std": 0.014108296483755112,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8971990048885345,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9907407462596893,
"step": 340
},
{
"completion_length": 205.375,
"epoch": 1.0896,
"grad_norm": 1.204923152923584,
"kl": 0.1015625,
"learning_rate": 5.749999999999999e-07,
"loss": 0.001,
"reward": 3.819010019302368,
"reward_std": 0.24664557841606438,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9135412275791168,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.936718761920929,
"step": 341
},
{
"completion_length": 231.09375,
"epoch": 1.0928,
"grad_norm": 0.831721842288971,
"kl": 0.06884765625,
"learning_rate": 5.737499999999999e-07,
"loss": 0.0007,
"reward": 3.9083417654037476,
"reward_std": 0.023847888689488173,
"rewards/answer_entity_reward": 0.9902146458625793,
"rewards/answer_wer_reward": 0.9184364974498749,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999690592288971,
"step": 342
},
{
"completion_length": 225.28125,
"epoch": 1.096,
"grad_norm": 1.239318609237671,
"kl": 0.070068359375,
"learning_rate": 5.725e-07,
"loss": 0.0007,
"reward": 3.8802337646484375,
"reward_std": 0.019388118293136358,
"rewards/answer_entity_reward": 0.9895833730697632,
"rewards/answer_wer_reward": 0.8906503319740295,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 343
},
{
"completion_length": 182.25,
"epoch": 1.0992,
"grad_norm": 2.810415267944336,
"kl": 0.08349609375,
"learning_rate": 5.7125e-07,
"loss": 0.0008,
"reward": 3.8992663621902466,
"reward_std": 0.017442656215280294,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9195939302444458,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9825133979320526,
"step": 344
},
{
"completion_length": 228.9375,
"epoch": 1.1024,
"grad_norm": 2.4584133625030518,
"kl": 0.11376953125,
"learning_rate": 5.699999999999999e-07,
"loss": 0.0011,
"reward": 3.893067240715027,
"reward_std": 0.024248626083135605,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9001834988594055,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963560402393341,
"step": 345
},
{
"completion_length": 154.46875,
"epoch": 1.1056,
"grad_norm": 2.5888006687164307,
"kl": 0.1025390625,
"learning_rate": 5.6875e-07,
"loss": 0.001,
"reward": 3.8254867792129517,
"reward_std": 0.031096864491701126,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9297608137130737,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8957259356975555,
"step": 346
},
{
"completion_length": 174.0625,
"epoch": 1.1088,
"grad_norm": 2.087509870529175,
"kl": 0.12158203125,
"learning_rate": 5.675e-07,
"loss": 0.0012,
"reward": 3.920476198196411,
"reward_std": 0.017223183065652847,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9334003627300262,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9870758056640625,
"step": 347
},
{
"completion_length": 209.3125,
"epoch": 1.112,
"grad_norm": 1.5391756296157837,
"kl": 0.105712890625,
"learning_rate": 5.6625e-07,
"loss": 0.0011,
"reward": 3.9325058460235596,
"reward_std": 0.011998760513961315,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9345271587371826,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979787170886993,
"step": 348
},
{
"completion_length": 211.9375,
"epoch": 1.1152,
"grad_norm": 2.1449012756347656,
"kl": 0.072021484375,
"learning_rate": 5.649999999999999e-07,
"loss": 0.0007,
"reward": 3.887805461883545,
"reward_std": 0.01465547364205122,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.8914407789707184,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998202919960022,
"step": 349
},
{
"completion_length": 219.875,
"epoch": 1.1184,
"grad_norm": 2.7394628524780273,
"kl": 0.065185546875,
"learning_rate": 5.637499999999999e-07,
"loss": 0.0007,
"reward": 3.905122399330139,
"reward_std": 0.014080648310482502,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9162788391113281,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9888435006141663,
"step": 350
},
{
"completion_length": 191.28125,
"epoch": 1.1216,
"grad_norm": 2.381448745727539,
"kl": 0.0721435546875,
"learning_rate": 5.625e-07,
"loss": 0.0007,
"reward": 3.8880510330200195,
"reward_std": 0.04133735504001379,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9348196983337402,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9576955437660217,
"step": 351
},
{
"completion_length": 263.5625,
"epoch": 1.1248,
"grad_norm": 1.0376274585723877,
"kl": 0.0584716796875,
"learning_rate": 5.6125e-07,
"loss": 0.0006,
"reward": 3.8982614278793335,
"reward_std": 0.012545288074761629,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9007040560245514,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999395489692688,
"step": 352
},
{
"completion_length": 218.59375,
"epoch": 1.1280000000000001,
"grad_norm": 1.5081944465637207,
"kl": 0.10009765625,
"learning_rate": 5.6e-07,
"loss": 0.001,
"reward": 3.9146311283111572,
"reward_std": 0.021717723459005356,
"rewards/answer_entity_reward": 0.9917200803756714,
"rewards/answer_wer_reward": 0.9235903024673462,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993206560611725,
"step": 353
},
{
"completion_length": 226.84375,
"epoch": 1.1312,
"grad_norm": 1.0990034341812134,
"kl": 0.063720703125,
"learning_rate": 5.587499999999999e-07,
"loss": 0.0006,
"reward": 3.9005931615829468,
"reward_std": 0.018239760771393776,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9203313589096069,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9874734580516815,
"step": 354
},
{
"completion_length": 238.59375,
"epoch": 1.1344,
"grad_norm": 10.765813827514648,
"kl": 0.056884765625,
"learning_rate": 5.575e-07,
"loss": 0.0006,
"reward": 3.9274662733078003,
"reward_std": 0.016329116653651,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9380079507827759,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9894582629203796,
"step": 355
},
{
"completion_length": 216.1875,
"epoch": 1.1376,
"grad_norm": 6.097777843475342,
"kl": 0.43701171875,
"learning_rate": 5.5625e-07,
"loss": 0.0044,
"reward": 3.6753621101379395,
"reward_std": 0.09127287194132805,
"rewards/answer_entity_reward": 0.9843385517597198,
"rewards/answer_wer_reward": 0.9279595017433167,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.7630640268325806,
"step": 356
},
{
"completion_length": 233.25,
"epoch": 1.1408,
"grad_norm": 1.9484727382659912,
"kl": 0.07470703125,
"learning_rate": 5.55e-07,
"loss": 0.0007,
"reward": 3.8734058141708374,
"reward_std": 0.026476514525711536,
"rewards/answer_entity_reward": 0.9829545617103577,
"rewards/answer_wer_reward": 0.906408816576004,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9840425550937653,
"step": 357
},
{
"completion_length": 224.125,
"epoch": 1.144,
"grad_norm": 1.650207757949829,
"kl": 0.071533203125,
"learning_rate": 5.5375e-07,
"loss": 0.0007,
"reward": 3.9309768676757812,
"reward_std": 0.016152822878211737,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9357885122299194,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986605942249298,
"step": 358
},
{
"completion_length": 202.8125,
"epoch": 1.1472,
"grad_norm": 2.33708119392395,
"kl": 0.102294921875,
"learning_rate": 5.525e-07,
"loss": 0.001,
"reward": 3.901100993156433,
"reward_std": 0.06198639050126076,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9566735327243805,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9444274306297302,
"step": 359
},
{
"completion_length": 231.03125,
"epoch": 1.1504,
"grad_norm": 2.603564977645874,
"kl": 0.0662841796875,
"learning_rate": 5.5125e-07,
"loss": 0.0007,
"reward": 3.8539780378341675,
"reward_std": 0.04134450480341911,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.8810023069381714,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.975816547870636,
"step": 360
},
{
"completion_length": 176.8125,
"epoch": 1.1536,
"grad_norm": 1.9730738401412964,
"kl": 0.0673828125,
"learning_rate": 5.5e-07,
"loss": 0.0007,
"reward": 3.946772813796997,
"reward_std": 0.007931779837235808,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9499374032020569,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968354403972626,
"step": 361
},
{
"completion_length": 205.28125,
"epoch": 1.1568,
"grad_norm": 2.6627304553985596,
"kl": 0.0997314453125,
"learning_rate": 5.487499999999999e-07,
"loss": 0.001,
"reward": 3.914576292037964,
"reward_std": 0.015826540999114513,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9475591778755188,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9670170843601227,
"step": 362
},
{
"completion_length": 199.9375,
"epoch": 1.16,
"grad_norm": 2.073272466659546,
"kl": 0.091064453125,
"learning_rate": 5.474999999999999e-07,
"loss": 0.0009,
"reward": 3.89456570148468,
"reward_std": 0.008259527385234833,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9333997070789337,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9611659348011017,
"step": 363
},
{
"completion_length": 222.0625,
"epoch": 1.1632,
"grad_norm": 1.7804555892944336,
"kl": 0.1220703125,
"learning_rate": 5.4625e-07,
"loss": 0.0012,
"reward": 3.847594380378723,
"reward_std": 0.09885499440133572,
"rewards/answer_entity_reward": 0.9692708849906921,
"rewards/answer_wer_reward": 0.8783235251903534,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 364
},
{
"completion_length": 206.71875,
"epoch": 1.1663999999999999,
"grad_norm": 1.6756658554077148,
"kl": 0.097900390625,
"learning_rate": 5.45e-07,
"loss": 0.001,
"reward": 3.866326928138733,
"reward_std": 0.027653913479298353,
"rewards/answer_entity_reward": 0.990705132484436,
"rewards/answer_wer_reward": 0.9324296712875366,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9431920945644379,
"step": 365
},
{
"completion_length": 187.125,
"epoch": 1.1696,
"grad_norm": 1.6528626680374146,
"kl": 0.075439453125,
"learning_rate": 5.4375e-07,
"loss": 0.0008,
"reward": 3.821729063987732,
"reward_std": 0.14681637566536665,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.896637350320816,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9611493647098541,
"step": 366
},
{
"completion_length": 180.5625,
"epoch": 1.1728,
"grad_norm": 2.211965560913086,
"kl": 0.10302734375,
"learning_rate": 5.425e-07,
"loss": 0.001,
"reward": 3.857783317565918,
"reward_std": 0.13934296648949385,
"rewards/answer_entity_reward": 0.9847756326198578,
"rewards/answer_wer_reward": 0.9358752965927124,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9683823585510254,
"step": 367
},
{
"completion_length": 208.21875,
"epoch": 1.176,
"grad_norm": 2.522264242172241,
"kl": 0.060546875,
"learning_rate": 5.4125e-07,
"loss": 0.0006,
"reward": 3.8018884658813477,
"reward_std": 0.07955996971577406,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.8091042637825012,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962564706802368,
"step": 368
},
{
"completion_length": 192.84375,
"epoch": 1.1792,
"grad_norm": 1.4488089084625244,
"kl": 0.0791015625,
"learning_rate": 5.4e-07,
"loss": 0.0008,
"reward": 3.940070152282715,
"reward_std": 0.008247917518019676,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9410351514816284,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999035120010376,
"step": 369
},
{
"completion_length": 244.96875,
"epoch": 1.1824,
"grad_norm": 5.085299968719482,
"kl": 0.109130859375,
"learning_rate": 5.387499999999999e-07,
"loss": 0.0011,
"reward": 3.834069848060608,
"reward_std": 0.027521015144884586,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.907810240983963,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9319414496421814,
"step": 370
},
{
"completion_length": 223.25,
"epoch": 1.1856,
"grad_norm": 2.248169183731079,
"kl": 0.1083984375,
"learning_rate": 5.374999999999999e-07,
"loss": 0.0011,
"reward": 3.9311490058898926,
"reward_std": 0.011384843150153756,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.931148886680603,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 371
},
{
"completion_length": 237.3125,
"epoch": 1.1888,
"grad_norm": 1.0549304485321045,
"kl": 0.05419921875,
"learning_rate": 5.3625e-07,
"loss": 0.0005,
"reward": 3.890028476715088,
"reward_std": 0.012344780378043652,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8915461599826813,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984822571277618,
"step": 372
},
{
"completion_length": 216.0625,
"epoch": 1.192,
"grad_norm": 1.3054077625274658,
"kl": 0.0694580078125,
"learning_rate": 5.35e-07,
"loss": 0.0007,
"reward": 3.8679678440093994,
"reward_std": 0.016808426938951015,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8875625133514404,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.980405330657959,
"step": 373
},
{
"completion_length": 222.34375,
"epoch": 1.1952,
"grad_norm": 10.381876945495605,
"kl": 0.067626953125,
"learning_rate": 5.3375e-07,
"loss": 0.0007,
"reward": 3.946020483970642,
"reward_std": 0.016021378338336945,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9507038593292236,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977203905582428,
"step": 374
},
{
"completion_length": 208.875,
"epoch": 1.1984,
"grad_norm": 2.7493553161621094,
"kl": 0.13525390625,
"learning_rate": 5.325e-07,
"loss": 0.0014,
"reward": 3.942535161972046,
"reward_std": 0.01458098879083991,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.9490944147109985,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997539520263672,
"step": 375
},
{
"completion_length": 252.09375,
"epoch": 1.2016,
"grad_norm": 1.9127050638198853,
"kl": 0.079345703125,
"learning_rate": 5.3125e-07,
"loss": 0.0008,
"reward": 3.8897405862808228,
"reward_std": 0.015877339988946915,
"rewards/answer_entity_reward": 0.9888257682323456,
"rewards/answer_wer_reward": 0.9078421294689178,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9930725991725922,
"step": 376
},
{
"completion_length": 186.375,
"epoch": 1.2048,
"grad_norm": 1.832676887512207,
"kl": 0.096435546875,
"learning_rate": 5.3e-07,
"loss": 0.001,
"reward": 3.9009323120117188,
"reward_std": 0.013205710332840681,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.940411388874054,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9653284549713135,
"step": 377
},
{
"completion_length": 224.46875,
"epoch": 1.208,
"grad_norm": 1.1020106077194214,
"kl": 0.0638427734375,
"learning_rate": 5.2875e-07,
"loss": 0.0006,
"reward": 3.9539231061935425,
"reward_std": 0.005315458634868264,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9545543491840363,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993686676025391,
"step": 378
},
{
"completion_length": 158.25,
"epoch": 1.2112,
"grad_norm": 2.493016481399536,
"kl": 0.123779296875,
"learning_rate": 5.274999999999999e-07,
"loss": 0.0012,
"reward": 3.921034097671509,
"reward_std": 0.009559540543705225,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9532065689563751,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9678275287151337,
"step": 379
},
{
"completion_length": 253.4375,
"epoch": 1.2144,
"grad_norm": 1.1055541038513184,
"kl": 0.067626953125,
"learning_rate": 5.262499999999999e-07,
"loss": 0.0007,
"reward": 3.8998262882232666,
"reward_std": 0.021630683913826942,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9026672542095184,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 380
},
{
"completion_length": 211.28125,
"epoch": 1.2176,
"grad_norm": 2.4898200035095215,
"kl": 0.072998046875,
"learning_rate": 5.25e-07,
"loss": 0.0007,
"reward": 3.8961129188537598,
"reward_std": 0.02530479012057185,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9441809356212616,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9547730088233948,
"step": 381
},
{
"completion_length": 241.34375,
"epoch": 1.2208,
"grad_norm": 1.5863702297210693,
"kl": 0.09033203125,
"learning_rate": 5.237500000000001e-07,
"loss": 0.0009,
"reward": 3.9048832654953003,
"reward_std": 0.02675863727927208,
"rewards/answer_entity_reward": 0.9836346209049225,
"rewards/answer_wer_reward": 0.9218496978282928,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993990361690521,
"step": 382
},
{
"completion_length": 244.90625,
"epoch": 1.224,
"grad_norm": 1.3265018463134766,
"kl": 0.08984375,
"learning_rate": 5.225e-07,
"loss": 0.0009,
"reward": 3.9047261476516724,
"reward_std": 0.013275579549372196,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9047262072563171,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 383
},
{
"completion_length": 192.3125,
"epoch": 1.2272,
"grad_norm": 2.3593811988830566,
"kl": 0.09521484375,
"learning_rate": 5.2125e-07,
"loss": 0.0009,
"reward": 3.893195629119873,
"reward_std": 0.03080725111067295,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9534947872161865,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9441652297973633,
"step": 384
},
{
"completion_length": 218.65625,
"epoch": 1.2304,
"grad_norm": 2.7099356651306152,
"kl": 0.06982421875,
"learning_rate": 5.2e-07,
"loss": 0.0007,
"reward": 3.8559422492980957,
"reward_std": 0.0489511676132679,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9215229749679565,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9344193935394287,
"step": 385
},
{
"completion_length": 168.375,
"epoch": 1.2336,
"grad_norm": 3.930095672607422,
"kl": 0.109130859375,
"learning_rate": 5.1875e-07,
"loss": 0.0011,
"reward": 3.848017930984497,
"reward_std": 0.043564099818468094,
"rewards/answer_entity_reward": 0.96875,
"rewards/answer_wer_reward": 0.9368032217025757,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9424647688865662,
"step": 386
},
{
"completion_length": 183.1875,
"epoch": 1.2368000000000001,
"grad_norm": 7.302414894104004,
"kl": 0.1279296875,
"learning_rate": 5.174999999999999e-07,
"loss": 0.0013,
"reward": 3.7856842279434204,
"reward_std": 0.026621405966579914,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9465668201446533,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8460619151592255,
"step": 387
},
{
"completion_length": 246.0,
"epoch": 1.24,
"grad_norm": 1.0175095796585083,
"kl": 0.06591796875,
"learning_rate": 5.162499999999999e-07,
"loss": 0.0007,
"reward": 3.923374652862549,
"reward_std": 0.011706824880093336,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9246262907981873,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987484514713287,
"step": 388
},
{
"completion_length": 214.34375,
"epoch": 1.2432,
"grad_norm": 0.9391213655471802,
"kl": 0.072021484375,
"learning_rate": 5.149999999999999e-07,
"loss": 0.0007,
"reward": 3.9639917612075806,
"reward_std": 0.009625846752896905,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9695361256599426,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992632865905762,
"step": 389
},
{
"completion_length": 237.8125,
"epoch": 1.2464,
"grad_norm": 1.1664483547210693,
"kl": 0.07568359375,
"learning_rate": 5.137500000000001e-07,
"loss": 0.0008,
"reward": 3.935038685798645,
"reward_std": 0.018754366785287857,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9395028948783875,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 390
},
{
"completion_length": 221.09375,
"epoch": 1.2496,
"grad_norm": 1.0274744033813477,
"kl": 0.06591796875,
"learning_rate": 5.125e-07,
"loss": 0.0007,
"reward": 3.9391175508499146,
"reward_std": 0.008871730417013168,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9511756002902985,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993055462837219,
"step": 391
},
{
"completion_length": 216.53125,
"epoch": 1.2528000000000001,
"grad_norm": 1.4062410593032837,
"kl": 0.0712890625,
"learning_rate": 5.1125e-07,
"loss": 0.0007,
"reward": 3.8631064891815186,
"reward_std": 0.02681769710034132,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.9219101965427399,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9516128897666931,
"step": 392
},
{
"completion_length": 141.9375,
"epoch": 1.256,
"grad_norm": 9.963582038879395,
"kl": 0.12841796875,
"learning_rate": 5.1e-07,
"loss": 0.0013,
"reward": 3.886857271194458,
"reward_std": 0.011839461978524923,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9376890957355499,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9520089328289032,
"step": 393
},
{
"completion_length": 223.75,
"epoch": 1.2591999999999999,
"grad_norm": 3.129469156265259,
"kl": 0.06201171875,
"learning_rate": 5.0875e-07,
"loss": 0.0006,
"reward": 3.8934308290481567,
"reward_std": 0.04124835692346096,
"rewards/answer_entity_reward": 0.9847756326198578,
"rewards/answer_wer_reward": 0.9095006585121155,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991544783115387,
"step": 394
},
{
"completion_length": 194.21875,
"epoch": 1.2624,
"grad_norm": 8.187355995178223,
"kl": 0.0849609375,
"learning_rate": 5.074999999999999e-07,
"loss": 0.0008,
"reward": 3.8118830919265747,
"reward_std": 0.03198861540295184,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8248356580734253,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9870474934577942,
"step": 395
},
{
"completion_length": 216.0625,
"epoch": 1.2656,
"grad_norm": 1.9981720447540283,
"kl": 0.08349609375,
"learning_rate": 5.062499999999999e-07,
"loss": 0.0008,
"reward": 3.876628041267395,
"reward_std": 0.030061259865760803,
"rewards/answer_entity_reward": 0.9899572730064392,
"rewards/answer_wer_reward": 0.9325210154056549,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9541498124599457,
"step": 396
},
{
"completion_length": 244.9375,
"epoch": 1.2688,
"grad_norm": 1.46060311794281,
"kl": 0.08740234375,
"learning_rate": 5.049999999999999e-07,
"loss": 0.0009,
"reward": 3.9221689701080322,
"reward_std": 0.016801749356091022,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9224453568458557,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997234344482422,
"step": 397
},
{
"completion_length": 171.78125,
"epoch": 1.272,
"grad_norm": 2.054922342300415,
"kl": 0.116455078125,
"learning_rate": 5.0375e-07,
"loss": 0.0012,
"reward": 3.922398328781128,
"reward_std": 0.015158042311668396,
"rewards/answer_entity_reward": 0.9818181991577148,
"rewards/answer_wer_reward": 0.9412411153316498,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993391633033752,
"step": 398
},
{
"completion_length": 223.6875,
"epoch": 1.2752,
"grad_norm": 4.638472557067871,
"kl": 0.0859375,
"learning_rate": 5.025e-07,
"loss": 0.0009,
"reward": 3.928803563117981,
"reward_std": 0.015867930836975574,
"rewards/answer_entity_reward": 0.9790209829807281,
"rewards/answer_wer_reward": 0.9508891105651855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988935589790344,
"step": 399
},
{
"completion_length": 180.40625,
"epoch": 1.2784,
"grad_norm": 17.943954467773438,
"kl": 0.09228515625,
"learning_rate": 5.0125e-07,
"loss": 0.0009,
"reward": 3.918807029724121,
"reward_std": 0.010303683578968048,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9271402955055237,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 400
},
{
"completion_length": 208.59375,
"epoch": 1.2816,
"grad_norm": 2.634068489074707,
"kl": 0.10400390625,
"learning_rate": 5e-07,
"loss": 0.001,
"reward": 3.825340986251831,
"reward_std": 0.0303196981549263,
"rewards/answer_entity_reward": 0.9871430397033691,
"rewards/answer_wer_reward": 0.9115504324436188,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9266475439071655,
"step": 401
},
{
"completion_length": 203.25,
"epoch": 1.2848,
"grad_norm": 1.149072289466858,
"kl": 0.066162109375,
"learning_rate": 4.9875e-07,
"loss": 0.0007,
"reward": 3.9346535205841064,
"reward_std": 0.009479325264692307,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9361503720283508,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985032677650452,
"step": 402
},
{
"completion_length": 214.875,
"epoch": 1.288,
"grad_norm": 1.2013689279556274,
"kl": 0.08447265625,
"learning_rate": 4.975e-07,
"loss": 0.0008,
"reward": 3.8625338077545166,
"reward_std": 0.012592533603310585,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9030886590480804,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9677785038948059,
"step": 403
},
{
"completion_length": 252.78125,
"epoch": 1.2912,
"grad_norm": 1.6769248247146606,
"kl": 0.066162109375,
"learning_rate": 4.9625e-07,
"loss": 0.0007,
"reward": 3.8860517740249634,
"reward_std": 0.034614769741892815,
"rewards/answer_entity_reward": 0.9836356937885284,
"rewards/answer_wer_reward": 0.9036450088024139,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987711310386658,
"step": 404
},
{
"completion_length": 213.96875,
"epoch": 1.2944,
"grad_norm": 1.5894328355789185,
"kl": 0.069580078125,
"learning_rate": 4.95e-07,
"loss": 0.0007,
"reward": 3.921362280845642,
"reward_std": 0.014703459106385708,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.92447629570961,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992897808551788,
"step": 405
},
{
"completion_length": 242.53125,
"epoch": 1.2976,
"grad_norm": 3.458373785018921,
"kl": 0.1416015625,
"learning_rate": 4.9375e-07,
"loss": 0.0014,
"reward": 3.7037495374679565,
"reward_std": 0.1908966824412346,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.8811471164226532,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.85972860455513,
"step": 406
},
{
"completion_length": 187.53125,
"epoch": 1.3008,
"grad_norm": 7.737911224365234,
"kl": 0.107666015625,
"learning_rate": 4.924999999999999e-07,
"loss": 0.0011,
"reward": 3.9244754314422607,
"reward_std": 0.021069620735943317,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9345695376396179,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9933781623840332,
"step": 407
},
{
"completion_length": 208.4375,
"epoch": 1.304,
"grad_norm": 2.0846338272094727,
"kl": 0.165771484375,
"learning_rate": 4.9125e-07,
"loss": 0.0017,
"reward": 3.9408286809921265,
"reward_std": 0.011463565286248922,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9510546028614044,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9897740483283997,
"step": 408
},
{
"completion_length": 198.28125,
"epoch": 1.3072,
"grad_norm": 2.9788646697998047,
"kl": 0.089111328125,
"learning_rate": 4.9e-07,
"loss": 0.0009,
"reward": 3.900764584541321,
"reward_std": 0.03450075723230839,
"rewards/answer_entity_reward": 0.9874475002288818,
"rewards/answer_wer_reward": 0.9153684377670288,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979486465454102,
"step": 409
},
{
"completion_length": 176.28125,
"epoch": 1.3104,
"grad_norm": 2.856952667236328,
"kl": 0.09228515625,
"learning_rate": 4.8875e-07,
"loss": 0.0009,
"reward": 3.9486716985702515,
"reward_std": 0.021902556531131268,
"rewards/answer_entity_reward": 0.9912830293178558,
"rewards/answer_wer_reward": 0.9610228836536407,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963657557964325,
"step": 410
},
{
"completion_length": 209.28125,
"epoch": 1.3136,
"grad_norm": 2.0441436767578125,
"kl": 0.08642578125,
"learning_rate": 4.875e-07,
"loss": 0.0009,
"reward": 3.916486144065857,
"reward_std": 0.018760663457214832,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9207533895969391,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994091391563416,
"step": 411
},
{
"completion_length": 232.21875,
"epoch": 1.3168,
"grad_norm": 1.071075201034546,
"kl": 0.0633544921875,
"learning_rate": 4.8625e-07,
"loss": 0.0006,
"reward": 3.9308619499206543,
"reward_std": 0.018531675916165113,
"rewards/answer_entity_reward": 0.9893162250518799,
"rewards/answer_wer_reward": 0.943993479013443,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975521564483643,
"step": 412
},
{
"completion_length": 210.0625,
"epoch": 1.32,
"grad_norm": 3.82405686378479,
"kl": 0.09326171875,
"learning_rate": 4.85e-07,
"loss": 0.0009,
"reward": 3.889458179473877,
"reward_std": 0.02208129083737731,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9473488032817841,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9449501633644104,
"step": 413
},
{
"completion_length": 197.15625,
"epoch": 1.3232,
"grad_norm": 1.4103983640670776,
"kl": 0.0849609375,
"learning_rate": 4.8375e-07,
"loss": 0.0009,
"reward": 3.9459102153778076,
"reward_std": 0.014464881271123886,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9481469988822937,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977630972862244,
"step": 414
},
{
"completion_length": 239.9375,
"epoch": 1.3264,
"grad_norm": 1.4598060846328735,
"kl": 0.06982421875,
"learning_rate": 4.824999999999999e-07,
"loss": 0.0007,
"reward": 3.862109899520874,
"reward_std": 0.07382148411124945,
"rewards/answer_entity_reward": 0.9833333194255829,
"rewards/answer_wer_reward": 0.9100264310836792,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.96875,
"step": 415
},
{
"completion_length": 184.3125,
"epoch": 1.3296000000000001,
"grad_norm": 0.6735196709632874,
"kl": 0.063720703125,
"learning_rate": 4.812499999999999e-07,
"loss": 0.0006,
"reward": 3.8697965145111084,
"reward_std": 0.18503482337109745,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.932296484708786,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.96875,
"step": 416
},
{
"completion_length": 172.1875,
"epoch": 1.3328,
"grad_norm": 1.7613649368286133,
"kl": 0.11962890625,
"learning_rate": 4.8e-07,
"loss": 0.0012,
"reward": 3.938371181488037,
"reward_std": 0.020422414876520634,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9546558260917664,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.988179475069046,
"step": 417
},
{
"completion_length": 223.09375,
"epoch": 1.336,
"grad_norm": 3.332552671432495,
"kl": 0.12841796875,
"learning_rate": 4.7875e-07,
"loss": 0.0013,
"reward": 3.9398679733276367,
"reward_std": 0.018179779406636953,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9447188973426819,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979900419712067,
"step": 418
},
{
"completion_length": 226.5,
"epoch": 1.3392,
"grad_norm": 1.4374769926071167,
"kl": 0.083984375,
"learning_rate": 4.775e-07,
"loss": 0.0008,
"reward": 3.891066312789917,
"reward_std": 0.02610717061907053,
"rewards/answer_entity_reward": 0.9841079115867615,
"rewards/answer_wer_reward": 0.9078442752361298,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991140365600586,
"step": 419
},
{
"completion_length": 196.84375,
"epoch": 1.3424,
"grad_norm": 1.7055010795593262,
"kl": 0.100830078125,
"learning_rate": 4.7625e-07,
"loss": 0.001,
"reward": 3.899353504180908,
"reward_std": 0.022911718115210533,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9394311308860779,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9684451520442963,
"step": 420
},
{
"completion_length": 223.34375,
"epoch": 1.3456000000000001,
"grad_norm": 2.624370574951172,
"kl": 0.13427734375,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.0013,
"reward": 3.8897502422332764,
"reward_std": 0.06373783992603421,
"rewards/answer_entity_reward": 0.9921875,
"rewards/answer_wer_reward": 0.9388905465602875,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9586721360683441,
"step": 421
},
{
"completion_length": 209.0,
"epoch": 1.3488,
"grad_norm": 2.2683520317077637,
"kl": 0.102294921875,
"learning_rate": 4.7374999999999996e-07,
"loss": 0.001,
"reward": 3.960143804550171,
"reward_std": 0.006363062420859933,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9613305628299713,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988133013248444,
"step": 422
},
{
"completion_length": 187.6875,
"epoch": 1.3519999999999999,
"grad_norm": 1.426279067993164,
"kl": 0.130859375,
"learning_rate": 4.725e-07,
"loss": 0.0013,
"reward": 3.904189109802246,
"reward_std": 0.017666546627879143,
"rewards/answer_entity_reward": 0.9875437021255493,
"rewards/answer_wer_reward": 0.9480733275413513,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9685720801353455,
"step": 423
},
{
"completion_length": 227.96875,
"epoch": 1.3552,
"grad_norm": 2.3656458854675293,
"kl": 0.202880859375,
"learning_rate": 4.7125e-07,
"loss": 0.002,
"reward": 3.8170067071914673,
"reward_std": 0.15287955617532134,
"rewards/answer_entity_reward": 0.993697464466095,
"rewards/answer_wer_reward": 0.9000802934169769,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9544788599014282,
"step": 424
},
{
"completion_length": 260.375,
"epoch": 1.3584,
"grad_norm": 19.11045265197754,
"kl": 0.0771484375,
"learning_rate": 4.6999999999999995e-07,
"loss": 0.0008,
"reward": 3.9173004627227783,
"reward_std": 0.02492327243089676,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9262253046035767,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9955623149871826,
"step": 425
},
{
"completion_length": 244.0625,
"epoch": 1.3616,
"grad_norm": 3.7110118865966797,
"kl": 0.0650634765625,
"learning_rate": 4.6874999999999996e-07,
"loss": 0.0007,
"reward": 3.912764072418213,
"reward_std": 0.022814412601292133,
"rewards/answer_entity_reward": 0.9910714328289032,
"rewards/answer_wer_reward": 0.9220215976238251,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996710419654846,
"step": 426
},
{
"completion_length": 206.0625,
"epoch": 1.3648,
"grad_norm": 7.218249797821045,
"kl": 0.0869140625,
"learning_rate": 4.675e-07,
"loss": 0.0009,
"reward": 3.8915610313415527,
"reward_std": 0.020747858565300703,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.913354367017746,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9816789925098419,
"step": 427
},
{
"completion_length": 229.21875,
"epoch": 1.3679999999999999,
"grad_norm": 6.419763088226318,
"kl": 0.078857421875,
"learning_rate": 4.6625e-07,
"loss": 0.0008,
"reward": 3.7964917421340942,
"reward_std": 0.03975658491253853,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9177364408969879,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8901189565658569,
"step": 428
},
{
"completion_length": 252.96875,
"epoch": 1.3712,
"grad_norm": 6.5345025062561035,
"kl": 0.0782470703125,
"learning_rate": 4.65e-07,
"loss": 0.0008,
"reward": 3.903268814086914,
"reward_std": 0.016737705329433084,
"rewards/answer_entity_reward": 0.9764957129955292,
"rewards/answer_wer_reward": 0.92976513504982,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970079958438873,
"step": 429
},
{
"completion_length": 240.15625,
"epoch": 1.3744,
"grad_norm": 2.109302043914795,
"kl": 0.078857421875,
"learning_rate": 4.6374999999999995e-07,
"loss": 0.0008,
"reward": 3.935005784034729,
"reward_std": 0.035214878618717194,
"rewards/answer_entity_reward": 0.9908459782600403,
"rewards/answer_wer_reward": 0.9483801424503326,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9957797825336456,
"step": 430
},
{
"completion_length": 204.21875,
"epoch": 1.3776,
"grad_norm": 2.1557323932647705,
"kl": 0.0986328125,
"learning_rate": 4.625e-07,
"loss": 0.001,
"reward": 3.895322561264038,
"reward_std": 0.00989355193451047,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9249120354652405,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9704104661941528,
"step": 431
},
{
"completion_length": 222.375,
"epoch": 1.3808,
"grad_norm": 1.1159002780914307,
"kl": 0.139892578125,
"learning_rate": 4.6125e-07,
"loss": 0.0014,
"reward": 3.909332513809204,
"reward_std": 0.02693999744951725,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9155605435371399,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985795617103577,
"step": 432
},
{
"completion_length": 203.8125,
"epoch": 1.384,
"grad_norm": 1.4166613817214966,
"kl": 0.1220703125,
"learning_rate": 4.6e-07,
"loss": 0.0012,
"reward": 3.8192185163497925,
"reward_std": 0.20739353261888027,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.8859462738037109,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9673629999160767,
"step": 433
},
{
"completion_length": 253.25,
"epoch": 1.3872,
"grad_norm": 2.674269437789917,
"kl": 0.0657958984375,
"learning_rate": 4.5874999999999995e-07,
"loss": 0.0007,
"reward": 3.88591992855072,
"reward_std": 0.02829979732632637,
"rewards/answer_entity_reward": 0.9763771891593933,
"rewards/answer_wer_reward": 0.9098401963710785,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997023940086365,
"step": 434
},
{
"completion_length": 216.15625,
"epoch": 1.3904,
"grad_norm": 2.3317995071411133,
"kl": 0.1495361328125,
"learning_rate": 4.575e-07,
"loss": 0.0015,
"reward": 3.8120020627975464,
"reward_std": 0.0887885820120573,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9127777814865112,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9077470898628235,
"step": 435
},
{
"completion_length": 210.5625,
"epoch": 1.3936,
"grad_norm": 8.527549743652344,
"kl": 0.13818359375,
"learning_rate": 4.5624999999999997e-07,
"loss": 0.0014,
"reward": 3.802919387817383,
"reward_std": 0.015426071360707283,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9406470954418182,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.862272173166275,
"step": 436
},
{
"completion_length": 194.21875,
"epoch": 1.3968,
"grad_norm": 3.6950721740722656,
"kl": 0.098388671875,
"learning_rate": 4.55e-07,
"loss": 0.001,
"reward": 3.9119696617126465,
"reward_std": 0.025569402612745762,
"rewards/answer_entity_reward": 0.9852430522441864,
"rewards/answer_wer_reward": 0.9274449944496155,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992816150188446,
"step": 437
},
{
"completion_length": 218.0,
"epoch": 1.4,
"grad_norm": 2.0461039543151855,
"kl": 0.09130859375,
"learning_rate": 4.5374999999999994e-07,
"loss": 0.0009,
"reward": 3.9378126859664917,
"reward_std": 0.023795679211616516,
"rewards/answer_entity_reward": 0.9909090995788574,
"rewards/answer_wer_reward": 0.9559187889099121,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9909848570823669,
"step": 438
},
{
"completion_length": 166.21875,
"epoch": 1.4032,
"grad_norm": 6.606758117675781,
"kl": 0.099853515625,
"learning_rate": 4.525e-07,
"loss": 0.001,
"reward": 3.7634676694869995,
"reward_std": 0.12013816519174725,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9638259708881378,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8079750537872314,
"step": 439
},
{
"completion_length": 199.0625,
"epoch": 1.4064,
"grad_norm": 2.7103731632232666,
"kl": 0.107666015625,
"learning_rate": 4.5124999999999997e-07,
"loss": 0.0011,
"reward": 3.88293993473053,
"reward_std": 0.030841628089547157,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9417436718940735,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9411962330341339,
"step": 440
},
{
"completion_length": 228.5,
"epoch": 1.4096,
"grad_norm": 16.007980346679688,
"kl": 0.090576171875,
"learning_rate": 4.5e-07,
"loss": 0.0009,
"reward": 3.8373541831970215,
"reward_std": 0.07324423175305128,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.8474734723567963,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994959831237793,
"step": 441
},
{
"completion_length": 198.0,
"epoch": 1.4128,
"grad_norm": 1.5419743061065674,
"kl": 0.090576171875,
"learning_rate": 4.4874999999999994e-07,
"loss": 0.0009,
"reward": 3.9394867420196533,
"reward_std": 0.020834744907915592,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9435902535915375,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993686676025391,
"step": 442
},
{
"completion_length": 212.9375,
"epoch": 1.416,
"grad_norm": 2.2846686840057373,
"kl": 0.09375,
"learning_rate": 4.475e-07,
"loss": 0.0009,
"reward": 3.9014971256256104,
"reward_std": 0.05675862170755863,
"rewards/answer_entity_reward": 0.9917200803756714,
"rewards/answer_wer_reward": 0.9370100498199463,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9727669358253479,
"step": 443
},
{
"completion_length": 228.78125,
"epoch": 1.4192,
"grad_norm": 1.8499493598937988,
"kl": 0.0723876953125,
"learning_rate": 4.4624999999999996e-07,
"loss": 0.0007,
"reward": 3.967541456222534,
"reward_std": 0.005963538307696581,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9685240089893341,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990174472332001,
"step": 444
},
{
"completion_length": 224.09375,
"epoch": 1.4224,
"grad_norm": 5.091113567352295,
"kl": 0.0704345703125,
"learning_rate": 4.45e-07,
"loss": 0.0007,
"reward": 3.886088252067566,
"reward_std": 0.04133851733058691,
"rewards/answer_entity_reward": 0.9886092245578766,
"rewards/answer_wer_reward": 0.9384825825691223,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9589964747428894,
"step": 445
},
{
"completion_length": 228.34375,
"epoch": 1.4256,
"grad_norm": 57.5860710144043,
"kl": 0.142822265625,
"learning_rate": 4.4374999999999993e-07,
"loss": 0.0014,
"reward": 3.9096713066101074,
"reward_std": 0.01603887975215912,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9126511812210083,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988585412502289,
"step": 446
},
{
"completion_length": 202.34375,
"epoch": 1.4288,
"grad_norm": 2.211174964904785,
"kl": 0.056640625,
"learning_rate": 4.425e-07,
"loss": 0.0006,
"reward": 3.893475890159607,
"reward_std": 0.046710459515452385,
"rewards/answer_entity_reward": 0.9659091234207153,
"rewards/answer_wer_reward": 0.9309596717357635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9966072142124176,
"step": 447
},
{
"completion_length": 186.375,
"epoch": 1.432,
"grad_norm": 3.053344249725342,
"kl": 0.07666015625,
"learning_rate": 4.4124999999999996e-07,
"loss": 0.0008,
"reward": 3.941947340965271,
"reward_std": 0.009567510336637497,
"rewards/answer_entity_reward": 0.9883012771606445,
"rewards/answer_wer_reward": 0.964864045381546,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9887820482254028,
"step": 448
},
{
"completion_length": 170.65625,
"epoch": 1.4352,
"grad_norm": 2.5118942260742188,
"kl": 0.086181640625,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0009,
"reward": 3.8258646726608276,
"reward_std": 0.011685115285217762,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8330351114273071,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9928297400474548,
"step": 449
},
{
"completion_length": 199.8125,
"epoch": 1.4384000000000001,
"grad_norm": 3.3471686840057373,
"kl": 0.12109375,
"learning_rate": 4.3874999999999993e-07,
"loss": 0.0012,
"reward": 3.768259644508362,
"reward_std": 0.061878617852926254,
"rewards/answer_entity_reward": 0.9866071343421936,
"rewards/answer_wer_reward": 0.8001611828804016,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9814914762973785,
"step": 450
},
{
"completion_length": 179.625,
"epoch": 1.4416,
"grad_norm": 6.58098840713501,
"kl": 0.13671875,
"learning_rate": 4.375e-07,
"loss": 0.0014,
"reward": 3.9213969707489014,
"reward_std": 0.007897446397691965,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.954677164554596,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.966719776391983,
"step": 451
},
{
"completion_length": 201.53125,
"epoch": 1.4447999999999999,
"grad_norm": 2.6606149673461914,
"kl": 0.07275390625,
"learning_rate": 4.3625e-07,
"loss": 0.0007,
"reward": 3.930065631866455,
"reward_std": 0.016306706704199314,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9411978721618652,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9888678193092346,
"step": 452
},
{
"completion_length": 218.71875,
"epoch": 1.448,
"grad_norm": 2.720804452896118,
"kl": 0.068115234375,
"learning_rate": 4.3499999999999996e-07,
"loss": 0.0007,
"reward": 3.9184677600860596,
"reward_std": 0.018319842871278524,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9235129654407501,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994419813156128,
"step": 453
},
{
"completion_length": 207.65625,
"epoch": 1.4512,
"grad_norm": 3.4664785861968994,
"kl": 0.153564453125,
"learning_rate": 4.3375000000000003e-07,
"loss": 0.0015,
"reward": 3.9119069576263428,
"reward_std": 0.017484096810221672,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9505945444107056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9613124430179596,
"step": 454
},
{
"completion_length": 212.125,
"epoch": 1.4544000000000001,
"grad_norm": 1.4592719078063965,
"kl": 0.081298828125,
"learning_rate": 4.325e-07,
"loss": 0.0008,
"reward": 3.9381325244903564,
"reward_std": 0.015622157603502274,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.94671231508255,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938240349292755,
"step": 455
},
{
"completion_length": 237.8125,
"epoch": 1.4576,
"grad_norm": 1.2292534112930298,
"kl": 0.089111328125,
"learning_rate": 4.3125e-07,
"loss": 0.0009,
"reward": 3.940074920654297,
"reward_std": 0.013516389299184084,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.943013072013855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970619082450867,
"step": 456
},
{
"completion_length": 206.90625,
"epoch": 1.4607999999999999,
"grad_norm": 2.4139420986175537,
"kl": 0.08837890625,
"learning_rate": 4.2999999999999996e-07,
"loss": 0.0009,
"reward": 3.9423701763153076,
"reward_std": 0.017034863587468863,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.942692369222641,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999677836894989,
"step": 457
},
{
"completion_length": 200.21875,
"epoch": 1.464,
"grad_norm": 1.0297181606292725,
"kl": 0.11083984375,
"learning_rate": 4.2875e-07,
"loss": 0.0011,
"reward": 3.9459547996520996,
"reward_std": 0.014651869423687458,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9467397332191467,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999215304851532,
"step": 458
},
{
"completion_length": 219.75,
"epoch": 1.4672,
"grad_norm": 1.3148033618927002,
"kl": 0.10546875,
"learning_rate": 4.275e-07,
"loss": 0.0011,
"reward": 3.9567900896072388,
"reward_std": 0.011340227210894227,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9586590230464935,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99813112616539,
"step": 459
},
{
"completion_length": 181.0625,
"epoch": 1.4704,
"grad_norm": 2.4274115562438965,
"kl": 0.09814453125,
"learning_rate": 4.2625e-07,
"loss": 0.001,
"reward": 3.9310171604156494,
"reward_std": 0.017811311408877373,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9543968439102173,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9766204059123993,
"step": 460
},
{
"completion_length": 205.625,
"epoch": 1.4736,
"grad_norm": 2.885746717453003,
"kl": 0.12548828125,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.0013,
"reward": 3.878596305847168,
"reward_std": 0.02481621317565441,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9378580451011658,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9464203119277954,
"step": 461
},
{
"completion_length": 205.4375,
"epoch": 1.4768,
"grad_norm": 2.366044521331787,
"kl": 0.10595703125,
"learning_rate": 4.2375e-07,
"loss": 0.0011,
"reward": 3.940233826637268,
"reward_std": 0.013500516302883625,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9461761116981506,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997395873069763,
"step": 462
},
{
"completion_length": 229.46875,
"epoch": 1.48,
"grad_norm": 2.4469070434570312,
"kl": 0.078369140625,
"learning_rate": 4.225e-07,
"loss": 0.0008,
"reward": 3.92271089553833,
"reward_std": 0.022854273673146963,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9247944056987762,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979166686534882,
"step": 463
},
{
"completion_length": 225.96875,
"epoch": 1.4832,
"grad_norm": 11.768393516540527,
"kl": 0.1123046875,
"learning_rate": 4.2125e-07,
"loss": 0.0011,
"reward": 3.9518144130706787,
"reward_std": 0.010446197353303432,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9525662660598755,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992480874061584,
"step": 464
},
{
"completion_length": 149.0,
"epoch": 1.4864,
"grad_norm": 6.672958850860596,
"kl": 0.185791015625,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0019,
"reward": 3.944068431854248,
"reward_std": 0.02685389667749405,
"rewards/answer_entity_reward": 0.9774305522441864,
"rewards/answer_wer_reward": 0.9714455008506775,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.995192289352417,
"step": 465
},
{
"completion_length": 249.625,
"epoch": 1.4896,
"grad_norm": 1.7048887014389038,
"kl": 0.10986328125,
"learning_rate": 4.1875e-07,
"loss": 0.0011,
"reward": 3.902083158493042,
"reward_std": 0.011234605684876442,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9024596214294434,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996234774589539,
"step": 466
},
{
"completion_length": 181.8125,
"epoch": 1.4928,
"grad_norm": 2.429704189300537,
"kl": 0.112548828125,
"learning_rate": 4.1749999999999997e-07,
"loss": 0.0011,
"reward": 3.9163752794265747,
"reward_std": 0.014369658660143614,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9364789724349976,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9798963963985443,
"step": 467
},
{
"completion_length": 201.5625,
"epoch": 1.496,
"grad_norm": 3.5214920043945312,
"kl": 0.09912109375,
"learning_rate": 4.1625e-07,
"loss": 0.001,
"reward": 3.9279046058654785,
"reward_std": 0.016232089139521122,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9468095898628235,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9827397167682648,
"step": 468
},
{
"completion_length": 180.09375,
"epoch": 1.4992,
"grad_norm": 2.471404790878296,
"kl": 0.10693359375,
"learning_rate": 4.1499999999999994e-07,
"loss": 0.0011,
"reward": 3.860212564468384,
"reward_std": 0.02879812940955162,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9390608966350555,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9296742975711823,
"step": 469
},
{
"completion_length": 208.1875,
"epoch": 1.5024,
"grad_norm": 0.9673317074775696,
"kl": 0.104248046875,
"learning_rate": 4.1375e-07,
"loss": 0.001,
"reward": 3.944485664367676,
"reward_std": 0.01182422018609941,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9444854557514191,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 470
},
{
"completion_length": 194.0625,
"epoch": 1.5056,
"grad_norm": 1.0823942422866821,
"kl": 0.096435546875,
"learning_rate": 4.1249999999999997e-07,
"loss": 0.001,
"reward": 3.9105581045150757,
"reward_std": 0.015555873978883028,
"rewards/answer_entity_reward": 0.9869916439056396,
"rewards/answer_wer_reward": 0.9253619015216827,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982046484947205,
"step": 471
},
{
"completion_length": 221.5625,
"epoch": 1.5088,
"grad_norm": 4.074758052825928,
"kl": 0.077880859375,
"learning_rate": 4.1125e-07,
"loss": 0.0008,
"reward": 3.933529496192932,
"reward_std": 0.019466498168185353,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9344994425773621,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990300834178925,
"step": 472
},
{
"completion_length": 167.3125,
"epoch": 1.512,
"grad_norm": 2.003244400024414,
"kl": 0.10888671875,
"learning_rate": 4.0999999999999994e-07,
"loss": 0.0011,
"reward": 3.927189588546753,
"reward_std": 0.00937123317271471,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9625618755817413,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9646276533603668,
"step": 473
},
{
"completion_length": 253.59375,
"epoch": 1.5152,
"grad_norm": 1.7125921249389648,
"kl": 0.1005859375,
"learning_rate": 4.0875e-07,
"loss": 0.001,
"reward": 3.9120967388153076,
"reward_std": 0.020000137854367495,
"rewards/answer_entity_reward": 0.9910256266593933,
"rewards/answer_wer_reward": 0.9225968718528748,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984740912914276,
"step": 474
},
{
"completion_length": 168.5625,
"epoch": 1.5184,
"grad_norm": 2.8377087116241455,
"kl": 0.176513671875,
"learning_rate": 4.0749999999999996e-07,
"loss": 0.0018,
"reward": 3.8111839294433594,
"reward_std": 0.015397761948406696,
"rewards/answer_entity_reward": 0.9882352948188782,
"rewards/answer_wer_reward": 0.9508877992630005,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8720609545707703,
"step": 475
},
{
"completion_length": 180.0625,
"epoch": 1.5215999999999998,
"grad_norm": 1.8417869806289673,
"kl": 0.09814453125,
"learning_rate": 4.0625e-07,
"loss": 0.001,
"reward": 3.9401214122772217,
"reward_std": 0.017564786598086357,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.95186448097229,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9924235343933105,
"step": 476
},
{
"completion_length": 259.5625,
"epoch": 1.5248,
"grad_norm": 3.1482410430908203,
"kl": 0.065673828125,
"learning_rate": 4.05e-07,
"loss": 0.0007,
"reward": 3.8720295429229736,
"reward_std": 0.05017535015940666,
"rewards/answer_entity_reward": 0.9819904267787933,
"rewards/answer_wer_reward": 0.9132304787635803,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9768086373806,
"step": 477
},
{
"completion_length": 224.25,
"epoch": 1.528,
"grad_norm": 1.309258222579956,
"kl": 0.151123046875,
"learning_rate": 4.0375e-07,
"loss": 0.0015,
"reward": 3.9491400718688965,
"reward_std": 0.015128562692552805,
"rewards/answer_entity_reward": 0.9905637204647064,
"rewards/answer_wer_reward": 0.9590685665607452,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999507874250412,
"step": 478
},
{
"completion_length": 195.03125,
"epoch": 1.5312000000000001,
"grad_norm": 2.627673864364624,
"kl": 0.10205078125,
"learning_rate": 4.025e-07,
"loss": 0.001,
"reward": 3.8748838901519775,
"reward_std": 0.03435908444225788,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.931220144033432,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9471359848976135,
"step": 479
},
{
"completion_length": 214.09375,
"epoch": 1.5344,
"grad_norm": 1.328961730003357,
"kl": 0.09375,
"learning_rate": 4.0124999999999997e-07,
"loss": 0.0009,
"reward": 3.895302414894104,
"reward_std": 0.05907848384231329,
"rewards/answer_entity_reward": 0.9810912609100342,
"rewards/answer_wer_reward": 0.9161643385887146,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998046875,
"step": 480
},
{
"completion_length": 238.34375,
"epoch": 1.5375999999999999,
"grad_norm": 1.2219247817993164,
"kl": 0.073486328125,
"learning_rate": 4e-07,
"loss": 0.0007,
"reward": 3.9036080837249756,
"reward_std": 0.039926802739501,
"rewards/answer_entity_reward": 0.9823717474937439,
"rewards/answer_wer_reward": 0.9258527159690857,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9953835308551788,
"step": 481
},
{
"completion_length": 197.90625,
"epoch": 1.5408,
"grad_norm": 1.6363537311553955,
"kl": 0.14306640625,
"learning_rate": 3.9875e-07,
"loss": 0.0014,
"reward": 3.948188543319702,
"reward_std": 0.010867676697671413,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9528435170650482,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99951171875,
"step": 482
},
{
"completion_length": 248.84375,
"epoch": 1.544,
"grad_norm": 1.3250434398651123,
"kl": 0.07421875,
"learning_rate": 3.975e-07,
"loss": 0.0007,
"reward": 3.9251530170440674,
"reward_std": 0.011389322113245726,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9254424273967743,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997106492519379,
"step": 483
},
{
"completion_length": 183.03125,
"epoch": 1.5472000000000001,
"grad_norm": 1.3042057752609253,
"kl": 0.10107421875,
"learning_rate": 3.9624999999999996e-07,
"loss": 0.001,
"reward": 3.8853487968444824,
"reward_std": 0.06827400880865753,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9366248250007629,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9487238228321075,
"step": 484
},
{
"completion_length": 192.0625,
"epoch": 1.5504,
"grad_norm": 2.232529640197754,
"kl": 0.11279296875,
"learning_rate": 3.95e-07,
"loss": 0.0011,
"reward": 3.9339704513549805,
"reward_std": 0.011435477063059807,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.935157060623169,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988133013248444,
"step": 485
},
{
"completion_length": 237.28125,
"epoch": 1.5535999999999999,
"grad_norm": 1.1462312936782837,
"kl": 0.098876953125,
"learning_rate": 3.9375e-07,
"loss": 0.001,
"reward": 3.953871250152588,
"reward_std": 0.007947361096739769,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9546429216861725,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992283880710602,
"step": 486
},
{
"completion_length": 167.8125,
"epoch": 1.5568,
"grad_norm": 2.324936628341675,
"kl": 0.1357421875,
"learning_rate": 3.925e-07,
"loss": 0.0014,
"reward": 3.858751654624939,
"reward_std": 0.14167471043765545,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9401543736457825,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9567916989326477,
"step": 487
},
{
"completion_length": 245.09375,
"epoch": 1.56,
"grad_norm": 4.512195587158203,
"kl": 0.076171875,
"learning_rate": 3.9124999999999996e-07,
"loss": 0.0008,
"reward": 3.920499563217163,
"reward_std": 0.03615456819534302,
"rewards/answer_entity_reward": 0.9908459782600403,
"rewards/answer_wer_reward": 0.9309035241603851,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987500607967377,
"step": 488
},
{
"completion_length": 227.84375,
"epoch": 1.5632000000000001,
"grad_norm": 10.537569046020508,
"kl": 0.0859375,
"learning_rate": 3.8999999999999997e-07,
"loss": 0.0009,
"reward": 3.9345154762268066,
"reward_std": 0.0299052600748837,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9527814090251923,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9817341864109039,
"step": 489
},
{
"completion_length": 228.78125,
"epoch": 1.5664,
"grad_norm": 1.635452151298523,
"kl": 0.125,
"learning_rate": 3.8875e-07,
"loss": 0.0013,
"reward": 3.944974184036255,
"reward_std": 0.019456470385193825,
"rewards/answer_entity_reward": 0.9919143319129944,
"rewards/answer_wer_reward": 0.9538533091545105,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99920654296875,
"step": 490
},
{
"completion_length": 146.78125,
"epoch": 1.5695999999999999,
"grad_norm": 3.557502031326294,
"kl": 0.256103515625,
"learning_rate": 3.875e-07,
"loss": 0.0026,
"reward": 3.865471601486206,
"reward_std": 0.05972531996667385,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.972651481628418,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8928201496601105,
"step": 491
},
{
"completion_length": 207.28125,
"epoch": 1.5728,
"grad_norm": 1.0813632011413574,
"kl": 0.11767578125,
"learning_rate": 3.8624999999999995e-07,
"loss": 0.0012,
"reward": 3.91045343875885,
"reward_std": 0.01970634702593088,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9153560400009155,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979382753372192,
"step": 492
},
{
"completion_length": 219.4375,
"epoch": 1.576,
"grad_norm": 9.319220542907715,
"kl": 0.11181640625,
"learning_rate": 3.8499999999999997e-07,
"loss": 0.0011,
"reward": 3.876826047897339,
"reward_std": 0.02829575538635254,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9400706589221954,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9436996281147003,
"step": 493
},
{
"completion_length": 205.34375,
"epoch": 1.5792000000000002,
"grad_norm": 1.0891739130020142,
"kl": 0.091796875,
"learning_rate": 3.8375e-07,
"loss": 0.0009,
"reward": 3.9375252723693848,
"reward_std": 0.01566324196755886,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9394660592079163,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980591833591461,
"step": 494
},
{
"completion_length": 247.4375,
"epoch": 1.5824,
"grad_norm": 1.313225507736206,
"kl": 0.120361328125,
"learning_rate": 3.825e-07,
"loss": 0.0012,
"reward": 3.9278939962387085,
"reward_std": 0.01758108288049698,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9288604855537415,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999033510684967,
"step": 495
},
{
"completion_length": 190.28125,
"epoch": 1.5856,
"grad_norm": 9.440873146057129,
"kl": 0.19677734375,
"learning_rate": 3.8124999999999995e-07,
"loss": 0.002,
"reward": 3.7597657442092896,
"reward_std": 0.05097449291497469,
"rewards/answer_entity_reward": 0.9879385828971863,
"rewards/answer_wer_reward": 0.9360098242759705,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8358173370361328,
"step": 496
},
{
"completion_length": 165.09375,
"epoch": 1.5888,
"grad_norm": 3.4180030822753906,
"kl": 0.111572265625,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0011,
"reward": 3.87969434261322,
"reward_std": 0.058905988931655884,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9649176299571991,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9147767424583435,
"step": 497
},
{
"completion_length": 188.96875,
"epoch": 1.592,
"grad_norm": 5.278741359710693,
"kl": 0.072998046875,
"learning_rate": 3.7875e-07,
"loss": 0.0007,
"reward": 3.9229685068130493,
"reward_std": 0.03557159844785929,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9312180578708649,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938337206840515,
"step": 498
},
{
"completion_length": 223.125,
"epoch": 1.5952,
"grad_norm": 1.8821698427200317,
"kl": 0.100341796875,
"learning_rate": 3.775e-07,
"loss": 0.001,
"reward": 3.85340416431427,
"reward_std": 0.1366682257503271,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9250198900699615,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9596343636512756,
"step": 499
},
{
"completion_length": 206.125,
"epoch": 1.5984,
"grad_norm": 2.528049945831299,
"kl": 0.089111328125,
"learning_rate": 3.7624999999999994e-07,
"loss": 0.0009,
"reward": 3.9384653568267822,
"reward_std": 0.011245439760386944,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9415569603443146,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985532462596893,
"step": 500
},
{
"completion_length": 229.46875,
"epoch": 1.6016,
"grad_norm": 1.4024198055267334,
"kl": 0.082763671875,
"learning_rate": 3.75e-07,
"loss": 0.0008,
"reward": 3.9224425554275513,
"reward_std": 0.012164951767772436,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9341387450695038,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996675550937653,
"step": 501
},
{
"completion_length": 183.09375,
"epoch": 1.6048,
"grad_norm": 2.642270088195801,
"kl": 0.0888671875,
"learning_rate": 3.7375e-07,
"loss": 0.0009,
"reward": 3.872815251350403,
"reward_std": 0.04407367669045925,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.953230619430542,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9243922829627991,
"step": 502
},
{
"completion_length": 242.8125,
"epoch": 1.608,
"grad_norm": 3.0733675956726074,
"kl": 0.1044921875,
"learning_rate": 3.725e-07,
"loss": 0.001,
"reward": 3.9133812189102173,
"reward_std": 0.017343452665954828,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9236075580120087,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982964098453522,
"step": 503
},
{
"completion_length": 234.34375,
"epoch": 1.6112,
"grad_norm": 1.4146682024002075,
"kl": 0.1064453125,
"learning_rate": 3.7125e-07,
"loss": 0.0011,
"reward": 3.941379427909851,
"reward_std": 0.011062228586524725,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9433701932430267,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980092346668243,
"step": 504
},
{
"completion_length": 252.84375,
"epoch": 1.6143999999999998,
"grad_norm": 1.9019030332565308,
"kl": 0.101318359375,
"learning_rate": 3.7e-07,
"loss": 0.001,
"reward": 3.87961208820343,
"reward_std": 0.02180068287998438,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8812887370586395,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983234107494354,
"step": 505
},
{
"completion_length": 184.34375,
"epoch": 1.6176,
"grad_norm": 3.1965742111206055,
"kl": 0.114501953125,
"learning_rate": 3.6875e-07,
"loss": 0.0011,
"reward": 3.650223731994629,
"reward_std": 0.16780234314501286,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9241631031036377,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.7573105096817017,
"step": 506
},
{
"completion_length": 224.1875,
"epoch": 1.6208,
"grad_norm": 1.8885560035705566,
"kl": 0.115966796875,
"learning_rate": 3.675e-07,
"loss": 0.0012,
"reward": 3.9122270345687866,
"reward_std": 0.04261860717087984,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9476769864559174,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9697948098182678,
"step": 507
},
{
"completion_length": 220.25,
"epoch": 1.624,
"grad_norm": 3.2499520778656006,
"kl": 0.1044921875,
"learning_rate": 3.6625e-07,
"loss": 0.0011,
"reward": 3.927606225013733,
"reward_std": 0.023842450696974993,
"rewards/answer_entity_reward": 0.9879376590251923,
"rewards/answer_wer_reward": 0.9439153373241425,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9957531988620758,
"step": 508
},
{
"completion_length": 227.96875,
"epoch": 1.6272,
"grad_norm": 2.6528868675231934,
"kl": 0.087890625,
"learning_rate": 3.65e-07,
"loss": 0.0009,
"reward": 3.928350806236267,
"reward_std": 0.01731124660000205,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9420913755893707,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9976231157779694,
"step": 509
},
{
"completion_length": 243.5,
"epoch": 1.6303999999999998,
"grad_norm": 1.618895411491394,
"kl": 0.09814453125,
"learning_rate": 3.6375e-07,
"loss": 0.001,
"reward": 3.9476526975631714,
"reward_std": 0.011007866356521845,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9514043629169464,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962483048439026,
"step": 510
},
{
"completion_length": 263.4375,
"epoch": 1.6336,
"grad_norm": 2.8576741218566895,
"kl": 0.104736328125,
"learning_rate": 3.6249999999999997e-07,
"loss": 0.001,
"reward": 3.9101955890655518,
"reward_std": 0.01921992190182209,
"rewards/answer_entity_reward": 0.9935776889324188,
"rewards/answer_wer_reward": 0.9185610413551331,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980569183826447,
"step": 511
},
{
"completion_length": 184.5625,
"epoch": 1.6368,
"grad_norm": 6.8555908203125,
"kl": 0.113525390625,
"learning_rate": 3.6125e-07,
"loss": 0.0011,
"reward": 3.8663313388824463,
"reward_std": 0.10157291498035192,
"rewards/answer_entity_reward": 0.9823717772960663,
"rewards/answer_wer_reward": 0.9564132988452911,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9275462925434113,
"step": 512
},
{
"completion_length": 218.25,
"epoch": 1.6400000000000001,
"grad_norm": 1.9482468366622925,
"kl": 0.091064453125,
"learning_rate": 3.6e-07,
"loss": 0.0009,
"reward": 3.8723970651626587,
"reward_std": 0.07238492835313082,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9418750703334808,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9362037181854248,
"step": 513
},
{
"completion_length": 240.90625,
"epoch": 1.6432,
"grad_norm": 1.2296831607818604,
"kl": 0.079345703125,
"learning_rate": 3.5875e-07,
"loss": 0.0008,
"reward": 3.9039018154144287,
"reward_std": 0.09914317354559898,
"rewards/answer_entity_reward": 0.984375,
"rewards/answer_wer_reward": 0.9198593199253082,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996675550937653,
"step": 514
},
{
"completion_length": 234.4375,
"epoch": 1.6463999999999999,
"grad_norm": 1.4495328664779663,
"kl": 0.1455078125,
"learning_rate": 3.5749999999999997e-07,
"loss": 0.0015,
"reward": 3.9163358211517334,
"reward_std": 0.013342800550162792,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9182494282722473,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980863332748413,
"step": 515
},
{
"completion_length": 251.5,
"epoch": 1.6496,
"grad_norm": 1.5324357748031616,
"kl": 0.085693359375,
"learning_rate": 3.5625e-07,
"loss": 0.0009,
"reward": 3.8444111347198486,
"reward_std": 0.19724943954497576,
"rewards/answer_entity_reward": 0.9548611044883728,
"rewards/answer_wer_reward": 0.9235136210918427,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9972864091396332,
"step": 516
},
{
"completion_length": 242.40625,
"epoch": 1.6528,
"grad_norm": 33.44215774536133,
"kl": 0.107177734375,
"learning_rate": 3.55e-07,
"loss": 0.0011,
"reward": 3.8674838542938232,
"reward_std": 0.024256199598312378,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.931198239326477,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9362856149673462,
"step": 517
},
{
"completion_length": 222.375,
"epoch": 1.6560000000000001,
"grad_norm": 2.1098077297210693,
"kl": 0.119140625,
"learning_rate": 3.5375e-07,
"loss": 0.0012,
"reward": 3.916640877723694,
"reward_std": 0.012934736907482147,
"rewards/answer_entity_reward": 0.9841346144676208,
"rewards/answer_wer_reward": 0.9368312060832977,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99567511677742,
"step": 518
},
{
"completion_length": 187.09375,
"epoch": 1.6592,
"grad_norm": 5.296720504760742,
"kl": 0.1220703125,
"learning_rate": 3.5249999999999996e-07,
"loss": 0.0012,
"reward": 3.9440935850143433,
"reward_std": 0.02182569820433855,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9536486864089966,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9939171075820923,
"step": 519
},
{
"completion_length": 199.1875,
"epoch": 1.6623999999999999,
"grad_norm": 2.8992345333099365,
"kl": 0.1083984375,
"learning_rate": 3.5124999999999997e-07,
"loss": 0.0011,
"reward": 3.868250846862793,
"reward_std": 0.01035462855361402,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.952102780342102,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9196203351020813,
"step": 520
},
{
"completion_length": 201.03125,
"epoch": 1.6656,
"grad_norm": 2.3841094970703125,
"kl": 0.176025390625,
"learning_rate": 3.5e-07,
"loss": 0.0018,
"reward": 3.8405520915985107,
"reward_std": 0.020799917168915272,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9267003536224365,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.913851797580719,
"step": 521
},
{
"completion_length": 212.84375,
"epoch": 1.6688,
"grad_norm": 2.3912744522094727,
"kl": 0.126953125,
"learning_rate": 3.4875e-07,
"loss": 0.0013,
"reward": 3.894093632698059,
"reward_std": 0.027726877480745316,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.935745120048523,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9697121679782867,
"step": 522
},
{
"completion_length": 235.875,
"epoch": 1.6720000000000002,
"grad_norm": 3.050795078277588,
"kl": 0.109130859375,
"learning_rate": 3.4749999999999996e-07,
"loss": 0.0011,
"reward": 3.8923540115356445,
"reward_std": 0.01905027125030756,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9099950790405273,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9851997494697571,
"step": 523
},
{
"completion_length": 228.25,
"epoch": 1.6752,
"grad_norm": 1.20732843875885,
"kl": 0.09375,
"learning_rate": 3.4624999999999997e-07,
"loss": 0.0009,
"reward": 3.936145067214966,
"reward_std": 0.009886496467515826,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9416597485542297,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 524
},
{
"completion_length": 225.78125,
"epoch": 1.6784,
"grad_norm": 8.249052047729492,
"kl": 0.09326171875,
"learning_rate": 3.45e-07,
"loss": 0.0009,
"reward": 3.922656536102295,
"reward_std": 0.030036092270165682,
"rewards/answer_entity_reward": 0.9934523701667786,
"rewards/answer_wer_reward": 0.9322790205478668,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969250857830048,
"step": 525
},
{
"completion_length": 181.78125,
"epoch": 1.6816,
"grad_norm": 3.0338377952575684,
"kl": 0.42138671875,
"learning_rate": 3.4375e-07,
"loss": 0.0042,
"reward": 3.9170188903808594,
"reward_std": 0.02494343649595976,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9231892824172974,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962334036827087,
"step": 526
},
{
"completion_length": 197.0625,
"epoch": 1.6848,
"grad_norm": 1.7836970090866089,
"kl": 0.2099609375,
"learning_rate": 3.425e-07,
"loss": 0.0021,
"reward": 3.9194570779800415,
"reward_std": 0.03800513781607151,
"rewards/answer_entity_reward": 0.9902680516242981,
"rewards/answer_wer_reward": 0.9317581951618195,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974307715892792,
"step": 527
},
{
"completion_length": 210.59375,
"epoch": 1.688,
"grad_norm": 2.595771074295044,
"kl": 0.1103515625,
"learning_rate": 3.4124999999999996e-07,
"loss": 0.0011,
"reward": 3.8902900218963623,
"reward_std": 0.03382246592082083,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.959803968667984,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9328898191452026,
"step": 528
},
{
"completion_length": 220.0,
"epoch": 1.6912,
"grad_norm": 1.7138639688491821,
"kl": 0.1044921875,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.001,
"reward": 3.9327096939086914,
"reward_std": 0.02261860202997923,
"rewards/answer_entity_reward": 0.9938696324825287,
"rewards/answer_wer_reward": 0.9420961737632751,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9967438876628876,
"step": 529
},
{
"completion_length": 196.75,
"epoch": 1.6944,
"grad_norm": 11.008087158203125,
"kl": 0.25732421875,
"learning_rate": 3.3875e-07,
"loss": 0.0026,
"reward": 3.9551256895065308,
"reward_std": 0.013849829090759158,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9695225656032562,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9884441494941711,
"step": 530
},
{
"completion_length": 206.25,
"epoch": 1.6976,
"grad_norm": 3.295365810394287,
"kl": 0.17822265625,
"learning_rate": 3.375e-07,
"loss": 0.0018,
"reward": 3.8593257665634155,
"reward_std": 0.03199449460953474,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9447747468948364,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9249675273895264,
"step": 531
},
{
"completion_length": 202.9375,
"epoch": 1.7008,
"grad_norm": 1.3525906801223755,
"kl": 0.1484375,
"learning_rate": 3.3624999999999996e-07,
"loss": 0.0015,
"reward": 3.9375537633895874,
"reward_std": 0.017243665643036366,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9407197833061218,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992378056049347,
"step": 532
},
{
"completion_length": 243.09375,
"epoch": 1.704,
"grad_norm": 3.5387661457061768,
"kl": 0.074951171875,
"learning_rate": 3.35e-07,
"loss": 0.0007,
"reward": 3.907800793647766,
"reward_std": 0.019072275608778,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9087632894515991,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999037504196167,
"step": 533
},
{
"completion_length": 235.15625,
"epoch": 1.7072,
"grad_norm": 2.016521453857422,
"kl": 0.09326171875,
"learning_rate": 3.3375e-07,
"loss": 0.0009,
"reward": 3.8281819820404053,
"reward_std": 0.021804995834827423,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9325411021709442,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9018907845020294,
"step": 534
},
{
"completion_length": 228.25,
"epoch": 1.7104,
"grad_norm": 2.274576187133789,
"kl": 0.090087890625,
"learning_rate": 3.325e-07,
"loss": 0.0009,
"reward": 3.9243232011795044,
"reward_std": 0.02412506751716137,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9507229626178741,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9764412045478821,
"step": 535
},
{
"completion_length": 224.53125,
"epoch": 1.7136,
"grad_norm": 2.6043360233306885,
"kl": 0.10400390625,
"learning_rate": 3.3124999999999995e-07,
"loss": 0.001,
"reward": 3.876230835914612,
"reward_std": 0.07055234862491488,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9505043029785156,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9329380691051483,
"step": 536
},
{
"completion_length": 225.75,
"epoch": 1.7168,
"grad_norm": 2.8599207401275635,
"kl": 0.09814453125,
"learning_rate": 3.3e-07,
"loss": 0.001,
"reward": 3.8414641618728638,
"reward_std": 0.05350587982684374,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9369199872016907,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9061886966228485,
"step": 537
},
{
"completion_length": 148.6875,
"epoch": 1.72,
"grad_norm": 1.6326717138290405,
"kl": 0.10009765625,
"learning_rate": 3.2875e-07,
"loss": 0.001,
"reward": 3.9361575841903687,
"reward_std": 0.004058501799590886,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9676616787910461,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9684959352016449,
"step": 538
},
{
"completion_length": 195.0625,
"epoch": 1.7231999999999998,
"grad_norm": 1.9592961072921753,
"kl": 0.12841796875,
"learning_rate": 3.275e-07,
"loss": 0.0013,
"reward": 3.772740364074707,
"reward_std": 0.1297362227924168,
"rewards/answer_entity_reward": 0.8774839639663696,
"rewards/answer_wer_reward": 0.953325480222702,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9419309496879578,
"step": 539
},
{
"completion_length": 234.75,
"epoch": 1.7264,
"grad_norm": 2.8339364528656006,
"kl": 0.09130859375,
"learning_rate": 3.2624999999999995e-07,
"loss": 0.0009,
"reward": 3.9273258447647095,
"reward_std": 0.019230290316045284,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.929772675037384,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996366202831268,
"step": 540
},
{
"completion_length": 202.21875,
"epoch": 1.7296,
"grad_norm": 1.428126335144043,
"kl": 0.11083984375,
"learning_rate": 3.25e-07,
"loss": 0.0011,
"reward": 3.8019243478775024,
"reward_std": 0.012322985101491213,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9284610748291016,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8734632432460785,
"step": 541
},
{
"completion_length": 204.875,
"epoch": 1.7328000000000001,
"grad_norm": 2.1539251804351807,
"kl": 0.11376953125,
"learning_rate": 3.2374999999999997e-07,
"loss": 0.0011,
"reward": 3.9308128356933594,
"reward_std": 0.03895580768585205,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9500284790992737,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9877286553382874,
"step": 542
},
{
"completion_length": 237.40625,
"epoch": 1.736,
"grad_norm": 2.9644949436187744,
"kl": 0.091796875,
"learning_rate": 3.225e-07,
"loss": 0.0009,
"reward": 3.8919214010238647,
"reward_std": 0.025371606461703777,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9108140766620636,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9883188605308533,
"step": 543
},
{
"completion_length": 173.5,
"epoch": 1.7391999999999999,
"grad_norm": 1.8892700672149658,
"kl": 0.11376953125,
"learning_rate": 3.2124999999999994e-07,
"loss": 0.0011,
"reward": 3.816041350364685,
"reward_std": 0.021231804974377155,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.8221178352832794,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973958432674408,
"step": 544
},
{
"completion_length": 196.90625,
"epoch": 1.7424,
"grad_norm": 1.6765927076339722,
"kl": 0.103271484375,
"learning_rate": 3.2e-07,
"loss": 0.001,
"reward": 3.825459599494934,
"reward_std": 0.1512175016105175,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9306570887565613,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.928135871887207,
"step": 545
},
{
"completion_length": 245.15625,
"epoch": 1.7456,
"grad_norm": 2.408535957336426,
"kl": 0.100830078125,
"learning_rate": 3.1874999999999997e-07,
"loss": 0.001,
"reward": 3.904157519340515,
"reward_std": 0.024684349074959755,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9129303097724915,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999750018119812,
"step": 546
},
{
"completion_length": 192.15625,
"epoch": 1.7488000000000001,
"grad_norm": 1.3466379642486572,
"kl": 0.1162109375,
"learning_rate": 3.175e-07,
"loss": 0.0012,
"reward": 3.8801496028900146,
"reward_std": 0.028854741947725415,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.9211397469043732,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9715098142623901,
"step": 547
},
{
"completion_length": 199.5,
"epoch": 1.752,
"grad_norm": 1.6798815727233887,
"kl": 0.14404296875,
"learning_rate": 3.1624999999999994e-07,
"loss": 0.0014,
"reward": 3.9099488258361816,
"reward_std": 0.01651060301810503,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9137388169765472,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986139535903931,
"step": 548
},
{
"completion_length": 244.9375,
"epoch": 1.7551999999999999,
"grad_norm": 1.4050216674804688,
"kl": 0.12451171875,
"learning_rate": 3.15e-07,
"loss": 0.0012,
"reward": 3.9373788833618164,
"reward_std": 0.015202231705188751,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9471401572227478,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987614154815674,
"step": 549
},
{
"completion_length": 248.25,
"epoch": 1.7584,
"grad_norm": 1.4261935949325562,
"kl": 0.06884765625,
"learning_rate": 3.1374999999999996e-07,
"loss": 0.0007,
"reward": 3.8940224647521973,
"reward_std": 0.02191777713596821,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.8968429565429688,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995833337306976,
"step": 550
},
{
"completion_length": 201.96875,
"epoch": 1.7616,
"grad_norm": 3.3936564922332764,
"kl": 0.10986328125,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0011,
"reward": 3.8392233848571777,
"reward_std": 0.054989127907902,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9420890212059021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8999753296375275,
"step": 551
},
{
"completion_length": 224.28125,
"epoch": 1.7648000000000001,
"grad_norm": 3.447808027267456,
"kl": 0.1162109375,
"learning_rate": 3.1125000000000004e-07,
"loss": 0.0012,
"reward": 3.928855776786804,
"reward_std": 0.03860421013087034,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9465460479259491,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9871174991130829,
"step": 552
},
{
"completion_length": 239.0625,
"epoch": 1.768,
"grad_norm": 0.9099166989326477,
"kl": 0.09228515625,
"learning_rate": 3.1e-07,
"loss": 0.0008,
"reward": 3.946284055709839,
"reward_std": 0.0096789482049644,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9500063955783844,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962775409221649,
"step": 553
},
{
"completion_length": 225.0625,
"epoch": 1.7711999999999999,
"grad_norm": 5.470230579376221,
"kl": 0.0791015625,
"learning_rate": 3.0875e-07,
"loss": 0.0008,
"reward": 3.919348955154419,
"reward_std": 0.03945630043745041,
"rewards/answer_entity_reward": 0.9685782790184021,
"rewards/answer_wer_reward": 0.9539141952991486,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968563914299011,
"step": 554
},
{
"completion_length": 221.46875,
"epoch": 1.7744,
"grad_norm": 2.4623939990997314,
"kl": 0.091064453125,
"learning_rate": 3.0749999999999997e-07,
"loss": 0.0009,
"reward": 3.9319478273391724,
"reward_std": 0.020772571209818125,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9341712892055511,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994212985038757,
"step": 555
},
{
"completion_length": 195.28125,
"epoch": 1.7776,
"grad_norm": 3.2428677082061768,
"kl": 0.1201171875,
"learning_rate": 3.0625000000000003e-07,
"loss": 0.0012,
"reward": 3.8943945169448853,
"reward_std": 0.03664180589839816,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9439602494239807,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9561159908771515,
"step": 556
},
{
"completion_length": 181.4375,
"epoch": 1.7808000000000002,
"grad_norm": 3.0905327796936035,
"kl": 0.1103515625,
"learning_rate": 3.05e-07,
"loss": 0.0011,
"reward": 3.761397957801819,
"reward_std": 0.21460139192640781,
"rewards/answer_entity_reward": 0.9930555522441864,
"rewards/answer_wer_reward": 0.928047776222229,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8715444803237915,
"step": 557
},
{
"completion_length": 221.625,
"epoch": 1.784,
"grad_norm": 1.951019525527954,
"kl": 0.075927734375,
"learning_rate": 3.0375e-07,
"loss": 0.0008,
"reward": 3.77008855342865,
"reward_std": 0.32161275763064623,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8999943733215332,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9325942993164062,
"step": 558
},
{
"completion_length": 205.28125,
"epoch": 1.7872,
"grad_norm": 3.277336359024048,
"kl": 0.219482421875,
"learning_rate": 3.0249999999999996e-07,
"loss": 0.0022,
"reward": 3.934972047805786,
"reward_std": 0.0279585188254714,
"rewards/answer_entity_reward": 0.9919143319129944,
"rewards/answer_wer_reward": 0.944387674331665,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998670220375061,
"step": 559
},
{
"completion_length": 228.5625,
"epoch": 1.7904,
"grad_norm": 1.3801170587539673,
"kl": 0.090576171875,
"learning_rate": 3.0125000000000003e-07,
"loss": 0.0009,
"reward": 3.93076229095459,
"reward_std": 0.018667958676815033,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9448211789131165,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9887820482254028,
"step": 560
},
{
"completion_length": 210.71875,
"epoch": 1.7936,
"grad_norm": 1.791351556777954,
"kl": 0.109130859375,
"learning_rate": 3e-07,
"loss": 0.0011,
"reward": 3.883724331855774,
"reward_std": 0.061979083344340324,
"rewards/answer_entity_reward": 0.9836647808551788,
"rewards/answer_wer_reward": 0.9013588726520538,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987007081508636,
"step": 561
},
{
"completion_length": 205.125,
"epoch": 1.7968,
"grad_norm": 2.168004274368286,
"kl": 0.103271484375,
"learning_rate": 2.9875e-07,
"loss": 0.001,
"reward": 3.8556606769561768,
"reward_std": 0.08509537391364574,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9325578808784485,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9279104173183441,
"step": 562
},
{
"completion_length": 206.78125,
"epoch": 1.8,
"grad_norm": 1.8020058870315552,
"kl": 0.112548828125,
"learning_rate": 2.9749999999999996e-07,
"loss": 0.0011,
"reward": 3.9098552465438843,
"reward_std": 0.027897534891963005,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9327702820301056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9818926453590393,
"step": 563
},
{
"completion_length": 199.84375,
"epoch": 1.8032,
"grad_norm": 2.1101276874542236,
"kl": 0.08056640625,
"learning_rate": 2.9625e-07,
"loss": 0.0008,
"reward": 3.928394079208374,
"reward_std": 0.013759741093963385,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9306167364120483,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977773427963257,
"step": 564
},
{
"completion_length": 198.34375,
"epoch": 1.8064,
"grad_norm": 1.7468022108078003,
"kl": 0.110595703125,
"learning_rate": 2.95e-07,
"loss": 0.0011,
"reward": 3.8688454627990723,
"reward_std": 0.01723374053835869,
"rewards/answer_entity_reward": 0.9888257682323456,
"rewards/answer_wer_reward": 0.9232835471630096,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9567362368106842,
"step": 565
},
{
"completion_length": 166.21875,
"epoch": 1.8096,
"grad_norm": 3.4565577507019043,
"kl": 0.12451171875,
"learning_rate": 2.9375e-07,
"loss": 0.0012,
"reward": 3.8460001945495605,
"reward_std": 0.12010016990825534,
"rewards/answer_entity_reward": 0.9685245454311371,
"rewards/answer_wer_reward": 0.905397355556488,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9720781445503235,
"step": 566
},
{
"completion_length": 204.375,
"epoch": 1.8128,
"grad_norm": 2.109642267227173,
"kl": 0.1474609375,
"learning_rate": 2.9249999999999995e-07,
"loss": 0.0015,
"reward": 3.9155898094177246,
"reward_std": 0.021943609230220318,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9553306102752686,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9650669693946838,
"step": 567
},
{
"completion_length": 231.5625,
"epoch": 1.8159999999999998,
"grad_norm": 1.4336498975753784,
"kl": 0.101318359375,
"learning_rate": 2.9125e-07,
"loss": 0.001,
"reward": 3.9311413764953613,
"reward_std": 0.012714509852230549,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9397754371166229,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968807101249695,
"step": 568
},
{
"completion_length": 204.8125,
"epoch": 1.8192,
"grad_norm": 2.3991148471832275,
"kl": 0.0830078125,
"learning_rate": 2.9e-07,
"loss": 0.0008,
"reward": 3.843847155570984,
"reward_std": 0.1981589295901358,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9109402298927307,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9641570150852203,
"step": 569
},
{
"completion_length": 232.46875,
"epoch": 1.8224,
"grad_norm": 1.6885050535202026,
"kl": 0.085693359375,
"learning_rate": 2.8875e-07,
"loss": 0.0009,
"reward": 3.8632709980010986,
"reward_std": 0.08977647870779037,
"rewards/answer_entity_reward": 0.9838541746139526,
"rewards/answer_wer_reward": 0.9250127673149109,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9544041156768799,
"step": 570
},
{
"completion_length": 277.78125,
"epoch": 1.8256000000000001,
"grad_norm": 1.6569448709487915,
"kl": 0.09619140625,
"learning_rate": 2.8749999999999995e-07,
"loss": 0.001,
"reward": 3.70079243183136,
"reward_std": 0.15860513970255852,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.901878148317337,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.8614143133163452,
"step": 571
},
{
"completion_length": 248.0625,
"epoch": 1.8288,
"grad_norm": 1.72274649143219,
"kl": 0.08251953125,
"learning_rate": 2.8625e-07,
"loss": 0.0008,
"reward": 3.8939234018325806,
"reward_std": 0.016017161309719086,
"rewards/answer_entity_reward": 0.9803321659564972,
"rewards/answer_wer_reward": 0.9138159155845642,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997751712799072,
"step": 572
},
{
"completion_length": 222.1875,
"epoch": 1.8319999999999999,
"grad_norm": 5.086897373199463,
"kl": 0.3359375,
"learning_rate": 2.8499999999999997e-07,
"loss": 0.0034,
"reward": 3.931598663330078,
"reward_std": 0.02384120598435402,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9370801150798798,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997632503509521,
"step": 573
},
{
"completion_length": 216.3125,
"epoch": 1.8352,
"grad_norm": 1.5536502599716187,
"kl": 0.089599609375,
"learning_rate": 2.8375e-07,
"loss": 0.0009,
"reward": 3.941322922706604,
"reward_std": 0.018783860839903355,
"rewards/answer_entity_reward": 0.9833333194255829,
"rewards/answer_wer_reward": 0.9589866697788239,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990028738975525,
"step": 574
},
{
"completion_length": 232.9375,
"epoch": 1.8384,
"grad_norm": 1.4012224674224854,
"kl": 0.09033203125,
"learning_rate": 2.8249999999999994e-07,
"loss": 0.0009,
"reward": 3.9058892726898193,
"reward_std": 0.05868656514212489,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.9166894555091858,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996165633201599,
"step": 575
},
{
"completion_length": 202.125,
"epoch": 1.8416000000000001,
"grad_norm": 3.967221260070801,
"kl": 0.098388671875,
"learning_rate": 2.8125e-07,
"loss": 0.001,
"reward": 3.936561346054077,
"reward_std": 0.028190571581944823,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9553852677345276,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9811758995056152,
"step": 576
},
{
"completion_length": 248.25,
"epoch": 1.8448,
"grad_norm": 3.581430673599243,
"kl": 0.19970703125,
"learning_rate": 2.8e-07,
"loss": 0.002,
"reward": 3.854965329170227,
"reward_std": 0.08418525848537683,
"rewards/answer_entity_reward": 0.9715560376644135,
"rewards/answer_wer_reward": 0.9045931100845337,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9788161218166351,
"step": 577
},
{
"completion_length": 199.25,
"epoch": 1.8479999999999999,
"grad_norm": 3.7948851585388184,
"kl": 0.147705078125,
"learning_rate": 2.7875e-07,
"loss": 0.0015,
"reward": 3.923743486404419,
"reward_std": 0.042667020577937365,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9548681676387787,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9688753485679626,
"step": 578
},
{
"completion_length": 193.34375,
"epoch": 1.8512,
"grad_norm": 2.4876842498779297,
"kl": 0.102294921875,
"learning_rate": 2.775e-07,
"loss": 0.001,
"reward": 3.97040331363678,
"reward_std": 0.01486315974034369,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.973244309425354,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 579
},
{
"completion_length": 223.3125,
"epoch": 1.8544,
"grad_norm": 4.710970878601074,
"kl": 0.14501953125,
"learning_rate": 2.7625e-07,
"loss": 0.0015,
"reward": 3.9073562622070312,
"reward_std": 0.040397388860583305,
"rewards/answer_entity_reward": 0.9927884638309479,
"rewards/answer_wer_reward": 0.9182255864143372,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963421821594238,
"step": 580
},
{
"completion_length": 240.75,
"epoch": 1.8576000000000001,
"grad_norm": 1.0144391059875488,
"kl": 0.089599609375,
"learning_rate": 2.75e-07,
"loss": 0.0009,
"reward": 3.8770586252212524,
"reward_std": 0.030949956737458706,
"rewards/answer_entity_reward": 0.9654052257537842,
"rewards/answer_wer_reward": 0.9127066433429718,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989466071128845,
"step": 581
},
{
"completion_length": 240.46875,
"epoch": 1.8608,
"grad_norm": 33.290077209472656,
"kl": 0.14111328125,
"learning_rate": 2.7374999999999997e-07,
"loss": 0.0014,
"reward": 3.931227445602417,
"reward_std": 0.017369844019412994,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9346133172512054,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990180134773254,
"step": 582
},
{
"completion_length": 247.1875,
"epoch": 1.8639999999999999,
"grad_norm": 1.7812319993972778,
"kl": 0.083984375,
"learning_rate": 2.725e-07,
"loss": 0.0008,
"reward": 3.885915160179138,
"reward_std": 0.0845849048346281,
"rewards/answer_entity_reward": 0.9893162548542023,
"rewards/answer_wer_reward": 0.9280518591403961,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9685470759868622,
"step": 583
},
{
"completion_length": 208.0625,
"epoch": 1.8672,
"grad_norm": 3.882129192352295,
"kl": 0.133544921875,
"learning_rate": 2.7125e-07,
"loss": 0.0013,
"reward": 3.8892873525619507,
"reward_std": 0.04396933689713478,
"rewards/answer_entity_reward": 0.9919143319129944,
"rewards/answer_wer_reward": 0.9468680024147034,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9505050778388977,
"step": 584
},
{
"completion_length": 254.6875,
"epoch": 1.8704,
"grad_norm": 1.1797689199447632,
"kl": 0.09619140625,
"learning_rate": 2.7e-07,
"loss": 0.001,
"reward": 3.9246060848236084,
"reward_std": 0.021186589263379574,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9278402030467987,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991695880889893,
"step": 585
},
{
"completion_length": 225.9375,
"epoch": 1.8736000000000002,
"grad_norm": 1.4846960306167603,
"kl": 0.1142578125,
"learning_rate": 2.6874999999999997e-07,
"loss": 0.0011,
"reward": 3.9609856605529785,
"reward_std": 0.024455342907458544,
"rewards/answer_entity_reward": 0.9909090995788574,
"rewards/answer_wer_reward": 0.9731672704219818,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969093799591064,
"step": 586
},
{
"completion_length": 214.03125,
"epoch": 1.8768,
"grad_norm": 1.4108463525772095,
"kl": 0.2236328125,
"learning_rate": 2.675e-07,
"loss": 0.0022,
"reward": 3.9008651971817017,
"reward_std": 0.023720702156424522,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9427990317344666,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9615384638309479,
"step": 587
},
{
"completion_length": 257.34375,
"epoch": 1.88,
"grad_norm": 2.2120485305786133,
"kl": 0.085693359375,
"learning_rate": 2.6625e-07,
"loss": 0.0009,
"reward": 3.905014157295227,
"reward_std": 0.02011673618108034,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9055063128471375,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999507874250412,
"step": 588
},
{
"completion_length": 217.21875,
"epoch": 1.8832,
"grad_norm": 2.6982715129852295,
"kl": 0.08935546875,
"learning_rate": 2.65e-07,
"loss": 0.0009,
"reward": 3.9260659217834473,
"reward_std": 0.02971976064145565,
"rewards/answer_entity_reward": 0.9871068000793457,
"rewards/answer_wer_reward": 0.9389589130878448,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 589
},
{
"completion_length": 253.65625,
"epoch": 1.8864,
"grad_norm": 1.0963667631149292,
"kl": 0.081298828125,
"learning_rate": 2.6374999999999996e-07,
"loss": 0.0008,
"reward": 3.9269603490829468,
"reward_std": 0.02615117933601141,
"rewards/answer_entity_reward": 0.9908459782600403,
"rewards/answer_wer_reward": 0.9372480809688568,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988662898540497,
"step": 590
},
{
"completion_length": 204.09375,
"epoch": 1.8896,
"grad_norm": 2.6849443912506104,
"kl": 0.128173828125,
"learning_rate": 2.625e-07,
"loss": 0.0013,
"reward": 3.912359118461609,
"reward_std": 0.025913351215422153,
"rewards/answer_entity_reward": 0.9798610806465149,
"rewards/answer_wer_reward": 0.9567141532897949,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9757837951183319,
"step": 591
},
{
"completion_length": 229.75,
"epoch": 1.8928,
"grad_norm": 12.276920318603516,
"kl": 0.524169921875,
"learning_rate": 2.6125e-07,
"loss": 0.0052,
"reward": 3.8932021856307983,
"reward_std": 0.014225118793547153,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9339624643325806,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9610779881477356,
"step": 592
},
{
"completion_length": 172.0,
"epoch": 1.896,
"grad_norm": 3.136312961578369,
"kl": 0.197509765625,
"learning_rate": 2.6e-07,
"loss": 0.002,
"reward": 3.927412748336792,
"reward_std": 0.02914919052273035,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9748775362968445,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9525350630283356,
"step": 593
},
{
"completion_length": 231.625,
"epoch": 1.8992,
"grad_norm": 1.4952311515808105,
"kl": 0.0966796875,
"learning_rate": 2.5874999999999996e-07,
"loss": 0.001,
"reward": 3.920572519302368,
"reward_std": 0.023940533865243196,
"rewards/answer_entity_reward": 0.9852676391601562,
"rewards/answer_wer_reward": 0.9361679553985596,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991368651390076,
"step": 594
},
{
"completion_length": 222.71875,
"epoch": 1.9024,
"grad_norm": 1.1621572971343994,
"kl": 0.10302734375,
"learning_rate": 2.5749999999999997e-07,
"loss": 0.001,
"reward": 3.8978298902511597,
"reward_std": 0.07608090154826641,
"rewards/answer_entity_reward": 0.9649057686328888,
"rewards/answer_wer_reward": 0.9345213770866394,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984027147293091,
"step": 595
},
{
"completion_length": 254.21875,
"epoch": 1.9056,
"grad_norm": 0.9472298622131348,
"kl": 0.092041015625,
"learning_rate": 2.5625e-07,
"loss": 0.0009,
"reward": 3.916618824005127,
"reward_std": 0.015597880817949772,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9208222925662994,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978796541690826,
"step": 596
},
{
"completion_length": 247.84375,
"epoch": 1.9088,
"grad_norm": 1.7148473262786865,
"kl": 0.106201171875,
"learning_rate": 2.55e-07,
"loss": 0.0011,
"reward": 3.9292455911636353,
"reward_std": 0.007026449544355273,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9444275796413422,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987069070339203,
"step": 597
},
{
"completion_length": 183.21875,
"epoch": 1.912,
"grad_norm": 1.6502317190170288,
"kl": 0.119140625,
"learning_rate": 2.5374999999999995e-07,
"loss": 0.0012,
"reward": 3.9383983612060547,
"reward_std": 0.03170687519013882,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9614830911159515,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.981379508972168,
"step": 598
},
{
"completion_length": 167.625,
"epoch": 1.9152,
"grad_norm": 1.1803314685821533,
"kl": 0.137939453125,
"learning_rate": 2.5249999999999996e-07,
"loss": 0.0014,
"reward": 3.9067423343658447,
"reward_std": 0.013731301296502352,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.949960470199585,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9567819237709045,
"step": 599
},
{
"completion_length": 199.75,
"epoch": 1.9184,
"grad_norm": 1.3902597427368164,
"kl": 0.080322265625,
"learning_rate": 2.5125e-07,
"loss": 0.0008,
"reward": 3.927718758583069,
"reward_std": 0.02047483716160059,
"rewards/answer_entity_reward": 0.9840544760227203,
"rewards/answer_wer_reward": 0.9543785154819489,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9892857074737549,
"step": 600
},
{
"completion_length": 210.375,
"epoch": 1.9216,
"grad_norm": 1.122063159942627,
"kl": 0.120361328125,
"learning_rate": 2.5e-07,
"loss": 0.0012,
"reward": 3.942714214324951,
"reward_std": 0.027352871373295784,
"rewards/answer_entity_reward": 0.9909090995788574,
"rewards/answer_wer_reward": 0.9525187313556671,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992863833904266,
"step": 601
},
{
"completion_length": 191.28125,
"epoch": 1.9247999999999998,
"grad_norm": 1.9480561017990112,
"kl": 0.092529296875,
"learning_rate": 2.4875e-07,
"loss": 0.0009,
"reward": 3.8946096897125244,
"reward_std": 0.07258242554962635,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9430054724216461,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9629679620265961,
"step": 602
},
{
"completion_length": 210.0,
"epoch": 1.928,
"grad_norm": 1.522335171699524,
"kl": 0.08203125,
"learning_rate": 2.475e-07,
"loss": 0.0008,
"reward": 3.9419585466384888,
"reward_std": 0.02094284538179636,
"rewards/answer_entity_reward": 0.9960784316062927,
"rewards/answer_wer_reward": 0.9462659358978271,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996141791343689,
"step": 603
},
{
"completion_length": 194.375,
"epoch": 1.9312,
"grad_norm": 1.9648785591125488,
"kl": 0.2880859375,
"learning_rate": 2.4624999999999997e-07,
"loss": 0.0029,
"reward": 3.9480878114700317,
"reward_std": 0.015357580035924911,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9483262896537781,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997614622116089,
"step": 604
},
{
"completion_length": 220.75,
"epoch": 1.9344000000000001,
"grad_norm": 2.2600207328796387,
"kl": 0.09814453125,
"learning_rate": 2.45e-07,
"loss": 0.001,
"reward": 3.92673122882843,
"reward_std": 0.025613561272621155,
"rewards/answer_entity_reward": 0.9899475276470184,
"rewards/answer_wer_reward": 0.9408612251281738,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9959224164485931,
"step": 605
},
{
"completion_length": 159.125,
"epoch": 1.9376,
"grad_norm": 3.3259623050689697,
"kl": 0.15869140625,
"learning_rate": 2.4375e-07,
"loss": 0.0016,
"reward": 3.9267284870147705,
"reward_std": 0.024998134351335466,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.9395100474357605,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997184872627258,
"step": 606
},
{
"completion_length": 250.0625,
"epoch": 1.9407999999999999,
"grad_norm": 1.4518193006515503,
"kl": 0.1357421875,
"learning_rate": 2.425e-07,
"loss": 0.0014,
"reward": 3.825323224067688,
"reward_std": 0.11214365810155869,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.9456245005130768,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9172618985176086,
"step": 607
},
{
"completion_length": 219.53125,
"epoch": 1.944,
"grad_norm": 1.2852040529251099,
"kl": 0.081298828125,
"learning_rate": 2.4124999999999997e-07,
"loss": 0.0008,
"reward": 3.9620739221572876,
"reward_std": 0.00826547248288989,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9646617472171783,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974121451377869,
"step": 608
},
{
"completion_length": 232.875,
"epoch": 1.9472,
"grad_norm": 4.870666027069092,
"kl": 0.111328125,
"learning_rate": 2.4e-07,
"loss": 0.0011,
"reward": 3.9366871118545532,
"reward_std": 0.022448008647188544,
"rewards/answer_entity_reward": 0.9916141629219055,
"rewards/answer_wer_reward": 0.9457239210605621,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993489384651184,
"step": 609
},
{
"completion_length": 241.625,
"epoch": 1.9504000000000001,
"grad_norm": 1.8101410865783691,
"kl": 0.096435546875,
"learning_rate": 2.3875e-07,
"loss": 0.001,
"reward": 3.9368724822998047,
"reward_std": 0.022243991494178772,
"rewards/answer_entity_reward": 0.9929924309253693,
"rewards/answer_wer_reward": 0.943880021572113,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 610
},
{
"completion_length": 223.0,
"epoch": 1.9536,
"grad_norm": 0.8068660497665405,
"kl": 0.099609375,
"learning_rate": 2.3749999999999998e-07,
"loss": 0.001,
"reward": 3.9270153045654297,
"reward_std": 0.01834964146837592,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.931628555059433,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988590180873871,
"step": 611
},
{
"completion_length": 259.59375,
"epoch": 1.9567999999999999,
"grad_norm": 1.522141695022583,
"kl": 0.08203125,
"learning_rate": 2.3625e-07,
"loss": 0.0008,
"reward": 3.9466216564178467,
"reward_std": 0.015272928401827812,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9516074061393738,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984864592552185,
"step": 612
},
{
"completion_length": 211.6875,
"epoch": 1.96,
"grad_norm": 5.929853916168213,
"kl": 0.10205078125,
"learning_rate": 2.3499999999999997e-07,
"loss": 0.001,
"reward": 3.912535309791565,
"reward_std": 0.026867160573601723,
"rewards/answer_entity_reward": 0.9908565580844879,
"rewards/answer_wer_reward": 0.9229700565338135,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987086653709412,
"step": 613
},
{
"completion_length": 222.40625,
"epoch": 1.9632,
"grad_norm": 2.7727534770965576,
"kl": 0.106689453125,
"learning_rate": 2.3375e-07,
"loss": 0.0011,
"reward": 3.9305132627487183,
"reward_std": 0.07138971472159028,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9646386206150055,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9658747315406799,
"step": 614
},
{
"completion_length": 222.34375,
"epoch": 1.9664000000000001,
"grad_norm": 1.5660823583602905,
"kl": 0.09912109375,
"learning_rate": 2.325e-07,
"loss": 0.001,
"reward": 3.935341477394104,
"reward_std": 0.01785436598584056,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9401703774929047,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997254341840744,
"step": 615
},
{
"completion_length": 236.34375,
"epoch": 1.9696,
"grad_norm": 1.94826340675354,
"kl": 0.078857421875,
"learning_rate": 2.3125e-07,
"loss": 0.0008,
"reward": 3.958570718765259,
"reward_std": 0.008256069151684642,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9587988257408142,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997718930244446,
"step": 616
},
{
"completion_length": 143.5,
"epoch": 1.9727999999999999,
"grad_norm": 2.0813863277435303,
"kl": 0.121337890625,
"learning_rate": 2.3e-07,
"loss": 0.0012,
"reward": 3.886753797531128,
"reward_std": 0.027786132879555225,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9573519229888916,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9294019639492035,
"step": 617
},
{
"completion_length": 227.75,
"epoch": 1.976,
"grad_norm": 8.525589942932129,
"kl": 0.097900390625,
"learning_rate": 2.2875e-07,
"loss": 0.001,
"reward": 3.857698917388916,
"reward_std": 0.07474052533507347,
"rewards/answer_entity_reward": 0.9985119104385376,
"rewards/answer_wer_reward": 0.9551934599876404,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9039934873580933,
"step": 618
},
{
"completion_length": 233.46875,
"epoch": 1.9792,
"grad_norm": 1.966539978981018,
"kl": 0.082763671875,
"learning_rate": 2.275e-07,
"loss": 0.0008,
"reward": 3.9442771673202515,
"reward_std": 0.018204713938757777,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9551240801811218,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9948348999023438,
"step": 619
},
{
"completion_length": 254.59375,
"epoch": 1.9824000000000002,
"grad_norm": 2.300699234008789,
"kl": 0.338134765625,
"learning_rate": 2.2625e-07,
"loss": 0.0034,
"reward": 3.9195804595947266,
"reward_std": 0.014696986880153418,
"rewards/answer_entity_reward": 0.974116176366806,
"rewards/answer_wer_reward": 0.9479033648967743,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975608885288239,
"step": 620
},
{
"completion_length": 239.71875,
"epoch": 1.9856,
"grad_norm": 20.98819923400879,
"kl": 0.16015625,
"learning_rate": 2.25e-07,
"loss": 0.0016,
"reward": 3.858734607696533,
"reward_std": 0.14412511140108109,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9278987050056458,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9683361053466797,
"step": 621
},
{
"completion_length": 225.1875,
"epoch": 1.9888,
"grad_norm": 2.4856221675872803,
"kl": 0.1181640625,
"learning_rate": 2.2375e-07,
"loss": 0.0012,
"reward": 3.92032527923584,
"reward_std": 0.030348293483257294,
"rewards/answer_entity_reward": 0.9947916567325592,
"rewards/answer_wer_reward": 0.9266910254955292,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988425970077515,
"step": 622
},
{
"completion_length": 209.4375,
"epoch": 1.992,
"grad_norm": 2.2857937812805176,
"kl": 0.109619140625,
"learning_rate": 2.225e-07,
"loss": 0.0011,
"reward": 3.790956974029541,
"reward_std": 0.07298576645553112,
"rewards/answer_entity_reward": 0.993697464466095,
"rewards/answer_wer_reward": 0.93813356757164,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8591260015964508,
"step": 623
},
{
"completion_length": 182.90625,
"epoch": 1.9952,
"grad_norm": 4.463064670562744,
"kl": 0.126953125,
"learning_rate": 2.2125e-07,
"loss": 0.0013,
"reward": 3.906226873397827,
"reward_std": 0.0698380870744586,
"rewards/answer_entity_reward": 0.992799699306488,
"rewards/answer_wer_reward": 0.9589782953262329,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9544488191604614,
"step": 624
},
{
"completion_length": 232.875,
"epoch": 1.9984,
"grad_norm": 1.20980966091156,
"kl": 0.1044921875,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.001,
"reward": 3.9167827367782593,
"reward_std": 0.01762760616838932,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9183346629142761,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984481334686279,
"step": 625
},
{
"completion_length": 177.75,
"epoch": 2.0,
"grad_norm": 2.3776934146881104,
"kl": 0.11865234375,
"learning_rate": 2.1875e-07,
"loss": 0.0006,
"reward": 3.8532142639160156,
"reward_std": 0.018374208360910416,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9639798402786255,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8929109573364258,
"step": 626
},
{
"completion_length": 234.6875,
"epoch": 2.0032,
"grad_norm": 1.611534833908081,
"kl": 0.13134765625,
"learning_rate": 2.1749999999999998e-07,
"loss": 0.0013,
"reward": 3.9422988891601562,
"reward_std": 0.020460932981222868,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9433672726154327,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989316165447235,
"step": 627
},
{
"completion_length": 174.4375,
"epoch": 2.0064,
"grad_norm": 3.054837942123413,
"kl": 0.1240234375,
"learning_rate": 2.1625e-07,
"loss": 0.0012,
"reward": 3.958639144897461,
"reward_std": 0.009486648719757795,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9633738994598389,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9952651560306549,
"step": 628
},
{
"completion_length": 222.15625,
"epoch": 2.0096,
"grad_norm": 4.340118885040283,
"kl": 0.093017578125,
"learning_rate": 2.1499999999999998e-07,
"loss": 0.0009,
"reward": 3.8726435899734497,
"reward_std": 0.033597009256482124,
"rewards/answer_entity_reward": 0.9919143319129944,
"rewards/answer_wer_reward": 0.9492302238941193,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9314990937709808,
"step": 629
},
{
"completion_length": 207.8125,
"epoch": 2.0128,
"grad_norm": 2.162853717803955,
"kl": 0.109130859375,
"learning_rate": 2.1375e-07,
"loss": 0.0011,
"reward": 3.9318206310272217,
"reward_std": 0.01979170460253954,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9489758312702179,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9863169491291046,
"step": 630
},
{
"completion_length": 239.90625,
"epoch": 2.016,
"grad_norm": 1.676960825920105,
"kl": 0.182861328125,
"learning_rate": 2.1249999999999998e-07,
"loss": 0.0018,
"reward": 3.937206506729126,
"reward_std": 0.014235546346753836,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9372064471244812,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 631
},
{
"completion_length": 194.28125,
"epoch": 2.0192,
"grad_norm": 5.123164176940918,
"kl": 0.110107421875,
"learning_rate": 2.1125e-07,
"loss": 0.0011,
"reward": 3.7548152208328247,
"reward_std": 0.10348369181156158,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.8692809343338013,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8912160098552704,
"step": 632
},
{
"completion_length": 214.5625,
"epoch": 2.0224,
"grad_norm": 1.3529505729675293,
"kl": 0.106689453125,
"learning_rate": 2.0999999999999997e-07,
"loss": 0.0011,
"reward": 3.8920629024505615,
"reward_std": 0.01197694381698966,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9270462095737457,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9650167524814606,
"step": 633
},
{
"completion_length": 194.25,
"epoch": 2.0256,
"grad_norm": 1.6181930303573608,
"kl": 0.109130859375,
"learning_rate": 2.0874999999999999e-07,
"loss": 0.0011,
"reward": 3.9565629959106445,
"reward_std": 0.021904858760535717,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9613706469535828,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 634
},
{
"completion_length": 206.59375,
"epoch": 2.0288,
"grad_norm": 2.353773832321167,
"kl": 0.0986328125,
"learning_rate": 2.0749999999999997e-07,
"loss": 0.001,
"reward": 3.919626474380493,
"reward_std": 0.02727056946605444,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.9333742260932922,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987522065639496,
"step": 635
},
{
"completion_length": 189.71875,
"epoch": 2.032,
"grad_norm": 1.6075130701065063,
"kl": 0.13720703125,
"learning_rate": 2.0624999999999998e-07,
"loss": 0.0014,
"reward": 3.9046106338500977,
"reward_std": 0.025621079374104738,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9407951831817627,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9694972634315491,
"step": 636
},
{
"completion_length": 230.46875,
"epoch": 2.0352,
"grad_norm": 5.240235805511475,
"kl": 0.10302734375,
"learning_rate": 2.0499999999999997e-07,
"loss": 0.001,
"reward": 3.9211699962615967,
"reward_std": 0.017814213410019875,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9250318109989166,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982215464115143,
"step": 637
},
{
"completion_length": 209.59375,
"epoch": 2.0384,
"grad_norm": 2.4782729148864746,
"kl": 0.083984375,
"learning_rate": 2.0374999999999998e-07,
"loss": 0.0008,
"reward": 3.894644021987915,
"reward_std": 0.020965205505490303,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9609961807727814,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9336478412151337,
"step": 638
},
{
"completion_length": 233.5,
"epoch": 2.0416,
"grad_norm": 1.102921485900879,
"kl": 0.089599609375,
"learning_rate": 2.025e-07,
"loss": 0.0009,
"reward": 3.9374464750289917,
"reward_std": 0.015141086652874947,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9409857094287872,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993016719818115,
"step": 639
},
{
"completion_length": 227.84375,
"epoch": 2.0448,
"grad_norm": 1.3384666442871094,
"kl": 0.0908203125,
"learning_rate": 2.0125e-07,
"loss": 0.0009,
"reward": 3.9045239686965942,
"reward_std": 0.12723269453272223,
"rewards/answer_entity_reward": 0.96875,
"rewards/answer_wer_reward": 0.9367768168449402,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998997151851654,
"step": 640
},
{
"completion_length": 175.625,
"epoch": 2.048,
"grad_norm": 0.6850874423980713,
"kl": 0.124267578125,
"learning_rate": 2e-07,
"loss": 0.0012,
"reward": 3.929681897163391,
"reward_std": 0.008345533395186067,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9302853643894196,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993966221809387,
"step": 641
},
{
"completion_length": 213.59375,
"epoch": 2.0512,
"grad_norm": 2.9222097396850586,
"kl": 0.101318359375,
"learning_rate": 1.9875e-07,
"loss": 0.001,
"reward": 3.8092339038848877,
"reward_std": 0.11687304638326168,
"rewards/answer_entity_reward": 0.9529532790184021,
"rewards/answer_wer_reward": 0.8997257351875305,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9565548896789551,
"step": 642
},
{
"completion_length": 205.5,
"epoch": 2.0544,
"grad_norm": 1.1586568355560303,
"kl": 0.092041015625,
"learning_rate": 1.975e-07,
"loss": 0.0009,
"reward": 3.9247117042541504,
"reward_std": 0.011728376615792513,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.935352236032486,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9893594086170197,
"step": 643
},
{
"completion_length": 203.09375,
"epoch": 2.0576,
"grad_norm": 1.5699268579483032,
"kl": 0.09326171875,
"learning_rate": 1.9625e-07,
"loss": 0.0009,
"reward": 3.9444518089294434,
"reward_std": 0.020181890577077866,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9571816027164459,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9907424449920654,
"step": 644
},
{
"completion_length": 203.4375,
"epoch": 2.0608,
"grad_norm": 1.7927268743515015,
"kl": 0.15478515625,
"learning_rate": 1.9499999999999999e-07,
"loss": 0.0015,
"reward": 3.9478741884231567,
"reward_std": 0.01690173940733075,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9509572982788086,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997577667236328,
"step": 645
},
{
"completion_length": 249.40625,
"epoch": 2.064,
"grad_norm": 1.3610011339187622,
"kl": 0.09033203125,
"learning_rate": 1.9375e-07,
"loss": 0.0009,
"reward": 3.817861795425415,
"reward_std": 0.1958598094061017,
"rewards/answer_entity_reward": 0.990950733423233,
"rewards/answer_wer_reward": 0.8910082578659058,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9671527743339539,
"step": 646
},
{
"completion_length": 211.84375,
"epoch": 2.0672,
"grad_norm": 1.7078856229782104,
"kl": 0.104248046875,
"learning_rate": 1.9249999999999998e-07,
"loss": 0.001,
"reward": 3.9115630388259888,
"reward_std": 0.024524363689124584,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9344038963317871,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9800000190734863,
"step": 647
},
{
"completion_length": 250.3125,
"epoch": 2.0704,
"grad_norm": 1.6208539009094238,
"kl": 0.10205078125,
"learning_rate": 1.9125e-07,
"loss": 0.001,
"reward": 3.8414340019226074,
"reward_std": 0.15159638598561287,
"rewards/answer_entity_reward": 0.9867424070835114,
"rewards/answer_wer_reward": 0.9219352900981903,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.964006245136261,
"step": 648
},
{
"completion_length": 191.21875,
"epoch": 2.0736,
"grad_norm": 2.747303009033203,
"kl": 0.123046875,
"learning_rate": 1.8999999999999998e-07,
"loss": 0.0012,
"reward": 3.929761052131653,
"reward_std": 0.029091503005474806,
"rewards/answer_entity_reward": 0.9930555522441864,
"rewards/answer_wer_reward": 0.9552291929721832,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9814762473106384,
"step": 649
},
{
"completion_length": 242.21875,
"epoch": 2.0768,
"grad_norm": 1.213749647140503,
"kl": 0.08203125,
"learning_rate": 1.8875e-07,
"loss": 0.0008,
"reward": 3.9264228343963623,
"reward_std": 0.022060640156269073,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9312105178833008,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9972956776618958,
"step": 650
},
{
"completion_length": 220.65625,
"epoch": 2.08,
"grad_norm": 6.092029571533203,
"kl": 0.100830078125,
"learning_rate": 1.875e-07,
"loss": 0.001,
"reward": 3.9253735542297363,
"reward_std": 0.07221902348101139,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9613818228244781,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9668327569961548,
"step": 651
},
{
"completion_length": 182.5625,
"epoch": 2.0832,
"grad_norm": 1.7553961277008057,
"kl": 0.11279296875,
"learning_rate": 1.8625e-07,
"loss": 0.0011,
"reward": 3.8646005392074585,
"reward_std": 0.14707163721323013,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.931645005941391,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9676778018474579,
"step": 652
},
{
"completion_length": 232.3125,
"epoch": 2.0864,
"grad_norm": 1.1559618711471558,
"kl": 0.091796875,
"learning_rate": 1.85e-07,
"loss": 0.0009,
"reward": 3.957954168319702,
"reward_std": 0.013995198532938957,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.963512659072876,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986082315444946,
"step": 653
},
{
"completion_length": 223.15625,
"epoch": 2.0896,
"grad_norm": 2.3205788135528564,
"kl": 0.10302734375,
"learning_rate": 1.8375e-07,
"loss": 0.001,
"reward": 3.9284400939941406,
"reward_std": 0.02101885131560266,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9305233955383301,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 654
},
{
"completion_length": 174.03125,
"epoch": 2.0928,
"grad_norm": 3.2282309532165527,
"kl": 0.089599609375,
"learning_rate": 1.825e-07,
"loss": 0.0009,
"reward": 3.913803219795227,
"reward_std": 0.06426881160587072,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9792838096618652,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.944936066865921,
"step": 655
},
{
"completion_length": 209.40625,
"epoch": 2.096,
"grad_norm": 3.0301027297973633,
"kl": 0.15478515625,
"learning_rate": 1.8124999999999999e-07,
"loss": 0.0015,
"reward": 3.7523492574691772,
"reward_std": 0.15331693179905415,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9194472134113312,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8641521334648132,
"step": 656
},
{
"completion_length": 183.46875,
"epoch": 2.0992,
"grad_norm": 3.859424591064453,
"kl": 0.10498046875,
"learning_rate": 1.8e-07,
"loss": 0.001,
"reward": 3.9188934564590454,
"reward_std": 0.016068585216999054,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9425098896026611,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9784668385982513,
"step": 657
},
{
"completion_length": 249.28125,
"epoch": 2.1024,
"grad_norm": 1.1957335472106934,
"kl": 0.0732421875,
"learning_rate": 1.7874999999999998e-07,
"loss": 0.0007,
"reward": 3.920902967453003,
"reward_std": 0.009091381449252367,
"rewards/answer_entity_reward": 0.982051283121109,
"rewards/answer_wer_reward": 0.9388516247272491,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 658
},
{
"completion_length": 215.1875,
"epoch": 2.1056,
"grad_norm": 0.9195266962051392,
"kl": 0.08349609375,
"learning_rate": 1.775e-07,
"loss": 0.0008,
"reward": 3.9518920183181763,
"reward_std": 0.00956010865047574,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9591186344623566,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9927734434604645,
"step": 659
},
{
"completion_length": 247.21875,
"epoch": 2.1088,
"grad_norm": 1.4949768781661987,
"kl": 0.109130859375,
"learning_rate": 1.7624999999999998e-07,
"loss": 0.0011,
"reward": 3.9060639142990112,
"reward_std": 0.029787511564791203,
"rewards/answer_entity_reward": 0.9838598966598511,
"rewards/answer_wer_reward": 0.9244924187660217,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977116584777832,
"step": 660
},
{
"completion_length": 213.15625,
"epoch": 2.112,
"grad_norm": 2.8325705528259277,
"kl": 0.08740234375,
"learning_rate": 1.75e-07,
"loss": 0.0009,
"reward": 3.951107144355774,
"reward_std": 0.019360109698027372,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9607862234115601,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9931618571281433,
"step": 661
},
{
"completion_length": 210.1875,
"epoch": 2.1152,
"grad_norm": 4.155531883239746,
"kl": 0.12890625,
"learning_rate": 1.7374999999999998e-07,
"loss": 0.0013,
"reward": 3.8224732875823975,
"reward_std": 0.200032701715827,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.909185916185379,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9480096995830536,
"step": 662
},
{
"completion_length": 220.59375,
"epoch": 2.1184,
"grad_norm": 1.299959421157837,
"kl": 0.091796875,
"learning_rate": 1.725e-07,
"loss": 0.0009,
"reward": 3.959660768508911,
"reward_std": 0.009787917137145996,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9610175788402557,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986431002616882,
"step": 663
},
{
"completion_length": 203.875,
"epoch": 2.1216,
"grad_norm": 1.2713404893875122,
"kl": 0.0849609375,
"learning_rate": 1.7125e-07,
"loss": 0.0008,
"reward": 3.913174271583557,
"reward_std": 0.03005001787096262,
"rewards/answer_entity_reward": 0.9858973920345306,
"rewards/answer_wer_reward": 0.9283359348773956,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989408254623413,
"step": 664
},
{
"completion_length": 235.6875,
"epoch": 2.1248,
"grad_norm": 13.369653701782227,
"kl": 0.167236328125,
"learning_rate": 1.7000000000000001e-07,
"loss": 0.0017,
"reward": 3.862402558326721,
"reward_std": 0.1546822851523757,
"rewards/answer_entity_reward": 0.9921875,
"rewards/answer_wer_reward": 0.9333168268203735,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9681483209133148,
"step": 665
},
{
"completion_length": 154.8125,
"epoch": 2.128,
"grad_norm": 35.12384033203125,
"kl": 0.115966796875,
"learning_rate": 1.6875e-07,
"loss": 0.0012,
"reward": 3.9367175102233887,
"reward_std": 0.02194784674793482,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9534772336483002,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9832403361797333,
"step": 666
},
{
"completion_length": 195.28125,
"epoch": 2.1312,
"grad_norm": 1.2815937995910645,
"kl": 0.107666015625,
"learning_rate": 1.675e-07,
"loss": 0.0011,
"reward": 3.937751293182373,
"reward_std": 0.014415924437344074,
"rewards/answer_entity_reward": 0.9930555522441864,
"rewards/answer_wer_reward": 0.9462690353393555,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984267055988312,
"step": 667
},
{
"completion_length": 225.90625,
"epoch": 2.1344,
"grad_norm": 0.840438723564148,
"kl": 0.12841796875,
"learning_rate": 1.6625e-07,
"loss": 0.0013,
"reward": 3.9389572143554688,
"reward_std": 0.01061929203569889,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.939858615398407,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990985691547394,
"step": 668
},
{
"completion_length": 186.6875,
"epoch": 2.1376,
"grad_norm": 1.6506493091583252,
"kl": 0.081787109375,
"learning_rate": 1.65e-07,
"loss": 0.0008,
"reward": 3.956157922744751,
"reward_std": 0.008805734105408192,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9561578929424286,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 669
},
{
"completion_length": 256.84375,
"epoch": 2.1408,
"grad_norm": 1.2955864667892456,
"kl": 0.134765625,
"learning_rate": 1.6375e-07,
"loss": 0.0013,
"reward": 3.924846053123474,
"reward_std": 0.016075235791504383,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9267153441905975,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998130738735199,
"step": 670
},
{
"completion_length": 205.8125,
"epoch": 2.144,
"grad_norm": 2.9848484992980957,
"kl": 0.09375,
"learning_rate": 1.625e-07,
"loss": 0.0009,
"reward": 3.920342206954956,
"reward_std": 0.017433147877454758,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9359186589717865,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9844235181808472,
"step": 671
},
{
"completion_length": 205.25,
"epoch": 2.1471999999999998,
"grad_norm": 3.0758063793182373,
"kl": 0.0888671875,
"learning_rate": 1.6125e-07,
"loss": 0.0009,
"reward": 3.94283390045166,
"reward_std": 0.03408639598637819,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9549511075019836,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9878826439380646,
"step": 672
},
{
"completion_length": 185.3125,
"epoch": 2.1504,
"grad_norm": 4.493408203125,
"kl": 0.1298828125,
"learning_rate": 1.6e-07,
"loss": 0.0013,
"reward": 3.7623226642608643,
"reward_std": 0.053579739294946194,
"rewards/answer_entity_reward": 0.9799679517745972,
"rewards/answer_wer_reward": 0.930513322353363,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8518414497375488,
"step": 673
},
{
"completion_length": 238.5,
"epoch": 2.1536,
"grad_norm": 1.0133973360061646,
"kl": 0.074462890625,
"learning_rate": 1.5875e-07,
"loss": 0.0008,
"reward": 3.949939250946045,
"reward_std": 0.01046135206706822,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9593237638473511,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997968465089798,
"step": 674
},
{
"completion_length": 204.75,
"epoch": 2.1568,
"grad_norm": 2.416959762573242,
"kl": 0.24658203125,
"learning_rate": 1.575e-07,
"loss": 0.0025,
"reward": 3.8200684785842896,
"reward_std": 0.014629668090492487,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9454044103622437,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8746639788150787,
"step": 675
},
{
"completion_length": 201.0625,
"epoch": 2.16,
"grad_norm": 1.1082431077957153,
"kl": 0.103515625,
"learning_rate": 1.5624999999999999e-07,
"loss": 0.001,
"reward": 3.9603604078292847,
"reward_std": 0.013338471297174692,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9646645486354828,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985367655754089,
"step": 676
},
{
"completion_length": 200.5,
"epoch": 2.1632,
"grad_norm": 1.243102788925171,
"kl": 0.088623046875,
"learning_rate": 1.55e-07,
"loss": 0.0009,
"reward": 3.9476585388183594,
"reward_std": 0.014779110439121723,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9507860839366913,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997133016586304,
"step": 677
},
{
"completion_length": 235.0,
"epoch": 2.1664,
"grad_norm": 1.9828643798828125,
"kl": 0.076171875,
"learning_rate": 1.5374999999999998e-07,
"loss": 0.0008,
"reward": 3.840447187423706,
"reward_std": 0.12814121507108212,
"rewards/answer_entity_reward": 0.9507211446762085,
"rewards/answer_wer_reward": 0.8900850713253021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996408224105835,
"step": 678
},
{
"completion_length": 209.0625,
"epoch": 2.1696,
"grad_norm": 1.306552529335022,
"kl": 0.09130859375,
"learning_rate": 1.525e-07,
"loss": 0.0009,
"reward": 3.9448314905166626,
"reward_std": 0.016167795285582542,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9459536671638489,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988778233528137,
"step": 679
},
{
"completion_length": 206.84375,
"epoch": 2.1728,
"grad_norm": 1.7964757680892944,
"kl": 0.1064453125,
"learning_rate": 1.5124999999999998e-07,
"loss": 0.0011,
"reward": 3.9424251317977905,
"reward_std": 0.0136543451808393,
"rewards/answer_entity_reward": 0.9895104765892029,
"rewards/answer_wer_reward": 0.9541302621364594,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987844526767731,
"step": 680
},
{
"completion_length": 244.21875,
"epoch": 2.176,
"grad_norm": 1.3341420888900757,
"kl": 0.08642578125,
"learning_rate": 1.5e-07,
"loss": 0.0009,
"reward": 3.9383710622787476,
"reward_std": 0.01660554250702262,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9446630477905273,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9981722235679626,
"step": 681
},
{
"completion_length": 248.84375,
"epoch": 2.1792,
"grad_norm": 0.9630815386772156,
"kl": 0.0888671875,
"learning_rate": 1.4874999999999998e-07,
"loss": 0.0009,
"reward": 3.949557065963745,
"reward_std": 0.01444097189232707,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9514444172382355,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9981126189231873,
"step": 682
},
{
"completion_length": 222.5625,
"epoch": 2.1824,
"grad_norm": 1.4436620473861694,
"kl": 0.091796875,
"learning_rate": 1.475e-07,
"loss": 0.0009,
"reward": 3.9340105056762695,
"reward_std": 0.011837240774184465,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9374523460865021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993990361690521,
"step": 683
},
{
"completion_length": 220.3125,
"epoch": 2.1856,
"grad_norm": 1.7951076030731201,
"kl": 0.13623046875,
"learning_rate": 1.4624999999999998e-07,
"loss": 0.0014,
"reward": 3.9159233570098877,
"reward_std": 0.022063229698687792,
"rewards/answer_entity_reward": 0.9825946092605591,
"rewards/answer_wer_reward": 0.935338944196701,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979897737503052,
"step": 684
},
{
"completion_length": 222.71875,
"epoch": 2.1888,
"grad_norm": 2.693173885345459,
"kl": 0.090576171875,
"learning_rate": 1.45e-07,
"loss": 0.0009,
"reward": 3.8880432844161987,
"reward_std": 0.03081614337861538,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9077447652816772,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9827025234699249,
"step": 685
},
{
"completion_length": 263.5,
"epoch": 2.192,
"grad_norm": 5.544942855834961,
"kl": 0.122802734375,
"learning_rate": 1.4374999999999997e-07,
"loss": 0.0012,
"reward": 3.906672716140747,
"reward_std": 0.017923741601407528,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9078539311885834,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988189041614532,
"step": 686
},
{
"completion_length": 219.0625,
"epoch": 2.1952,
"grad_norm": 1.3066198825836182,
"kl": 0.13720703125,
"learning_rate": 1.4249999999999999e-07,
"loss": 0.0014,
"reward": 3.8579492568969727,
"reward_std": 0.11750033870339394,
"rewards/answer_entity_reward": 0.9445319771766663,
"rewards/answer_wer_reward": 0.9204041659832001,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9930130541324615,
"step": 687
},
{
"completion_length": 203.09375,
"epoch": 2.1984,
"grad_norm": 2.9115042686462402,
"kl": 0.130859375,
"learning_rate": 1.4124999999999997e-07,
"loss": 0.0013,
"reward": 3.929637312889099,
"reward_std": 0.07596011366695166,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9678620994091034,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9652473330497742,
"step": 688
},
{
"completion_length": 205.375,
"epoch": 2.2016,
"grad_norm": 2.322467803955078,
"kl": 0.087158203125,
"learning_rate": 1.4e-07,
"loss": 0.0009,
"reward": 3.8996429443359375,
"reward_std": 0.06278708390891552,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.9460262954235077,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.959929883480072,
"step": 689
},
{
"completion_length": 225.0625,
"epoch": 2.2048,
"grad_norm": 2.730459213256836,
"kl": 0.0791015625,
"learning_rate": 1.3875e-07,
"loss": 0.0008,
"reward": 3.916098475456238,
"reward_std": 0.02135017653927207,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9546346664428711,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9677136838436127,
"step": 690
},
{
"completion_length": 154.09375,
"epoch": 2.208,
"grad_norm": 2.1384575366973877,
"kl": 0.08740234375,
"learning_rate": 1.375e-07,
"loss": 0.0009,
"reward": 3.8095338344573975,
"reward_std": 0.02021293295547366,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9482340812683105,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8612997531890869,
"step": 691
},
{
"completion_length": 160.6875,
"epoch": 2.2112,
"grad_norm": 1.9878817796707153,
"kl": 0.1083984375,
"learning_rate": 1.3625e-07,
"loss": 0.0011,
"reward": 3.8489197492599487,
"reward_std": 0.06898931134492159,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.929458349943161,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9194613993167877,
"step": 692
},
{
"completion_length": 209.0625,
"epoch": 2.2144,
"grad_norm": 1.895799994468689,
"kl": 0.11669921875,
"learning_rate": 1.35e-07,
"loss": 0.0012,
"reward": 3.897484064102173,
"reward_std": 0.02146145049482584,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.926066517829895,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9714176058769226,
"step": 693
},
{
"completion_length": 248.3125,
"epoch": 2.2176,
"grad_norm": 2.0095603466033936,
"kl": 0.1015625,
"learning_rate": 1.3375e-07,
"loss": 0.001,
"reward": 3.9220433235168457,
"reward_std": 0.014254164882004261,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9220432937145233,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 694
},
{
"completion_length": 223.0,
"epoch": 2.2208,
"grad_norm": 1.6252143383026123,
"kl": 0.12158203125,
"learning_rate": 1.325e-07,
"loss": 0.0012,
"reward": 3.9079580307006836,
"reward_std": 0.02597262989729643,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9434219896793365,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9645361006259918,
"step": 695
},
{
"completion_length": 187.5,
"epoch": 2.224,
"grad_norm": 1.387984275817871,
"kl": 0.0947265625,
"learning_rate": 1.3125e-07,
"loss": 0.0009,
"reward": 3.9440150260925293,
"reward_std": 0.015697208931669593,
"rewards/answer_entity_reward": 0.9816919267177582,
"rewards/answer_wer_reward": 0.9631733596324921,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991496503353119,
"step": 696
},
{
"completion_length": 203.34375,
"epoch": 2.2272,
"grad_norm": 2.2257113456726074,
"kl": 0.110595703125,
"learning_rate": 1.3e-07,
"loss": 0.0011,
"reward": 3.885230541229248,
"reward_std": 0.023641248233616352,
"rewards/answer_entity_reward": 0.9805992841720581,
"rewards/answer_wer_reward": 0.9495046138763428,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9551265835762024,
"step": 697
},
{
"completion_length": 201.8125,
"epoch": 2.2304,
"grad_norm": 1.595376968383789,
"kl": 0.076171875,
"learning_rate": 1.2874999999999998e-07,
"loss": 0.0008,
"reward": 3.9703818559646606,
"reward_std": 0.01011386327445507,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9731970131397247,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999588817358017,
"step": 698
},
{
"completion_length": 230.6875,
"epoch": 2.2336,
"grad_norm": 1.6279692649841309,
"kl": 0.12744140625,
"learning_rate": 1.275e-07,
"loss": 0.0013,
"reward": 3.9279537200927734,
"reward_std": 0.017515965271741152,
"rewards/answer_entity_reward": 0.9880050718784332,
"rewards/answer_wer_reward": 0.9430651664733887,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968834519386292,
"step": 699
},
{
"completion_length": 182.125,
"epoch": 2.2368,
"grad_norm": 2.8828890323638916,
"kl": 0.18505859375,
"learning_rate": 1.2624999999999998e-07,
"loss": 0.0019,
"reward": 3.8859431743621826,
"reward_std": 0.142324005253613,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.9579981565475464,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9655081927776337,
"step": 700
},
{
"completion_length": 209.03125,
"epoch": 2.24,
"grad_norm": 2.9951937198638916,
"kl": 0.12109375,
"learning_rate": 1.25e-07,
"loss": 0.0012,
"reward": 3.7733466625213623,
"reward_std": 0.025467259343713522,
"rewards/answer_entity_reward": 0.9886675775051117,
"rewards/answer_wer_reward": 0.9475079476833344,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8371710479259491,
"step": 701
},
{
"completion_length": 211.40625,
"epoch": 2.2432,
"grad_norm": 2.5615384578704834,
"kl": 0.14208984375,
"learning_rate": 1.2375e-07,
"loss": 0.0014,
"reward": 3.9001591205596924,
"reward_std": 0.03031878173351288,
"rewards/answer_entity_reward": 0.9910256266593933,
"rewards/answer_wer_reward": 0.957984060049057,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9511493742465973,
"step": 702
},
{
"completion_length": 240.6875,
"epoch": 2.2464,
"grad_norm": 1.6149277687072754,
"kl": 0.10888671875,
"learning_rate": 1.225e-07,
"loss": 0.0011,
"reward": 3.917873740196228,
"reward_std": 0.01580545213073492,
"rewards/answer_entity_reward": 0.9787845611572266,
"rewards/answer_wer_reward": 0.9405834674835205,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985057711601257,
"step": 703
},
{
"completion_length": 190.0625,
"epoch": 2.2496,
"grad_norm": 1.620892882347107,
"kl": 0.087646484375,
"learning_rate": 1.2125e-07,
"loss": 0.0009,
"reward": 3.954784393310547,
"reward_std": 0.03893708251416683,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.962031751871109,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990026652812958,
"step": 704
},
{
"completion_length": 170.0,
"epoch": 2.2528,
"grad_norm": 1.5188636779785156,
"kl": 0.111572265625,
"learning_rate": 1.2e-07,
"loss": 0.0011,
"reward": 3.9031065702438354,
"reward_std": 0.011392949614673853,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9356338381767273,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9813616275787354,
"step": 705
},
{
"completion_length": 211.46875,
"epoch": 2.2560000000000002,
"grad_norm": 2.7718734741210938,
"kl": 0.102294921875,
"learning_rate": 1.1874999999999999e-07,
"loss": 0.001,
"reward": 3.936674475669861,
"reward_std": 0.021578084211796522,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9491873383522034,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9874871671199799,
"step": 706
},
{
"completion_length": 255.0,
"epoch": 2.2592,
"grad_norm": 1.6890819072723389,
"kl": 0.099853515625,
"learning_rate": 1.1749999999999999e-07,
"loss": 0.001,
"reward": 3.9247710704803467,
"reward_std": 0.013670595828443766,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9279445707798004,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968266189098358,
"step": 707
},
{
"completion_length": 189.3125,
"epoch": 2.2624,
"grad_norm": 2.3591725826263428,
"kl": 0.111328125,
"learning_rate": 1.1625e-07,
"loss": 0.0011,
"reward": 3.929440498352051,
"reward_std": 0.018895008601248264,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9316463768482208,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99779412150383,
"step": 708
},
{
"completion_length": 202.125,
"epoch": 2.2656,
"grad_norm": 5.1716766357421875,
"kl": 0.142333984375,
"learning_rate": 1.15e-07,
"loss": 0.0014,
"reward": 3.9494107961654663,
"reward_std": 0.023188273422420025,
"rewards/answer_entity_reward": 0.9902146458625793,
"rewards/answer_wer_reward": 0.9693313241004944,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.989864856004715,
"step": 709
},
{
"completion_length": 241.90625,
"epoch": 2.2688,
"grad_norm": 2.9082345962524414,
"kl": 0.15087890625,
"learning_rate": 1.1375e-07,
"loss": 0.0015,
"reward": 3.877661347389221,
"reward_std": 0.08314304798841476,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9211136996746063,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9669643044471741,
"step": 710
},
{
"completion_length": 222.40625,
"epoch": 2.2720000000000002,
"grad_norm": 2.9711413383483887,
"kl": 0.123779296875,
"learning_rate": 1.125e-07,
"loss": 0.0012,
"reward": 3.9322550296783447,
"reward_std": 0.06140775140374899,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9581426084041595,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9775847494602203,
"step": 711
},
{
"completion_length": 242.6875,
"epoch": 2.2752,
"grad_norm": 6.453571796417236,
"kl": 0.116943359375,
"learning_rate": 1.1125e-07,
"loss": 0.0012,
"reward": 3.874239444732666,
"reward_std": 0.017658520489931107,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.8984209299087524,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9782223105430603,
"step": 712
},
{
"completion_length": 206.0625,
"epoch": 2.2784,
"grad_norm": 2.0138731002807617,
"kl": 0.10205078125,
"learning_rate": 1.0999999999999999e-07,
"loss": 0.001,
"reward": 3.9465200901031494,
"reward_std": 0.01707920106127858,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.950833261013031,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985276162624359,
"step": 713
},
{
"completion_length": 205.90625,
"epoch": 2.2816,
"grad_norm": 1.6215705871582031,
"kl": 0.22216796875,
"learning_rate": 1.0874999999999999e-07,
"loss": 0.0022,
"reward": 3.921483874320984,
"reward_std": 0.017741497606039047,
"rewards/answer_entity_reward": 0.9818618893623352,
"rewards/answer_wer_reward": 0.9403572380542755,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992647171020508,
"step": 714
},
{
"completion_length": 165.53125,
"epoch": 2.2848,
"grad_norm": 2.939443349838257,
"kl": 0.10302734375,
"learning_rate": 1.0749999999999999e-07,
"loss": 0.001,
"reward": 3.8573367595672607,
"reward_std": 0.05941922590136528,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9561320841312408,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9030430614948273,
"step": 715
},
{
"completion_length": 209.34375,
"epoch": 2.288,
"grad_norm": 3.167865753173828,
"kl": 0.098876953125,
"learning_rate": 1.0624999999999999e-07,
"loss": 0.001,
"reward": 3.9200966358184814,
"reward_std": 0.011050965171307325,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9683842360973358,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9517123103141785,
"step": 716
},
{
"completion_length": 211.78125,
"epoch": 2.2912,
"grad_norm": 2.83433198928833,
"kl": 0.157470703125,
"learning_rate": 1.0499999999999999e-07,
"loss": 0.0016,
"reward": 3.888568639755249,
"reward_std": 0.0255763940513134,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9216626286506653,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9669062495231628,
"step": 717
},
{
"completion_length": 233.21875,
"epoch": 2.2944,
"grad_norm": 1.1522959470748901,
"kl": 0.123046875,
"learning_rate": 1.0374999999999999e-07,
"loss": 0.0012,
"reward": 3.9315165281295776,
"reward_std": 0.015323773492127657,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9349887073040009,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 718
},
{
"completion_length": 219.6875,
"epoch": 2.2976,
"grad_norm": 2.8032352924346924,
"kl": 0.097900390625,
"learning_rate": 1.0249999999999998e-07,
"loss": 0.001,
"reward": 3.941191077232361,
"reward_std": 0.014842316508293152,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9449678063392639,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962232708930969,
"step": 719
},
{
"completion_length": 247.75,
"epoch": 2.3008,
"grad_norm": 2.120060682296753,
"kl": 0.10791015625,
"learning_rate": 1.0125e-07,
"loss": 0.0011,
"reward": 3.7576488256454468,
"reward_std": 0.034239969216287136,
"rewards/answer_entity_reward": 0.9883012771606445,
"rewards/answer_wer_reward": 0.9174286723136902,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8519188165664673,
"step": 720
},
{
"completion_length": 148.46875,
"epoch": 2.304,
"grad_norm": 3.453160047531128,
"kl": 0.1357421875,
"learning_rate": 1e-07,
"loss": 0.0014,
"reward": 3.9483038187026978,
"reward_std": 0.010362145490944386,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9624904096126556,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9858134686946869,
"step": 721
},
{
"completion_length": 242.625,
"epoch": 2.3072,
"grad_norm": 1.0787523984909058,
"kl": 0.08740234375,
"learning_rate": 9.875e-08,
"loss": 0.0009,
"reward": 3.8604942560195923,
"reward_std": 0.14663540851324797,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9304596483707428,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9675346612930298,
"step": 722
},
{
"completion_length": 184.15625,
"epoch": 2.3104,
"grad_norm": 2.8213894367218018,
"kl": 0.078857421875,
"learning_rate": 9.749999999999999e-08,
"loss": 0.0008,
"reward": 3.9743396043777466,
"reward_std": 0.007034428184852004,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.975724995136261,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986145198345184,
"step": 723
},
{
"completion_length": 259.78125,
"epoch": 2.3136,
"grad_norm": 1.6101382970809937,
"kl": 0.090087890625,
"learning_rate": 9.624999999999999e-08,
"loss": 0.0009,
"reward": 3.9425781965255737,
"reward_std": 0.012072732672095299,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9443398118019104,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982384443283081,
"step": 724
},
{
"completion_length": 244.0625,
"epoch": 2.3168,
"grad_norm": 6.2361578941345215,
"kl": 0.1103515625,
"learning_rate": 9.499999999999999e-08,
"loss": 0.0011,
"reward": 3.930173873901367,
"reward_std": 0.027357542887330055,
"rewards/answer_entity_reward": 0.9856617748737335,
"rewards/answer_wer_reward": 0.9461617767810822,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983505010604858,
"step": 725
},
{
"completion_length": 201.59375,
"epoch": 2.32,
"grad_norm": 1.4726715087890625,
"kl": 0.09375,
"learning_rate": 9.375e-08,
"loss": 0.0009,
"reward": 3.932676076889038,
"reward_std": 0.018328175880014896,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9490721523761749,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9892857074737549,
"step": 726
},
{
"completion_length": 224.46875,
"epoch": 2.3232,
"grad_norm": 1.8913533687591553,
"kl": 0.10693359375,
"learning_rate": 9.25e-08,
"loss": 0.0011,
"reward": 3.9074403047561646,
"reward_std": 0.03500279039144516,
"rewards/answer_entity_reward": 0.9926948249340057,
"rewards/answer_wer_reward": 0.9156512916088104,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990941882133484,
"step": 727
},
{
"completion_length": 193.96875,
"epoch": 2.3264,
"grad_norm": 3.589576244354248,
"kl": 0.095458984375,
"learning_rate": 9.125e-08,
"loss": 0.001,
"reward": 3.9219532012939453,
"reward_std": 0.018014353699982166,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9440249502658844,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9807692468166351,
"step": 728
},
{
"completion_length": 178.4375,
"epoch": 2.3296,
"grad_norm": 1.4839043617248535,
"kl": 0.125244140625,
"learning_rate": 9e-08,
"loss": 0.0013,
"reward": 3.8304929733276367,
"reward_std": 0.009818047750741243,
"rewards/answer_entity_reward": 0.9844697117805481,
"rewards/answer_wer_reward": 0.9768873453140259,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.869135856628418,
"step": 729
},
{
"completion_length": 199.28125,
"epoch": 2.3327999999999998,
"grad_norm": 1.497478723526001,
"kl": 0.08642578125,
"learning_rate": 8.875e-08,
"loss": 0.0009,
"reward": 3.9690757989883423,
"reward_std": 0.009865536354482174,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9690757989883423,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 730
},
{
"completion_length": 220.40625,
"epoch": 2.336,
"grad_norm": 5.609241485595703,
"kl": 0.0966796875,
"learning_rate": 8.75e-08,
"loss": 0.001,
"reward": 3.935381293296814,
"reward_std": 0.037938917987048626,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9465770721435547,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9888042211532593,
"step": 731
},
{
"completion_length": 215.75,
"epoch": 2.3392,
"grad_norm": 3.496508836746216,
"kl": 0.13134765625,
"learning_rate": 8.625e-08,
"loss": 0.0013,
"reward": 3.8224092721939087,
"reward_std": 0.02808304876089096,
"rewards/answer_entity_reward": 0.9923513829708099,
"rewards/answer_wer_reward": 0.9497494399547577,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8803083300590515,
"step": 732
},
{
"completion_length": 230.78125,
"epoch": 2.3424,
"grad_norm": 27.852195739746094,
"kl": 0.087890625,
"learning_rate": 8.500000000000001e-08,
"loss": 0.0009,
"reward": 3.8288865089416504,
"reward_std": 0.01470271497964859,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9576314091682434,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8736589848995209,
"step": 733
},
{
"completion_length": 241.9375,
"epoch": 2.3456,
"grad_norm": 3.033336639404297,
"kl": 0.1015625,
"learning_rate": 8.375e-08,
"loss": 0.001,
"reward": 3.8371338844299316,
"reward_std": 0.060717299580574036,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8998311161994934,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9373026490211487,
"step": 734
},
{
"completion_length": 235.53125,
"epoch": 2.3487999999999998,
"grad_norm": 1.6953455209732056,
"kl": 0.094970703125,
"learning_rate": 8.25e-08,
"loss": 0.001,
"reward": 3.9114056825637817,
"reward_std": 0.03203952219337225,
"rewards/answer_entity_reward": 0.9862325191497803,
"rewards/answer_wer_reward": 0.9290111660957336,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9961620271205902,
"step": 735
},
{
"completion_length": 170.59375,
"epoch": 2.352,
"grad_norm": 3.9929087162017822,
"kl": 0.096923828125,
"learning_rate": 8.125e-08,
"loss": 0.001,
"reward": 3.7566089630126953,
"reward_std": 0.029996749013662338,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8621053397655487,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8945035934448242,
"step": 736
},
{
"completion_length": 233.125,
"epoch": 2.3552,
"grad_norm": 4.515742301940918,
"kl": 0.129638671875,
"learning_rate": 8e-08,
"loss": 0.0013,
"reward": 3.9159114360809326,
"reward_std": 0.03965392196550965,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9617535173892975,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9576301276683807,
"step": 737
},
{
"completion_length": 202.5,
"epoch": 2.3584,
"grad_norm": 3.593953847885132,
"kl": 0.107177734375,
"learning_rate": 7.875e-08,
"loss": 0.0011,
"reward": 3.9383411407470703,
"reward_std": 0.030629536136984825,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9511449038982391,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.994140625,
"step": 738
},
{
"completion_length": 204.5,
"epoch": 2.3616,
"grad_norm": 1.8713083267211914,
"kl": 0.099365234375,
"learning_rate": 7.75e-08,
"loss": 0.001,
"reward": 3.9490264654159546,
"reward_std": 0.017966313287615776,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.95371875166893,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997718930244446,
"step": 739
},
{
"completion_length": 240.84375,
"epoch": 2.3648,
"grad_norm": 1.2076594829559326,
"kl": 0.087646484375,
"learning_rate": 7.625e-08,
"loss": 0.0009,
"reward": 3.9529651403427124,
"reward_std": 0.00970834819599986,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.95296511054039,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 740
},
{
"completion_length": 245.0,
"epoch": 2.368,
"grad_norm": 0.9895936846733093,
"kl": 0.10205078125,
"learning_rate": 7.5e-08,
"loss": 0.001,
"reward": 3.9112091064453125,
"reward_std": 0.01916833221912384,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9262779057025909,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9953478574752808,
"step": 741
},
{
"completion_length": 240.09375,
"epoch": 2.3712,
"grad_norm": 2.48711895942688,
"kl": 0.076171875,
"learning_rate": 7.375e-08,
"loss": 0.0008,
"reward": 3.942033529281616,
"reward_std": 0.015272341668605804,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9465188384056091,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983557760715485,
"step": 742
},
{
"completion_length": 201.78125,
"epoch": 2.3744,
"grad_norm": 2.5322351455688477,
"kl": 0.103271484375,
"learning_rate": 7.25e-08,
"loss": 0.001,
"reward": 3.903318166732788,
"reward_std": 0.024559098295867443,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9573764503002167,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9459417462348938,
"step": 743
},
{
"completion_length": 176.8125,
"epoch": 2.3776,
"grad_norm": 10.369518280029297,
"kl": 0.10986328125,
"learning_rate": 7.124999999999999e-08,
"loss": 0.0011,
"reward": 3.9451266527175903,
"reward_std": 0.008970791008323431,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.953162282705307,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9919642806053162,
"step": 744
},
{
"completion_length": 230.625,
"epoch": 2.3808,
"grad_norm": 1.5272488594055176,
"kl": 0.09130859375,
"learning_rate": 7e-08,
"loss": 0.0009,
"reward": 3.9410911798477173,
"reward_std": 0.017650599591434002,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9447437524795532,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991883039474487,
"step": 745
},
{
"completion_length": 193.3125,
"epoch": 2.384,
"grad_norm": 2.9624199867248535,
"kl": 0.165771484375,
"learning_rate": 6.875e-08,
"loss": 0.0017,
"reward": 3.8940484523773193,
"reward_std": 0.060107991099357605,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9106853604316711,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9878502786159515,
"step": 746
},
{
"completion_length": 216.34375,
"epoch": 2.3872,
"grad_norm": 1.623085379600525,
"kl": 0.08544921875,
"learning_rate": 6.75e-08,
"loss": 0.0009,
"reward": 3.9699491262435913,
"reward_std": 0.016816058196127415,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9752996861934662,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9946492910385132,
"step": 747
},
{
"completion_length": 202.125,
"epoch": 2.3904,
"grad_norm": 1.5331361293792725,
"kl": 0.12841796875,
"learning_rate": 6.625e-08,
"loss": 0.0013,
"reward": 3.922391891479492,
"reward_std": 0.008891359670087695,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9298486709594727,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9925432503223419,
"step": 748
},
{
"completion_length": 242.6875,
"epoch": 2.3936,
"grad_norm": 1.3326294422149658,
"kl": 0.095947265625,
"learning_rate": 6.5e-08,
"loss": 0.001,
"reward": 3.95425808429718,
"reward_std": 0.009861439000815153,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9546802639961243,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995777010917664,
"step": 749
},
{
"completion_length": 175.90625,
"epoch": 2.3968,
"grad_norm": 0.9046992063522339,
"kl": 0.112060546875,
"learning_rate": 6.375e-08,
"loss": 0.0011,
"reward": 3.977583885192871,
"reward_std": 0.006871582940220833,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9775838255882263,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 750
},
{
"completion_length": 215.59375,
"epoch": 2.4,
"grad_norm": 3.0961620807647705,
"kl": 0.089111328125,
"learning_rate": 6.25e-08,
"loss": 0.0009,
"reward": 3.9490163326263428,
"reward_std": 0.012763194739818573,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9750434756278992,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9768137633800507,
"step": 751
},
{
"completion_length": 239.84375,
"epoch": 2.4032,
"grad_norm": 0.9473263621330261,
"kl": 0.102783203125,
"learning_rate": 6.125e-08,
"loss": 0.001,
"reward": 3.953715443611145,
"reward_std": 0.018719897605478764,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9566626846790314,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999456524848938,
"step": 752
},
{
"completion_length": 177.09375,
"epoch": 2.4064,
"grad_norm": 0.7227364182472229,
"kl": 0.115234375,
"learning_rate": 6e-08,
"loss": 0.0012,
"reward": 3.9009724855422974,
"reward_std": 0.1057232718449086,
"rewards/answer_entity_reward": 0.96875,
"rewards/answer_wer_reward": 0.9504120945930481,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.981810450553894,
"step": 753
},
{
"completion_length": 215.96875,
"epoch": 2.4096,
"grad_norm": 3.616448163986206,
"kl": 0.103759765625,
"learning_rate": 5.8749999999999993e-08,
"loss": 0.001,
"reward": 3.936669707298279,
"reward_std": 0.01544360350817442,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9422976672649384,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978442788124084,
"step": 754
},
{
"completion_length": 222.03125,
"epoch": 2.4128,
"grad_norm": 5.449378967285156,
"kl": 0.08203125,
"learning_rate": 5.75e-08,
"loss": 0.0008,
"reward": 3.9294843673706055,
"reward_std": 0.05806633085012436,
"rewards/answer_entity_reward": 0.9763257801532745,
"rewards/answer_wer_reward": 0.9556067883968353,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975519478321075,
"step": 755
},
{
"completion_length": 200.3125,
"epoch": 2.416,
"grad_norm": 2.46901798248291,
"kl": 0.119873046875,
"learning_rate": 5.625e-08,
"loss": 0.0012,
"reward": 3.888831377029419,
"reward_std": 0.015442279167473316,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9264732301235199,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9651989638805389,
"step": 756
},
{
"completion_length": 237.21875,
"epoch": 2.4192,
"grad_norm": 1.3007749319076538,
"kl": 0.095947265625,
"learning_rate": 5.4999999999999996e-08,
"loss": 0.001,
"reward": 3.933607816696167,
"reward_std": 0.023877738043665886,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9380720853805542,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 757
},
{
"completion_length": 151.78125,
"epoch": 2.4224,
"grad_norm": 0.7467179894447327,
"kl": 0.099853515625,
"learning_rate": 5.3749999999999995e-08,
"loss": 0.001,
"reward": 3.9564812183380127,
"reward_std": 0.004365669563412666,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9648145437240601,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 758
},
{
"completion_length": 232.125,
"epoch": 2.4256,
"grad_norm": 1.5784400701522827,
"kl": 0.1041259765625,
"learning_rate": 5.2499999999999994e-08,
"loss": 0.001,
"reward": 3.9412447214126587,
"reward_std": 0.02170270448550582,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9489176869392395,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9967913925647736,
"step": 759
},
{
"completion_length": 215.4375,
"epoch": 2.4288,
"grad_norm": 3.9008543491363525,
"kl": 0.1298828125,
"learning_rate": 5.124999999999999e-08,
"loss": 0.0013,
"reward": 3.8311843872070312,
"reward_std": 0.05369440279901028,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9393357634544373,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8918486833572388,
"step": 760
},
{
"completion_length": 219.65625,
"epoch": 2.432,
"grad_norm": 4.4970526695251465,
"kl": 0.09765625,
"learning_rate": 5e-08,
"loss": 0.001,
"reward": 3.9511146545410156,
"reward_std": 0.01855921559035778,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9531445503234863,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99980828166008,
"step": 761
},
{
"completion_length": 210.21875,
"epoch": 2.4352,
"grad_norm": 0.9267875552177429,
"kl": 0.096923828125,
"learning_rate": 4.8749999999999996e-08,
"loss": 0.001,
"reward": 3.930490016937256,
"reward_std": 0.013515972066670656,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9386539459228516,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987804591655731,
"step": 762
},
{
"completion_length": 196.625,
"epoch": 2.4384,
"grad_norm": 2.2344725131988525,
"kl": 0.1025390625,
"learning_rate": 4.7499999999999995e-08,
"loss": 0.001,
"reward": 3.9080734252929688,
"reward_std": 0.035708663053810596,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9325708150863647,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9779064655303955,
"step": 763
},
{
"completion_length": 225.09375,
"epoch": 2.4416,
"grad_norm": 1.588053822517395,
"kl": 0.095947265625,
"learning_rate": 4.625e-08,
"loss": 0.001,
"reward": 3.9343831539154053,
"reward_std": 0.016630763188004494,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9485695362091064,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9892857074737549,
"step": 764
},
{
"completion_length": 247.09375,
"epoch": 2.4448,
"grad_norm": 1.1707122325897217,
"kl": 0.09228515625,
"learning_rate": 4.5e-08,
"loss": 0.0009,
"reward": 3.8900914192199707,
"reward_std": 0.06134997680783272,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.908464640378952,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9837101101875305,
"step": 765
},
{
"completion_length": 241.65625,
"epoch": 2.448,
"grad_norm": 2.8273398876190186,
"kl": 0.110595703125,
"learning_rate": 4.375e-08,
"loss": 0.0011,
"reward": 3.890642285346985,
"reward_std": 0.021557598374783993,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.8973233997821808,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9949637055397034,
"step": 766
},
{
"completion_length": 216.96875,
"epoch": 2.4512,
"grad_norm": 1.1206011772155762,
"kl": 0.095947265625,
"learning_rate": 4.2500000000000003e-08,
"loss": 0.001,
"reward": 3.9437450170516968,
"reward_std": 0.008607666241005063,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9612680077552795,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9824769496917725,
"step": 767
},
{
"completion_length": 230.3125,
"epoch": 2.4544,
"grad_norm": 15.688488960266113,
"kl": 0.085693359375,
"learning_rate": 4.125e-08,
"loss": 0.0009,
"reward": 3.9394757747650146,
"reward_std": 0.030962621793150902,
"rewards/answer_entity_reward": 0.9806547462940216,
"rewards/answer_wer_reward": 0.9617869853973389,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970340430736542,
"step": 768
},
{
"completion_length": 248.46875,
"epoch": 2.4576000000000002,
"grad_norm": 1.5618577003479004,
"kl": 0.16162109375,
"learning_rate": 4e-08,
"loss": 0.0016,
"reward": 3.9416744709014893,
"reward_std": 0.016101540066301823,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9428056180477142,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988687336444855,
"step": 769
},
{
"completion_length": 234.0625,
"epoch": 2.4608,
"grad_norm": 3.257962226867676,
"kl": 0.16259765625,
"learning_rate": 3.875e-08,
"loss": 0.0016,
"reward": 3.9221439361572266,
"reward_std": 0.02909655123949051,
"rewards/answer_entity_reward": 0.9871794581413269,
"rewards/answer_wer_reward": 0.9352968335151672,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996675550937653,
"step": 770
},
{
"completion_length": 245.78125,
"epoch": 2.464,
"grad_norm": 2.2879505157470703,
"kl": 0.083251953125,
"learning_rate": 3.75e-08,
"loss": 0.0008,
"reward": 3.9263609647750854,
"reward_std": 0.022196561098098755,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9454439282417297,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9864316880702972,
"step": 771
},
{
"completion_length": 158.78125,
"epoch": 2.4672,
"grad_norm": 2.214250087738037,
"kl": 0.1328125,
"learning_rate": 3.625e-08,
"loss": 0.0013,
"reward": 3.883350372314453,
"reward_std": 0.04219530359841883,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9803332090377808,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9030172228813171,
"step": 772
},
{
"completion_length": 229.25,
"epoch": 2.4704,
"grad_norm": 1.8548256158828735,
"kl": 0.10205078125,
"learning_rate": 3.5e-08,
"loss": 0.001,
"reward": 3.9539661407470703,
"reward_std": 0.011240935884416103,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9542403221130371,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999725878238678,
"step": 773
},
{
"completion_length": 227.9375,
"epoch": 2.4736000000000002,
"grad_norm": 2.2110090255737305,
"kl": 0.0927734375,
"learning_rate": 3.375e-08,
"loss": 0.0009,
"reward": 3.9344996213912964,
"reward_std": 0.011312551097944379,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9557085335254669,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9787910580635071,
"step": 774
},
{
"completion_length": 250.9375,
"epoch": 2.4768,
"grad_norm": 25.519304275512695,
"kl": 0.1328125,
"learning_rate": 3.25e-08,
"loss": 0.0013,
"reward": 3.915758967399597,
"reward_std": 0.015426212921738625,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9174197912216187,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983391761779785,
"step": 775
},
{
"completion_length": 223.65625,
"epoch": 2.48,
"grad_norm": 3.6137807369232178,
"kl": 0.115966796875,
"learning_rate": 3.125e-08,
"loss": 0.0012,
"reward": 3.939508318901062,
"reward_std": 0.00902418838813901,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9407406747341156,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987677037715912,
"step": 776
},
{
"completion_length": 248.0,
"epoch": 2.4832,
"grad_norm": 1.4470294713974,
"kl": 0.16015625,
"learning_rate": 3e-08,
"loss": 0.0016,
"reward": 3.8835391998291016,
"reward_std": 0.029840022325515747,
"rewards/answer_entity_reward": 0.9926948249340057,
"rewards/answer_wer_reward": 0.9006942212581635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9901500642299652,
"step": 777
},
{
"completion_length": 174.3125,
"epoch": 2.4864,
"grad_norm": 2.8671512603759766,
"kl": 0.12353515625,
"learning_rate": 2.875e-08,
"loss": 0.0012,
"reward": 3.9390430450439453,
"reward_std": 0.029761829413473606,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9475694894790649,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938772618770599,
"step": 778
},
{
"completion_length": 217.0,
"epoch": 2.4896,
"grad_norm": 1.7183799743652344,
"kl": 0.095458984375,
"learning_rate": 2.7499999999999998e-08,
"loss": 0.001,
"reward": 3.922086715698242,
"reward_std": 0.01339792925864458,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9282321929931641,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938544631004333,
"step": 779
},
{
"completion_length": 205.09375,
"epoch": 2.4928,
"grad_norm": 2.424999475479126,
"kl": 0.102294921875,
"learning_rate": 2.6249999999999997e-08,
"loss": 0.001,
"reward": 3.944626212120056,
"reward_std": 0.038148084189742804,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9640980660915375,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9805281758308411,
"step": 780
},
{
"completion_length": 220.6875,
"epoch": 2.496,
"grad_norm": 1.739138126373291,
"kl": 0.09765625,
"learning_rate": 2.5e-08,
"loss": 0.001,
"reward": 3.943056583404541,
"reward_std": 0.025130684953182936,
"rewards/answer_entity_reward": 0.9871794581413269,
"rewards/answer_wer_reward": 0.9561585485935211,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997184872627258,
"step": 781
},
{
"completion_length": 199.375,
"epoch": 2.4992,
"grad_norm": 1.3409775495529175,
"kl": 0.083984375,
"learning_rate": 2.3749999999999998e-08,
"loss": 0.0008,
"reward": 3.9247756004333496,
"reward_std": 0.021664155647158623,
"rewards/answer_entity_reward": 0.9902146756649017,
"rewards/answer_wer_reward": 0.9345609843730927,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 782
},
{
"completion_length": 221.28125,
"epoch": 2.5023999999999997,
"grad_norm": 1.9740352630615234,
"kl": 0.099853515625,
"learning_rate": 2.25e-08,
"loss": 0.001,
"reward": 3.955259919166565,
"reward_std": 0.010415108175948262,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9565965533256531,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986633956432343,
"step": 783
},
{
"completion_length": 235.40625,
"epoch": 2.5056000000000003,
"grad_norm": 7.616406440734863,
"kl": 0.144287109375,
"learning_rate": 2.1250000000000002e-08,
"loss": 0.0014,
"reward": 3.9511306285858154,
"reward_std": 0.011523132212460041,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9639480412006378,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9871825873851776,
"step": 784
},
{
"completion_length": 205.84375,
"epoch": 2.5088,
"grad_norm": 3.1992883682250977,
"kl": 0.107421875,
"learning_rate": 2e-08,
"loss": 0.0011,
"reward": 3.92184841632843,
"reward_std": 0.016722742468118668,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9461718797683716,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9756765961647034,
"step": 785
},
{
"completion_length": 225.6875,
"epoch": 2.512,
"grad_norm": 1.2884989976882935,
"kl": 0.139404296875,
"learning_rate": 1.875e-08,
"loss": 0.0014,
"reward": 3.946265697479248,
"reward_std": 0.017564056208357215,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9507300853729248,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 786
},
{
"completion_length": 176.96875,
"epoch": 2.5152,
"grad_norm": 3.3580868244171143,
"kl": 0.1982421875,
"learning_rate": 1.75e-08,
"loss": 0.002,
"reward": 3.8963418006896973,
"reward_std": 0.04480761382728815,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9633896946907043,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9329521358013153,
"step": 787
},
{
"completion_length": 260.28125,
"epoch": 2.5183999999999997,
"grad_norm": 1.0715585947036743,
"kl": 0.105712890625,
"learning_rate": 1.625e-08,
"loss": 0.0011,
"reward": 3.904552698135376,
"reward_std": 0.034514338709414005,
"rewards/answer_entity_reward": 0.9831239283084869,
"rewards/answer_wer_reward": 0.9217879772186279,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996408224105835,
"step": 788
},
{
"completion_length": 251.625,
"epoch": 2.5216,
"grad_norm": 3.5006961822509766,
"kl": 0.080322265625,
"learning_rate": 1.5e-08,
"loss": 0.0008,
"reward": 3.9146039485931396,
"reward_std": 0.028964843600988388,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9146038293838501,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 789
},
{
"completion_length": 201.0625,
"epoch": 2.5248,
"grad_norm": 5.0292534828186035,
"kl": 0.1396484375,
"learning_rate": 1.3749999999999999e-08,
"loss": 0.0014,
"reward": 3.897321939468384,
"reward_std": 0.01521459873765707,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9714652001857758,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9258567690849304,
"step": 790
},
{
"completion_length": 185.03125,
"epoch": 2.528,
"grad_norm": 2.234839916229248,
"kl": 0.1005859375,
"learning_rate": 1.25e-08,
"loss": 0.001,
"reward": 3.930277109146118,
"reward_std": 0.026013732887804508,
"rewards/answer_entity_reward": 0.9908459782600403,
"rewards/answer_wer_reward": 0.9424907863140106,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969403147697449,
"step": 791
},
{
"completion_length": 185.84375,
"epoch": 2.5312,
"grad_norm": 0.5959092974662781,
"kl": 0.0947265625,
"learning_rate": 1.125e-08,
"loss": 0.0009,
"reward": 3.954566478729248,
"reward_std": 0.011145764729008079,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9578571021556854,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995503425598145,
"step": 792
},
{
"completion_length": 197.84375,
"epoch": 2.5343999999999998,
"grad_norm": 2.0784664154052734,
"kl": 0.114501953125,
"learning_rate": 1e-08,
"loss": 0.0011,
"reward": 3.885765790939331,
"reward_std": 0.012588209472596645,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9541250765323639,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.931640625,
"step": 793
},
{
"completion_length": 190.03125,
"epoch": 2.5376,
"grad_norm": 1.7104955911636353,
"kl": 0.224609375,
"learning_rate": 8.75e-09,
"loss": 0.0022,
"reward": 3.824442148208618,
"reward_std": 0.039704530499875546,
"rewards/answer_entity_reward": 0.9877451062202454,
"rewards/answer_wer_reward": 0.919477641582489,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9172193706035614,
"step": 794
},
{
"completion_length": 221.4375,
"epoch": 2.5408,
"grad_norm": 2.524031162261963,
"kl": 0.09521484375,
"learning_rate": 7.5e-09,
"loss": 0.001,
"reward": 3.9210238456726074,
"reward_std": 0.03186593018472195,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9480306208133698,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9729932844638824,
"step": 795
},
{
"completion_length": 149.9375,
"epoch": 2.544,
"grad_norm": 2.592532157897949,
"kl": 0.116943359375,
"learning_rate": 6.25e-09,
"loss": 0.0012,
"reward": 3.835923910140991,
"reward_std": 0.016047589480876923,
"rewards/answer_entity_reward": 0.9942555129528046,
"rewards/answer_wer_reward": 0.8419776558876038,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999690592288971,
"step": 796
},
{
"completion_length": 196.3125,
"epoch": 2.5472,
"grad_norm": 1.1898647546768188,
"kl": 0.0810546875,
"learning_rate": 5e-09,
"loss": 0.0008,
"reward": 3.9655500650405884,
"reward_std": 0.012615942629054189,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9718000292778015,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 797
},
{
"completion_length": 220.3125,
"epoch": 2.5504,
"grad_norm": 1.6702154874801636,
"kl": 0.093505859375,
"learning_rate": 3.75e-09,
"loss": 0.0009,
"reward": 3.945963501930237,
"reward_std": 0.007131826248951256,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9533165395259857,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 798
},
{
"completion_length": 223.65625,
"epoch": 2.5536,
"grad_norm": 46.13692855834961,
"kl": 0.11083984375,
"learning_rate": 2.5e-09,
"loss": 0.0011,
"reward": 3.8597129583358765,
"reward_std": 0.09108205512166023,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9162732660770416,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9770934879779816,
"step": 799
},
{
"completion_length": 233.84375,
"epoch": 2.5568,
"grad_norm": 1.1842632293701172,
"kl": 0.108642578125,
"learning_rate": 1.25e-09,
"loss": 0.0011,
"reward": 3.9367305040359497,
"reward_std": 0.01876719295978546,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9404171705245972,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983966648578644,
"step": 800
}
],
"logging_steps": 1,
"max_steps": 800,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}