{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5568, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 220.40625, "epoch": 0.0032, "grad_norm": 11.881386756896973, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 2.0222461223602295, "reward_std": 1.2291262745857239, "rewards/answer_entity_reward": 0.5891842544078827, "rewards/answer_wer_reward": 0.36776189506053925, "rewards/format_reward": 0.46875, "rewards/think_ocr_reward": 0.596549928188324, "step": 1 }, { "completion_length": 183.75, "epoch": 0.0064, "grad_norm": 14.301155090332031, "kl": 0.000579833984375, "learning_rate": 9.9875e-07, "loss": 0.0, "reward": 2.1407116651535034, "reward_std": 0.9154457449913025, "rewards/answer_entity_reward": 0.7417342960834503, "rewards/answer_wer_reward": 0.4293617159128189, "rewards/format_reward": 0.59375, "rewards/think_ocr_reward": 0.3758656233549118, "step": 2 }, { "completion_length": 185.09375, "epoch": 0.0096, "grad_norm": 7.90402889251709, "kl": 0.0025768280029296875, "learning_rate": 9.975e-07, "loss": 0.0, "reward": 2.4301702976226807, "reward_std": 1.0761558413505554, "rewards/answer_entity_reward": 0.7529265582561493, "rewards/answer_wer_reward": 0.45110173523426056, "rewards/format_reward": 0.6875, "rewards/think_ocr_reward": 0.5386419892311096, "step": 3 }, { "completion_length": 201.46875, "epoch": 0.0128, "grad_norm": 2.4371554851531982, "kl": 0.0039825439453125, "learning_rate": 9.9625e-07, "loss": 0.0, "reward": 2.4960588216781616, "reward_std": 1.0011246800422668, "rewards/answer_entity_reward": 0.6945474743843079, "rewards/answer_wer_reward": 0.626116082072258, "rewards/format_reward": 0.65625, "rewards/think_ocr_reward": 0.519145280122757, "step": 4 }, { "completion_length": 223.1875, "epoch": 0.016, "grad_norm": 3.092437982559204, "kl": 0.001644134521484375, "learning_rate": 9.95e-07, "loss": 0.0, "reward": 2.6151310205459595, "reward_std": 1.0057614743709564, "rewards/answer_entity_reward": 0.6729370057582855, "rewards/answer_wer_reward": 0.43601465225219727, "rewards/format_reward": 0.75, "rewards/think_ocr_reward": 0.7561794817447662, "step": 5 }, { "completion_length": 211.09375, "epoch": 0.0192, "grad_norm": 3.8149898052215576, "kl": 0.00344085693359375, "learning_rate": 9.9375e-07, "loss": 0.0, "reward": 2.601198673248291, "reward_std": 0.8605955541133881, "rewards/answer_entity_reward": 0.6944940388202667, "rewards/answer_wer_reward": 0.5194687843322754, "rewards/format_reward": 0.71875, "rewards/think_ocr_reward": 0.6684857904911041, "step": 6 }, { "completion_length": 210.8125, "epoch": 0.0224, "grad_norm": 2.000467300415039, "kl": 0.0030364990234375, "learning_rate": 9.925e-07, "loss": 0.0, "reward": 3.1113568544387817, "reward_std": 0.928675651550293, "rewards/answer_entity_reward": 0.8195368647575378, "rewards/answer_wer_reward": 0.7422276139259338, "rewards/format_reward": 0.75, "rewards/think_ocr_reward": 0.7995923757553101, "step": 7 }, { "completion_length": 240.375, "epoch": 0.0256, "grad_norm": 2.2319533824920654, "kl": 0.0052947998046875, "learning_rate": 9.912499999999998e-07, "loss": 0.0001, "reward": 3.217132568359375, "reward_std": 0.4984496384859085, "rewards/answer_entity_reward": 0.7789974808692932, "rewards/answer_wer_reward": 0.6678729355335236, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8015121817588806, "step": 8 }, { "completion_length": 217.1875, "epoch": 0.0288, "grad_norm": 2.6002566814422607, "kl": 0.06464385986328125, "learning_rate": 9.9e-07, "loss": 0.0006, "reward": 3.217494249343872, "reward_std": 0.5446330606937408, "rewards/answer_entity_reward": 0.8213226199150085, "rewards/answer_wer_reward": 0.7331169545650482, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.6943045258522034, "step": 9 }, { "completion_length": 196.40625, "epoch": 0.032, "grad_norm": 2.9925193786621094, "kl": 0.008941650390625, "learning_rate": 9.8875e-07, "loss": 0.0001, "reward": 3.2711292505264282, "reward_std": 0.5466351807117462, "rewards/answer_entity_reward": 0.7905315160751343, "rewards/answer_wer_reward": 0.7206964790821075, "rewards/format_reward": 0.90625, "rewards/think_ocr_reward": 0.853651225566864, "step": 10 }, { "completion_length": 146.53125, "epoch": 0.0352, "grad_norm": 3.6174111366271973, "kl": 0.0103912353515625, "learning_rate": 9.875e-07, "loss": 0.0001, "reward": 3.083841323852539, "reward_std": 0.6508071422576904, "rewards/answer_entity_reward": 0.7979910671710968, "rewards/answer_wer_reward": 0.6100275814533234, "rewards/format_reward": 0.90625, "rewards/think_ocr_reward": 0.7695727646350861, "step": 11 }, { "completion_length": 218.15625, "epoch": 0.0384, "grad_norm": 3.2925424575805664, "kl": 0.00616455078125, "learning_rate": 9.862499999999999e-07, "loss": 0.0001, "reward": 3.2391178607940674, "reward_std": 0.6323770582675934, "rewards/answer_entity_reward": 0.781956285238266, "rewards/answer_wer_reward": 0.6958223879337311, "rewards/format_reward": 0.90625, "rewards/think_ocr_reward": 0.8550890386104584, "step": 12 }, { "completion_length": 250.53125, "epoch": 0.0416, "grad_norm": 2.291048288345337, "kl": 0.0086669921875, "learning_rate": 9.849999999999999e-07, "loss": 0.0001, "reward": 3.238759756088257, "reward_std": 0.4200912415981293, "rewards/answer_entity_reward": 0.8185493648052216, "rewards/answer_wer_reward": 0.699150562286377, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.7835597395896912, "step": 13 }, { "completion_length": 196.6875, "epoch": 0.0448, "grad_norm": 2.470576524734497, "kl": 0.0181884765625, "learning_rate": 9.8375e-07, "loss": 0.0002, "reward": 3.460441470146179, "reward_std": 0.34273722767829895, "rewards/answer_entity_reward": 0.9129322171211243, "rewards/answer_wer_reward": 0.7192246317863464, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8595346808433533, "step": 14 }, { "completion_length": 181.78125, "epoch": 0.048, "grad_norm": 13.122944831848145, "kl": 0.0174560546875, "learning_rate": 9.825e-07, "loss": 0.0002, "reward": 3.526148796081543, "reward_std": 0.2207299917936325, "rewards/answer_entity_reward": 0.8908324241638184, "rewards/answer_wer_reward": 0.8109035789966583, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8556627035140991, "step": 15 }, { "completion_length": 181.9375, "epoch": 0.0512, "grad_norm": 3.1282718181610107, "kl": 0.0081329345703125, "learning_rate": 9.8125e-07, "loss": 0.0001, "reward": 3.4612035751342773, "reward_std": 0.2798766866326332, "rewards/answer_entity_reward": 0.8926167786121368, "rewards/answer_wer_reward": 0.6810254156589508, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9188113510608673, "step": 16 }, { "completion_length": 243.125, "epoch": 0.0544, "grad_norm": 1.907029390335083, "kl": 0.00677490234375, "learning_rate": 9.8e-07, "loss": 0.0001, "reward": 3.375656485557556, "reward_std": 0.37908758223056793, "rewards/answer_entity_reward": 0.8232844769954681, "rewards/answer_wer_reward": 0.6466233134269714, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9369988143444061, "step": 17 }, { "completion_length": 236.34375, "epoch": 0.0576, "grad_norm": 2.551098108291626, "kl": 0.0098876953125, "learning_rate": 9.7875e-07, "loss": 0.0001, "reward": 3.637453317642212, "reward_std": 0.1572738140821457, "rewards/answer_entity_reward": 0.8815866112709045, "rewards/answer_wer_reward": 0.8101728856563568, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9456938207149506, "step": 18 }, { "completion_length": 242.28125, "epoch": 0.0608, "grad_norm": 3.0685667991638184, "kl": 0.010223388671875, "learning_rate": 9.775e-07, "loss": 0.0001, "reward": 3.3409019708633423, "reward_std": 0.3057943657040596, "rewards/answer_entity_reward": 0.7610115706920624, "rewards/answer_wer_reward": 0.6856433153152466, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8942470550537109, "step": 19 }, { "completion_length": 193.46875, "epoch": 0.064, "grad_norm": 2.6569221019744873, "kl": 0.0095977783203125, "learning_rate": 9.7625e-07, "loss": 0.0001, "reward": 3.5098860263824463, "reward_std": 0.27671176940202713, "rewards/answer_entity_reward": 0.8399666249752045, "rewards/answer_wer_reward": 0.7382143139839172, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9317050278186798, "step": 20 }, { "completion_length": 199.28125, "epoch": 0.0672, "grad_norm": 3.02462100982666, "kl": 0.0101318359375, "learning_rate": 9.75e-07, "loss": 0.0001, "reward": 3.552868962287903, "reward_std": 0.24761613458395004, "rewards/answer_entity_reward": 0.9026052951812744, "rewards/answer_wer_reward": 0.7746964991092682, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8755670785903931, "step": 21 }, { "completion_length": 239.75, "epoch": 0.0704, "grad_norm": 5.65736722946167, "kl": 0.010223388671875, "learning_rate": 9.7375e-07, "loss": 0.0001, "reward": 3.3219141960144043, "reward_std": 0.32601839303970337, "rewards/answer_entity_reward": 0.8810833096504211, "rewards/answer_wer_reward": 0.6434947550296783, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.7973361015319824, "step": 22 }, { "completion_length": 216.21875, "epoch": 0.0736, "grad_norm": 6.68402099609375, "kl": 0.009765625, "learning_rate": 9.725e-07, "loss": 0.0001, "reward": 3.67569899559021, "reward_std": 0.19380945712327957, "rewards/answer_entity_reward": 0.9180394113063812, "rewards/answer_wer_reward": 0.8205302953720093, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9371293187141418, "step": 23 }, { "completion_length": 200.65625, "epoch": 0.0768, "grad_norm": 3.398916006088257, "kl": 0.0118408203125, "learning_rate": 9.712499999999998e-07, "loss": 0.0001, "reward": 3.575831174850464, "reward_std": 0.22907962650060654, "rewards/answer_entity_reward": 0.9015873074531555, "rewards/answer_wer_reward": 0.8195928931236267, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8546508848667145, "step": 24 }, { "completion_length": 144.9375, "epoch": 0.08, "grad_norm": 3.852799415588379, "kl": 0.025146484375, "learning_rate": 9.7e-07, "loss": 0.0003, "reward": 3.596950054168701, "reward_std": 0.29281121492385864, "rewards/answer_entity_reward": 0.9606508314609528, "rewards/answer_wer_reward": 0.7530401945114136, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9457589387893677, "step": 25 }, { "completion_length": 201.375, "epoch": 0.0832, "grad_norm": 3.684136390686035, "kl": 0.03955078125, "learning_rate": 9.6875e-07, "loss": 0.0004, "reward": 3.6101993322372437, "reward_std": 0.22506854683160782, "rewards/answer_entity_reward": 0.8913510143756866, "rewards/answer_wer_reward": 0.855983167886734, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8628652393817902, "step": 26 }, { "completion_length": 235.25, "epoch": 0.0864, "grad_norm": 2.9537627696990967, "kl": 0.0134124755859375, "learning_rate": 9.675e-07, "loss": 0.0001, "reward": 3.579669713973999, "reward_std": 0.17270359210669994, "rewards/answer_entity_reward": 0.8651459515094757, "rewards/answer_wer_reward": 0.7930598855018616, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9214637279510498, "step": 27 }, { "completion_length": 199.96875, "epoch": 0.0896, "grad_norm": 2.0981569290161133, "kl": 0.02239990234375, "learning_rate": 9.6625e-07, "loss": 0.0002, "reward": 3.589198589324951, "reward_std": 0.2977752536535263, "rewards/answer_entity_reward": 0.8878033757209778, "rewards/answer_wer_reward": 0.8114102184772491, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.889985203742981, "step": 28 }, { "completion_length": 229.6875, "epoch": 0.0928, "grad_norm": 2.406397819519043, "kl": 0.0191650390625, "learning_rate": 9.649999999999999e-07, "loss": 0.0002, "reward": 3.4348872900009155, "reward_std": 0.37296992540359497, "rewards/answer_entity_reward": 0.7681002914905548, "rewards/answer_wer_reward": 0.724025309085846, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9740117192268372, "step": 29 }, { "completion_length": 199.59375, "epoch": 0.096, "grad_norm": 4.711977481842041, "kl": 0.01727294921875, "learning_rate": 9.637499999999999e-07, "loss": 0.0002, "reward": 3.7957680225372314, "reward_std": 0.10022839158773422, "rewards/answer_entity_reward": 0.9259244203567505, "rewards/answer_wer_reward": 0.8810202181339264, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9888232052326202, "step": 30 }, { "completion_length": 227.71875, "epoch": 0.0992, "grad_norm": 8.605613708496094, "kl": 0.016021728515625, "learning_rate": 9.624999999999999e-07, "loss": 0.0002, "reward": 3.6433751583099365, "reward_std": 0.19832589477300644, "rewards/answer_entity_reward": 0.8819950520992279, "rewards/answer_wer_reward": 0.832177460193634, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9292027056217194, "step": 31 }, { "completion_length": 215.65625, "epoch": 0.1024, "grad_norm": 3.5583388805389404, "kl": 0.0224609375, "learning_rate": 9.6125e-07, "loss": 0.0002, "reward": 3.516916036605835, "reward_std": 0.29861560463905334, "rewards/answer_entity_reward": 0.8093456923961639, "rewards/answer_wer_reward": 0.7672389149665833, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9403314590454102, "step": 32 }, { "completion_length": 255.8125, "epoch": 0.1056, "grad_norm": 3.647063970565796, "kl": 0.009185791015625, "learning_rate": 9.6e-07, "loss": 0.0001, "reward": 3.5868738889694214, "reward_std": 0.2677561491727829, "rewards/answer_entity_reward": 0.818858414888382, "rewards/answer_wer_reward": 0.7967112958431244, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9713042676448822, "step": 33 }, { "completion_length": 223.25, "epoch": 0.1088, "grad_norm": 4.442183017730713, "kl": 0.02569580078125, "learning_rate": 9.5875e-07, "loss": 0.0003, "reward": 3.6685177087783813, "reward_std": 0.16033701971173286, "rewards/answer_entity_reward": 0.8931982815265656, "rewards/answer_wer_reward": 0.8027337491512299, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.972585529088974, "step": 34 }, { "completion_length": 225.09375, "epoch": 0.112, "grad_norm": 1.850151538848877, "kl": 0.0135498046875, "learning_rate": 9.575e-07, "loss": 0.0001, "reward": 3.622478485107422, "reward_std": 0.15638228505849838, "rewards/answer_entity_reward": 0.8341188132762909, "rewards/answer_wer_reward": 0.8296426832675934, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9587167799472809, "step": 35 }, { "completion_length": 182.75, "epoch": 0.1152, "grad_norm": 3.844250202178955, "kl": 0.100982666015625, "learning_rate": 9.5625e-07, "loss": 0.001, "reward": 3.575288772583008, "reward_std": 0.3447410613298416, "rewards/answer_entity_reward": 0.8734935224056244, "rewards/answer_wer_reward": 0.8351728320121765, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8978723287582397, "step": 36 }, { "completion_length": 170.71875, "epoch": 0.1184, "grad_norm": 3.608771800994873, "kl": 0.0318603515625, "learning_rate": 9.55e-07, "loss": 0.0003, "reward": 3.757541060447693, "reward_std": 0.16554252058267593, "rewards/answer_entity_reward": 0.9673819839954376, "rewards/answer_wer_reward": 0.8668203055858612, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9233386218547821, "step": 37 }, { "completion_length": 252.0, "epoch": 0.1216, "grad_norm": 2.063748836517334, "kl": 0.01507568359375, "learning_rate": 9.5375e-07, "loss": 0.0001, "reward": 3.716595768928528, "reward_std": 0.10926416516304016, "rewards/answer_entity_reward": 0.8833416402339935, "rewards/answer_wer_reward": 0.8585604727268219, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9746935665607452, "step": 38 }, { "completion_length": 231.21875, "epoch": 0.1248, "grad_norm": 2.751699447631836, "kl": 0.0213623046875, "learning_rate": 9.525e-07, "loss": 0.0002, "reward": 3.539994239807129, "reward_std": 0.1212783083319664, "rewards/answer_entity_reward": 0.7954491972923279, "rewards/answer_wer_reward": 0.7638055980205536, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.980739563703537, "step": 39 }, { "completion_length": 216.09375, "epoch": 0.128, "grad_norm": 2.074568033218384, "kl": 0.0379638671875, "learning_rate": 9.5125e-07, "loss": 0.0004, "reward": 3.6039533615112305, "reward_std": 0.26473698019981384, "rewards/answer_entity_reward": 0.8746186196804047, "rewards/answer_wer_reward": 0.8307992517948151, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9297854900360107, "step": 40 }, { "completion_length": 203.5, "epoch": 0.1312, "grad_norm": 3.2622625827789307, "kl": 0.05419921875, "learning_rate": 9.499999999999999e-07, "loss": 0.0005, "reward": 3.4951852560043335, "reward_std": 0.18541007116436958, "rewards/answer_entity_reward": 0.8705199360847473, "rewards/answer_wer_reward": 0.8321611285209656, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.7925041615962982, "step": 41 }, { "completion_length": 195.84375, "epoch": 0.1344, "grad_norm": 2.3474910259246826, "kl": 0.0272216796875, "learning_rate": 9.487499999999999e-07, "loss": 0.0003, "reward": 3.556153178215027, "reward_std": 0.22145777754485607, "rewards/answer_entity_reward": 0.9313356876373291, "rewards/answer_wer_reward": 0.7051927447319031, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9508746266365051, "step": 42 }, { "completion_length": 213.25, "epoch": 0.1376, "grad_norm": 2.805851697921753, "kl": 0.039794921875, "learning_rate": 9.474999999999999e-07, "loss": 0.0004, "reward": 3.4438276290893555, "reward_std": 0.306783527135849, "rewards/answer_entity_reward": 0.9020311534404755, "rewards/answer_wer_reward": 0.7658404111862183, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.838456004858017, "step": 43 }, { "completion_length": 237.6875, "epoch": 0.1408, "grad_norm": 1.9424443244934082, "kl": 0.04632568359375, "learning_rate": 9.462499999999999e-07, "loss": 0.0005, "reward": 3.6309977769851685, "reward_std": 0.2500930577516556, "rewards/answer_entity_reward": 0.8781489729881287, "rewards/answer_wer_reward": 0.8788634538650513, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9364852905273438, "step": 44 }, { "completion_length": 239.375, "epoch": 0.144, "grad_norm": 46.16355895996094, "kl": 0.0579833984375, "learning_rate": 9.45e-07, "loss": 0.0006, "reward": 3.5368932485580444, "reward_std": 0.43694401532411575, "rewards/answer_entity_reward": 0.919220894575119, "rewards/answer_wer_reward": 0.8205748200416565, "rewards/format_reward": 0.84375, "rewards/think_ocr_reward": 0.9533475041389465, "step": 45 }, { "completion_length": 173.84375, "epoch": 0.1472, "grad_norm": 3.7639763355255127, "kl": 0.0450439453125, "learning_rate": 9.4375e-07, "loss": 0.0004, "reward": 3.7322875261306763, "reward_std": 0.1945570409297943, "rewards/answer_entity_reward": 0.9228407144546509, "rewards/answer_wer_reward": 0.8905497789382935, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9813971817493439, "step": 46 }, { "completion_length": 147.09375, "epoch": 0.1504, "grad_norm": 4.257631301879883, "kl": 0.0538330078125, "learning_rate": 9.425e-07, "loss": 0.0005, "reward": 3.478027820587158, "reward_std": 0.2542489320039749, "rewards/answer_entity_reward": 0.8890827894210815, "rewards/answer_wer_reward": 0.7596322894096375, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8605626821517944, "step": 47 }, { "completion_length": 223.9375, "epoch": 0.1536, "grad_norm": 1.5165725946426392, "kl": 0.0335693359375, "learning_rate": 9.4125e-07, "loss": 0.0003, "reward": 3.695801019668579, "reward_std": 0.21276018023490906, "rewards/answer_entity_reward": 0.9133437275886536, "rewards/answer_wer_reward": 0.8821894526481628, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9315177500247955, "step": 48 }, { "completion_length": 196.15625, "epoch": 0.1568, "grad_norm": 2.7737293243408203, "kl": 0.04931640625, "learning_rate": 9.399999999999999e-07, "loss": 0.0005, "reward": 3.7317415475845337, "reward_std": 0.11913972720503807, "rewards/answer_entity_reward": 0.9534916281700134, "rewards/answer_wer_reward": 0.8561010956764221, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9533988535404205, "step": 49 }, { "completion_length": 192.21875, "epoch": 0.16, "grad_norm": 3.4223740100860596, "kl": 0.04052734375, "learning_rate": 9.387499999999999e-07, "loss": 0.0004, "reward": 3.65939998626709, "reward_std": 0.1464347057044506, "rewards/answer_entity_reward": 0.9478480219841003, "rewards/answer_wer_reward": 0.8836140036582947, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8279379308223724, "step": 50 }, { "completion_length": 170.75, "epoch": 0.1632, "grad_norm": 3.389747381210327, "kl": 0.0406494140625, "learning_rate": 9.374999999999999e-07, "loss": 0.0004, "reward": 3.6742804050445557, "reward_std": 0.21486516296863556, "rewards/answer_entity_reward": 0.9492871761322021, "rewards/answer_wer_reward": 0.8503031730651855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8746900260448456, "step": 51 }, { "completion_length": 249.78125, "epoch": 0.1664, "grad_norm": 1.3398560285568237, "kl": 0.0609130859375, "learning_rate": 9.3625e-07, "loss": 0.0006, "reward": 3.7340474128723145, "reward_std": 0.16536326706409454, "rewards/answer_entity_reward": 0.9178049564361572, "rewards/answer_wer_reward": 0.8599284589290619, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9563138782978058, "step": 52 }, { "completion_length": 243.53125, "epoch": 0.1696, "grad_norm": 2.292407512664795, "kl": 0.035400390625, "learning_rate": 9.35e-07, "loss": 0.0004, "reward": 3.6057465076446533, "reward_std": 0.1650264859199524, "rewards/answer_entity_reward": 0.942800760269165, "rewards/answer_wer_reward": 0.743953675031662, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9189921319484711, "step": 53 }, { "completion_length": 224.4375, "epoch": 0.1728, "grad_norm": 25.665359497070312, "kl": 0.03118896484375, "learning_rate": 9.3375e-07, "loss": 0.0003, "reward": 3.6430656909942627, "reward_std": 0.14360623061656952, "rewards/answer_entity_reward": 0.907882422208786, "rewards/answer_wer_reward": 0.7998041808605194, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9353790581226349, "step": 54 }, { "completion_length": 173.0625, "epoch": 0.176, "grad_norm": 4.687534809112549, "kl": 0.03643798828125, "learning_rate": 9.325e-07, "loss": 0.0004, "reward": 3.776802897453308, "reward_std": 0.10255010426044464, "rewards/answer_entity_reward": 0.9577985405921936, "rewards/answer_wer_reward": 0.8955680429935455, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9234363734722137, "step": 55 }, { "completion_length": 241.84375, "epoch": 0.1792, "grad_norm": 2.1417253017425537, "kl": 0.02978515625, "learning_rate": 9.3125e-07, "loss": 0.0003, "reward": 3.7508766651153564, "reward_std": 0.12244473025202751, "rewards/answer_entity_reward": 0.9196350574493408, "rewards/answer_wer_reward": 0.8353821933269501, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.995859295129776, "step": 56 }, { "completion_length": 214.15625, "epoch": 0.1824, "grad_norm": 2.977281332015991, "kl": 0.03302001953125, "learning_rate": 9.3e-07, "loss": 0.0003, "reward": 3.77036452293396, "reward_std": 0.18844036478549242, "rewards/answer_entity_reward": 0.9284944236278534, "rewards/answer_wer_reward": 0.8541653454303741, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9877048432826996, "step": 57 }, { "completion_length": 245.9375, "epoch": 0.1856, "grad_norm": 1.5624388456344604, "kl": 0.0296630859375, "learning_rate": 9.287499999999999e-07, "loss": 0.0003, "reward": 3.7977479696273804, "reward_std": 0.08727182075381279, "rewards/answer_entity_reward": 0.9509085714817047, "rewards/answer_wer_reward": 0.8592260181903839, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9876134395599365, "step": 58 }, { "completion_length": 232.65625, "epoch": 0.1888, "grad_norm": 55.87119674682617, "kl": 0.047607421875, "learning_rate": 9.274999999999999e-07, "loss": 0.0005, "reward": 3.6933377981185913, "reward_std": 0.24168139696121216, "rewards/answer_entity_reward": 0.9402236640453339, "rewards/answer_wer_reward": 0.8164783418178558, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9366357624530792, "step": 59 }, { "completion_length": 221.65625, "epoch": 0.192, "grad_norm": 1.8363709449768066, "kl": 0.04156494140625, "learning_rate": 9.2625e-07, "loss": 0.0004, "reward": 3.8290294408798218, "reward_std": 0.08228548988699913, "rewards/answer_entity_reward": 0.9317659735679626, "rewards/answer_wer_reward": 0.9017607867717743, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.995502769947052, "step": 60 }, { "completion_length": 207.5625, "epoch": 0.1952, "grad_norm": 5.360762119293213, "kl": 0.03662109375, "learning_rate": 9.25e-07, "loss": 0.0004, "reward": 3.4508965015411377, "reward_std": 0.24354729056358337, "rewards/answer_entity_reward": 0.8888726234436035, "rewards/answer_wer_reward": 0.6527576148509979, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9092662334442139, "step": 61 }, { "completion_length": 175.3125, "epoch": 0.1984, "grad_norm": 6.900688171386719, "kl": 0.0562744140625, "learning_rate": 9.237499999999999e-07, "loss": 0.0006, "reward": 3.5809485912323, "reward_std": 0.27670779824256897, "rewards/answer_entity_reward": 0.875405490398407, "rewards/answer_wer_reward": 0.846805214881897, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8587377667427063, "step": 62 }, { "completion_length": 167.21875, "epoch": 0.2016, "grad_norm": 3.296032667160034, "kl": 0.03668212890625, "learning_rate": 9.225e-07, "loss": 0.0004, "reward": 3.775553345680237, "reward_std": 0.1621587909758091, "rewards/answer_entity_reward": 0.9595959782600403, "rewards/answer_wer_reward": 0.900894969701767, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9150623679161072, "step": 63 }, { "completion_length": 216.0625, "epoch": 0.2048, "grad_norm": 3.287728786468506, "kl": 0.05419921875, "learning_rate": 9.2125e-07, "loss": 0.0005, "reward": 3.580909013748169, "reward_std": 0.37151331454515457, "rewards/answer_entity_reward": 0.9558238685131073, "rewards/answer_wer_reward": 0.816798210144043, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8395369946956635, "step": 64 }, { "completion_length": 242.1875, "epoch": 0.208, "grad_norm": 4.7966766357421875, "kl": 0.0389404296875, "learning_rate": 9.2e-07, "loss": 0.0004, "reward": 3.5479079484939575, "reward_std": 0.34015993028879166, "rewards/answer_entity_reward": 0.9070779979228973, "rewards/answer_wer_reward": 0.7606107890605927, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9114691615104675, "step": 65 }, { "completion_length": 182.9375, "epoch": 0.2112, "grad_norm": 4.85190486907959, "kl": 0.0411376953125, "learning_rate": 9.187499999999999e-07, "loss": 0.0004, "reward": 3.759209156036377, "reward_std": 0.030521959997713566, "rewards/answer_entity_reward": 0.9572916924953461, "rewards/answer_wer_reward": 0.9216786324977875, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.880238950252533, "step": 66 }, { "completion_length": 197.46875, "epoch": 0.2144, "grad_norm": 2.888380765914917, "kl": 0.03271484375, "learning_rate": 9.174999999999999e-07, "loss": 0.0003, "reward": 3.86090886592865, "reward_std": 0.08941158838570118, "rewards/answer_entity_reward": 0.974116176366806, "rewards/answer_wer_reward": 0.9031813442707062, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9836114048957825, "step": 67 }, { "completion_length": 212.03125, "epoch": 0.2176, "grad_norm": 0.9500738382339478, "kl": 0.03057861328125, "learning_rate": 9.1625e-07, "loss": 0.0003, "reward": 3.865835189819336, "reward_std": 0.04183580353856087, "rewards/answer_entity_reward": 0.9732177555561066, "rewards/answer_wer_reward": 0.8951224386692047, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974949955940247, "step": 68 }, { "completion_length": 168.09375, "epoch": 0.2208, "grad_norm": 4.705175876617432, "kl": 0.03704833984375, "learning_rate": 9.15e-07, "loss": 0.0004, "reward": 3.6963913440704346, "reward_std": 0.16030436754226685, "rewards/answer_entity_reward": 0.932018518447876, "rewards/answer_wer_reward": 0.8480667769908905, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9163061678409576, "step": 69 }, { "completion_length": 193.21875, "epoch": 0.224, "grad_norm": 2.125580310821533, "kl": 0.03515625, "learning_rate": 9.137499999999999e-07, "loss": 0.0004, "reward": 3.8550466299057007, "reward_std": 0.06468157470226288, "rewards/answer_entity_reward": 0.9734202921390533, "rewards/answer_wer_reward": 0.8898061215877533, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9918202459812164, "step": 70 }, { "completion_length": 235.78125, "epoch": 0.2272, "grad_norm": 6.89145040512085, "kl": 0.042236328125, "learning_rate": 9.124999999999999e-07, "loss": 0.0004, "reward": 3.725824475288391, "reward_std": 0.05315144546329975, "rewards/answer_entity_reward": 0.9593958258628845, "rewards/answer_wer_reward": 0.8827618062496185, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8836667835712433, "step": 71 }, { "completion_length": 210.1875, "epoch": 0.2304, "grad_norm": 3.6971681118011475, "kl": 0.0343017578125, "learning_rate": 9.1125e-07, "loss": 0.0003, "reward": 3.719637870788574, "reward_std": 0.10697400569915771, "rewards/answer_entity_reward": 0.9880050718784332, "rewards/answer_wer_reward": 0.7961998879909515, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9354328513145447, "step": 72 }, { "completion_length": 216.5625, "epoch": 0.2336, "grad_norm": 17.082843780517578, "kl": 0.0537109375, "learning_rate": 9.1e-07, "loss": 0.0005, "reward": 3.6063274145126343, "reward_std": 0.2845265045762062, "rewards/answer_entity_reward": 0.9374077320098877, "rewards/answer_wer_reward": 0.7878484427928925, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.912321150302887, "step": 73 }, { "completion_length": 234.90625, "epoch": 0.2368, "grad_norm": 1.9695632457733154, "kl": 0.031982421875, "learning_rate": 9.087499999999999e-07, "loss": 0.0003, "reward": 3.762009024620056, "reward_std": 0.06560477986931801, "rewards/answer_entity_reward": 0.9398341476917267, "rewards/answer_wer_reward": 0.8473882973194122, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9747865796089172, "step": 74 }, { "completion_length": 233.65625, "epoch": 0.24, "grad_norm": 1.8333961963653564, "kl": 0.0479736328125, "learning_rate": 9.074999999999999e-07, "loss": 0.0005, "reward": 3.6872040033340454, "reward_std": 0.12730678915977478, "rewards/answer_entity_reward": 0.9421398341655731, "rewards/answer_wer_reward": 0.8499290347099304, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.895135223865509, "step": 75 }, { "completion_length": 138.8125, "epoch": 0.2432, "grad_norm": 2.518507719039917, "kl": 0.0504150390625, "learning_rate": 9.0625e-07, "loss": 0.0005, "reward": 3.751777410507202, "reward_std": 0.18188580125570297, "rewards/answer_entity_reward": 0.9270697832107544, "rewards/answer_wer_reward": 0.9237564206123352, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9009511768817902, "step": 76 }, { "completion_length": 261.0, "epoch": 0.2464, "grad_norm": 4.395165920257568, "kl": 0.03662109375, "learning_rate": 9.05e-07, "loss": 0.0004, "reward": 3.602410674095154, "reward_std": 0.12657387554645538, "rewards/answer_entity_reward": 0.8546798527240753, "rewards/answer_wer_reward": 0.794090747833252, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9536401331424713, "step": 77 }, { "completion_length": 221.90625, "epoch": 0.2496, "grad_norm": 1.2728471755981445, "kl": 0.0294189453125, "learning_rate": 9.0375e-07, "loss": 0.0003, "reward": 3.788708806037903, "reward_std": 0.09669506549835205, "rewards/answer_entity_reward": 0.9447909295558929, "rewards/answer_wer_reward": 0.8481404483318329, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9957774579524994, "step": 78 }, { "completion_length": 254.5625, "epoch": 0.2528, "grad_norm": 9.725419998168945, "kl": 0.07373046875, "learning_rate": 9.024999999999999e-07, "loss": 0.0007, "reward": 3.668743133544922, "reward_std": 0.1221558079123497, "rewards/answer_entity_reward": 0.9435009658336639, "rewards/answer_wer_reward": 0.8254426419734955, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8997994065284729, "step": 79 }, { "completion_length": 196.375, "epoch": 0.256, "grad_norm": 2.1853079795837402, "kl": 0.0361328125, "learning_rate": 9.0125e-07, "loss": 0.0004, "reward": 3.6546449661254883, "reward_std": 0.1971728727221489, "rewards/answer_entity_reward": 0.9472028017044067, "rewards/answer_wer_reward": 0.8720800876617432, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8666120767593384, "step": 80 }, { "completion_length": 248.25, "epoch": 0.2592, "grad_norm": 3.1572227478027344, "kl": 0.03375244140625, "learning_rate": 9e-07, "loss": 0.0003, "reward": 3.7423981428146362, "reward_std": 0.10061750188469887, "rewards/answer_entity_reward": 0.9398939311504364, "rewards/answer_wer_reward": 0.8211633265018463, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9813408553600311, "step": 81 }, { "completion_length": 238.40625, "epoch": 0.2624, "grad_norm": 1.5329415798187256, "kl": 0.03125, "learning_rate": 8.9875e-07, "loss": 0.0003, "reward": 3.874926447868347, "reward_std": 0.03685523197054863, "rewards/answer_entity_reward": 0.9718094170093536, "rewards/answer_wer_reward": 0.9062533378601074, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968636631965637, "step": 82 }, { "completion_length": 222.5, "epoch": 0.2656, "grad_norm": 2.012899875640869, "kl": 0.0419921875, "learning_rate": 8.974999999999999e-07, "loss": 0.0004, "reward": 3.8047072887420654, "reward_std": 0.046287354081869125, "rewards/answer_entity_reward": 0.9534181356430054, "rewards/answer_wer_reward": 0.8727244138717651, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9785646796226501, "step": 83 }, { "completion_length": 225.21875, "epoch": 0.2688, "grad_norm": 1.5400514602661133, "kl": 0.0380859375, "learning_rate": 8.9625e-07, "loss": 0.0004, "reward": 3.718083620071411, "reward_std": 0.1703677996993065, "rewards/answer_entity_reward": 0.9013731181621552, "rewards/answer_wer_reward": 0.8260438740253448, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9906666278839111, "step": 84 }, { "completion_length": 236.125, "epoch": 0.272, "grad_norm": 1.6224849224090576, "kl": 0.0550537109375, "learning_rate": 8.95e-07, "loss": 0.0005, "reward": 3.8032166957855225, "reward_std": 0.0796846654266119, "rewards/answer_entity_reward": 0.9553452134132385, "rewards/answer_wer_reward": 0.8544089794158936, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9934625327587128, "step": 85 }, { "completion_length": 214.34375, "epoch": 0.2752, "grad_norm": 3.1244239807128906, "kl": 0.032470703125, "learning_rate": 8.9375e-07, "loss": 0.0003, "reward": 3.803860068321228, "reward_std": 0.06684968620538712, "rewards/answer_entity_reward": 0.9671759307384491, "rewards/answer_wer_reward": 0.9067878127098083, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9298965036869049, "step": 86 }, { "completion_length": 216.9375, "epoch": 0.2784, "grad_norm": 1.8527048826217651, "kl": 0.02996826171875, "learning_rate": 8.924999999999999e-07, "loss": 0.0003, "reward": 3.813448429107666, "reward_std": 0.05041965842247009, "rewards/answer_entity_reward": 0.9224496483802795, "rewards/answer_wer_reward": 0.8932149708271027, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977837204933167, "step": 87 }, { "completion_length": 211.8125, "epoch": 0.2816, "grad_norm": 2.733228921890259, "kl": 0.05126953125, "learning_rate": 8.912499999999999e-07, "loss": 0.0005, "reward": 3.8481240272521973, "reward_std": 0.0621240958571434, "rewards/answer_entity_reward": 0.9627074301242828, "rewards/answer_wer_reward": 0.9041622579097748, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9812542796134949, "step": 88 }, { "completion_length": 202.5, "epoch": 0.2848, "grad_norm": 4.8413496017456055, "kl": 0.0433349609375, "learning_rate": 8.9e-07, "loss": 0.0004, "reward": 3.668493866920471, "reward_std": 0.08999799937009811, "rewards/answer_entity_reward": 0.96169114112854, "rewards/answer_wer_reward": 0.7791127562522888, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.92768993973732, "step": 89 }, { "completion_length": 214.6875, "epoch": 0.288, "grad_norm": 4.111961841583252, "kl": 0.04638671875, "learning_rate": 8.8875e-07, "loss": 0.0005, "reward": 3.7720965147018433, "reward_std": 0.18014637380838394, "rewards/answer_entity_reward": 0.9866696000099182, "rewards/answer_wer_reward": 0.8934727013111115, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9232043027877808, "step": 90 }, { "completion_length": 241.5625, "epoch": 0.2912, "grad_norm": 1.4061272144317627, "kl": 0.0460205078125, "learning_rate": 8.874999999999999e-07, "loss": 0.0005, "reward": 3.828965663909912, "reward_std": 0.04340291768312454, "rewards/answer_entity_reward": 0.9683369398117065, "rewards/answer_wer_reward": 0.8706588447093964, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9899699091911316, "step": 91 }, { "completion_length": 247.9375, "epoch": 0.2944, "grad_norm": 1.6669530868530273, "kl": 0.0611572265625, "learning_rate": 8.8625e-07, "loss": 0.0006, "reward": 3.7649370431900024, "reward_std": 0.1087912805378437, "rewards/answer_entity_reward": 0.9332223832607269, "rewards/answer_wer_reward": 0.8357318043708801, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9959828853607178, "step": 92 }, { "completion_length": 162.625, "epoch": 0.2976, "grad_norm": 5.615991115570068, "kl": 0.058837890625, "learning_rate": 8.85e-07, "loss": 0.0006, "reward": 3.8870660066604614, "reward_std": 0.09454158693552017, "rewards/answer_entity_reward": 0.9939196705818176, "rewards/answer_wer_reward": 0.9443124830722809, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9488338530063629, "step": 93 }, { "completion_length": 256.625, "epoch": 0.3008, "grad_norm": 2.879868984222412, "kl": 0.2406005859375, "learning_rate": 8.8375e-07, "loss": 0.0024, "reward": 3.6465322971343994, "reward_std": 0.23435086756944656, "rewards/answer_entity_reward": 0.9395784735679626, "rewards/answer_wer_reward": 0.7715516090393066, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9666522741317749, "step": 94 }, { "completion_length": 254.15625, "epoch": 0.304, "grad_norm": 6.1645121574401855, "kl": 0.262939453125, "learning_rate": 8.824999999999999e-07, "loss": 0.0026, "reward": 3.728961229324341, "reward_std": 0.11308889091014862, "rewards/answer_entity_reward": 0.9466511011123657, "rewards/answer_wer_reward": 0.8248744010925293, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9574357271194458, "step": 95 }, { "completion_length": 202.3125, "epoch": 0.3072, "grad_norm": 2.2792811393737793, "kl": 0.0501708984375, "learning_rate": 8.812499999999999e-07, "loss": 0.0005, "reward": 3.856202244758606, "reward_std": 0.05682223103940487, "rewards/answer_entity_reward": 0.9909722208976746, "rewards/answer_wer_reward": 0.9035914540290833, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9616385698318481, "step": 96 }, { "completion_length": 222.53125, "epoch": 0.3104, "grad_norm": 2.4435033798217773, "kl": 0.051513671875, "learning_rate": 8.799999999999999e-07, "loss": 0.0005, "reward": 3.8195481300354004, "reward_std": 0.08100517094135284, "rewards/answer_entity_reward": 0.979785680770874, "rewards/answer_wer_reward": 0.8738153576850891, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.965947151184082, "step": 97 }, { "completion_length": 202.375, "epoch": 0.3136, "grad_norm": 1.7632919549942017, "kl": 0.0357666015625, "learning_rate": 8.7875e-07, "loss": 0.0004, "reward": 3.7597837448120117, "reward_std": 0.061054665595293045, "rewards/answer_entity_reward": 0.9468090534210205, "rewards/answer_wer_reward": 0.8723107874393463, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9406639635562897, "step": 98 }, { "completion_length": 207.53125, "epoch": 0.3168, "grad_norm": 7.402034282684326, "kl": 0.04736328125, "learning_rate": 8.774999999999999e-07, "loss": 0.0005, "reward": 3.7576065063476562, "reward_std": 0.04146904498338699, "rewards/answer_entity_reward": 0.9389799237251282, "rewards/answer_wer_reward": 0.8201505243778229, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984759986400604, "step": 99 }, { "completion_length": 210.15625, "epoch": 0.32, "grad_norm": 1.5828880071640015, "kl": 0.0450439453125, "learning_rate": 8.7625e-07, "loss": 0.0004, "reward": 3.835609197616577, "reward_std": 0.12980258837342262, "rewards/answer_entity_reward": 0.9350627064704895, "rewards/answer_wer_reward": 0.9053294062614441, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9952171444892883, "step": 100 }, { "completion_length": 214.34375, "epoch": 0.3232, "grad_norm": 5.768563270568848, "kl": 0.055908203125, "learning_rate": 8.75e-07, "loss": 0.0006, "reward": 3.611391305923462, "reward_std": 0.2522353269159794, "rewards/answer_entity_reward": 0.9709455966949463, "rewards/answer_wer_reward": 0.7870493829250336, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8846463263034821, "step": 101 }, { "completion_length": 223.0, "epoch": 0.3264, "grad_norm": 7.32905387878418, "kl": 0.0755615234375, "learning_rate": 8.7375e-07, "loss": 0.0008, "reward": 3.7200475931167603, "reward_std": 0.18947013467550278, "rewards/answer_entity_reward": 0.9668727219104767, "rewards/answer_wer_reward": 0.8209056556224823, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9635193049907684, "step": 102 }, { "completion_length": 229.96875, "epoch": 0.3296, "grad_norm": 0.9038276672363281, "kl": 0.0411376953125, "learning_rate": 8.725e-07, "loss": 0.0004, "reward": 3.862263560295105, "reward_std": 0.03179450985044241, "rewards/answer_entity_reward": 0.9754428863525391, "rewards/answer_wer_reward": 0.8931463062763214, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9936743974685669, "step": 103 }, { "completion_length": 265.3125, "epoch": 0.3328, "grad_norm": 1.3424818515777588, "kl": 0.0357666015625, "learning_rate": 8.712499999999999e-07, "loss": 0.0004, "reward": 3.7375279664993286, "reward_std": 0.07805093377828598, "rewards/answer_entity_reward": 0.9291824698448181, "rewards/answer_wer_reward": 0.8219992816448212, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9863462746143341, "step": 104 }, { "completion_length": 252.78125, "epoch": 0.336, "grad_norm": 1.2035622596740723, "kl": 0.033203125, "learning_rate": 8.699999999999999e-07, "loss": 0.0003, "reward": 3.8339978456497192, "reward_std": 0.05473129637539387, "rewards/answer_entity_reward": 0.9795939922332764, "rewards/answer_wer_reward": 0.8663320243358612, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9880718290805817, "step": 105 }, { "completion_length": 162.96875, "epoch": 0.3392, "grad_norm": 7.1932783126831055, "kl": 0.06005859375, "learning_rate": 8.687499999999999e-07, "loss": 0.0006, "reward": 3.8799617290496826, "reward_std": 0.0983762014657259, "rewards/answer_entity_reward": 0.9810606241226196, "rewards/answer_wer_reward": 0.9314018487930298, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9674993753433228, "step": 106 }, { "completion_length": 225.625, "epoch": 0.3424, "grad_norm": 5.7709455490112305, "kl": 0.05291748046875, "learning_rate": 8.675000000000001e-07, "loss": 0.0005, "reward": 3.7411450147628784, "reward_std": 0.2322532683610916, "rewards/answer_entity_reward": 0.9225597083568573, "rewards/answer_wer_reward": 0.8556761145591736, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9941591918468475, "step": 107 }, { "completion_length": 194.78125, "epoch": 0.3456, "grad_norm": 5.741571426391602, "kl": 0.0556640625, "learning_rate": 8.6625e-07, "loss": 0.0006, "reward": 3.867335319519043, "reward_std": 0.03972470294684172, "rewards/answer_entity_reward": 0.9573142230510712, "rewards/answer_wer_reward": 0.9156463444232941, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9943746328353882, "step": 108 }, { "completion_length": 228.5, "epoch": 0.3488, "grad_norm": 6.1572089195251465, "kl": 0.05859375, "learning_rate": 8.65e-07, "loss": 0.0006, "reward": 3.673606753349304, "reward_std": 0.08745867013931274, "rewards/answer_entity_reward": 0.9391757845878601, "rewards/answer_wer_reward": 0.8806695938110352, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8537613451480865, "step": 109 }, { "completion_length": 216.5, "epoch": 0.352, "grad_norm": 2.032820701599121, "kl": 0.0518798828125, "learning_rate": 8.6375e-07, "loss": 0.0005, "reward": 3.6381773948669434, "reward_std": 0.11543435975909233, "rewards/answer_entity_reward": 0.9743416607379913, "rewards/answer_wer_reward": 0.7632936537265778, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9005421102046967, "step": 110 }, { "completion_length": 220.96875, "epoch": 0.3552, "grad_norm": 4.737320423126221, "kl": 0.0751953125, "learning_rate": 8.625e-07, "loss": 0.0008, "reward": 3.823172926902771, "reward_std": 0.04683285113424063, "rewards/answer_entity_reward": 0.9827152192592621, "rewards/answer_wer_reward": 0.8539322018623352, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9865254759788513, "step": 111 }, { "completion_length": 248.75, "epoch": 0.3584, "grad_norm": 3.7395012378692627, "kl": 0.0484619140625, "learning_rate": 8.612499999999999e-07, "loss": 0.0005, "reward": 3.835617423057556, "reward_std": 0.039440929889678955, "rewards/answer_entity_reward": 0.9718195497989655, "rewards/answer_wer_reward": 0.8654404282569885, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983574748039246, "step": 112 }, { "completion_length": 218.75, "epoch": 0.3616, "grad_norm": 3.5470447540283203, "kl": 0.10302734375, "learning_rate": 8.599999999999999e-07, "loss": 0.001, "reward": 3.766317844390869, "reward_std": 0.0799998790025711, "rewards/answer_entity_reward": 0.9724812507629395, "rewards/answer_wer_reward": 0.8530462384223938, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9407903254032135, "step": 113 }, { "completion_length": 231.4375, "epoch": 0.3648, "grad_norm": 4.614479064941406, "kl": 0.060546875, "learning_rate": 8.587499999999999e-07, "loss": 0.0006, "reward": 3.828564405441284, "reward_std": 0.030111415311694145, "rewards/answer_entity_reward": 0.9710638523101807, "rewards/answer_wer_reward": 0.866461992263794, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.991038590669632, "step": 114 }, { "completion_length": 237.875, "epoch": 0.368, "grad_norm": 1.1590646505355835, "kl": 0.046142578125, "learning_rate": 8.575e-07, "loss": 0.0005, "reward": 3.870112419128418, "reward_std": 0.051995884627103806, "rewards/answer_entity_reward": 0.9711016416549683, "rewards/answer_wer_reward": 0.9016274213790894, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973834156990051, "step": 115 }, { "completion_length": 234.59375, "epoch": 0.3712, "grad_norm": 1.4525243043899536, "kl": 0.113525390625, "learning_rate": 8.5625e-07, "loss": 0.0011, "reward": 3.755509376525879, "reward_std": 0.10925759375095367, "rewards/answer_entity_reward": 0.9556345045566559, "rewards/answer_wer_reward": 0.8394978046417236, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9603771269321442, "step": 116 }, { "completion_length": 220.4375, "epoch": 0.3744, "grad_norm": 1.6397019624710083, "kl": 0.15234375, "learning_rate": 8.55e-07, "loss": 0.0015, "reward": 3.829906702041626, "reward_std": 0.03734264615923166, "rewards/answer_entity_reward": 0.9839539229869843, "rewards/answer_wer_reward": 0.8527026474475861, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9932501018047333, "step": 117 }, { "completion_length": 209.53125, "epoch": 0.3776, "grad_norm": 3.598604440689087, "kl": 0.07861328125, "learning_rate": 8.5375e-07, "loss": 0.0008, "reward": 3.7239131927490234, "reward_std": 0.07304626516997814, "rewards/answer_entity_reward": 0.9540751278400421, "rewards/answer_wer_reward": 0.8128292262554169, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9570088684558868, "step": 118 }, { "completion_length": 207.71875, "epoch": 0.3808, "grad_norm": 1.9592057466506958, "kl": 0.0435791015625, "learning_rate": 8.525e-07, "loss": 0.0004, "reward": 3.8095905780792236, "reward_std": 0.15753451362252235, "rewards/answer_entity_reward": 0.9857954680919647, "rewards/answer_wer_reward": 0.9040109515190125, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9510340690612793, "step": 119 }, { "completion_length": 245.90625, "epoch": 0.384, "grad_norm": 1.7574220895767212, "kl": 0.0609130859375, "learning_rate": 8.512499999999999e-07, "loss": 0.0006, "reward": 3.854837656021118, "reward_std": 0.0384799987077713, "rewards/answer_entity_reward": 0.9729723632335663, "rewards/answer_wer_reward": 0.8832501769065857, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998615026473999, "step": 120 }, { "completion_length": 187.15625, "epoch": 0.3872, "grad_norm": 8.7343168258667, "kl": 0.0494384765625, "learning_rate": 8.499999999999999e-07, "loss": 0.0005, "reward": 3.7950538396835327, "reward_std": 0.09329042956233025, "rewards/answer_entity_reward": 0.9599206745624542, "rewards/answer_wer_reward": 0.9000534117221832, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.93507981300354, "step": 121 }, { "completion_length": 203.84375, "epoch": 0.3904, "grad_norm": 3.7000162601470947, "kl": 0.062744140625, "learning_rate": 8.487499999999999e-07, "loss": 0.0006, "reward": 3.8687225580215454, "reward_std": 0.03621992561966181, "rewards/answer_entity_reward": 0.9873106181621552, "rewards/answer_wer_reward": 0.8855177164077759, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9958942830562592, "step": 122 }, { "completion_length": 205.84375, "epoch": 0.3936, "grad_norm": 9.27507209777832, "kl": 0.0570068359375, "learning_rate": 8.475e-07, "loss": 0.0006, "reward": 3.7104525566101074, "reward_std": 0.05549425818026066, "rewards/answer_entity_reward": 0.955735981464386, "rewards/answer_wer_reward": 0.8933148980140686, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.86140176653862, "step": 123 }, { "completion_length": 246.46875, "epoch": 0.3968, "grad_norm": 2.3181021213531494, "kl": 0.0404052734375, "learning_rate": 8.462499999999999e-07, "loss": 0.0004, "reward": 3.821496605873108, "reward_std": 0.09581143222749233, "rewards/answer_entity_reward": 0.9655607342720032, "rewards/answer_wer_reward": 0.8666167855262756, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9893191456794739, "step": 124 }, { "completion_length": 206.3125, "epoch": 0.4, "grad_norm": 1.5352882146835327, "kl": 0.055419921875, "learning_rate": 8.45e-07, "loss": 0.0006, "reward": 3.831603527069092, "reward_std": 0.08168897591531277, "rewards/answer_entity_reward": 0.9702457189559937, "rewards/answer_wer_reward": 0.9070821702480316, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9542755484580994, "step": 125 }, { "completion_length": 240.28125, "epoch": 0.4032, "grad_norm": 1.380315899848938, "kl": 0.05908203125, "learning_rate": 8.4375e-07, "loss": 0.0006, "reward": 3.7971588373184204, "reward_std": 0.10537005960941315, "rewards/answer_entity_reward": 0.9396995604038239, "rewards/answer_wer_reward": 0.8588653802871704, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985939264297485, "step": 126 }, { "completion_length": 206.84375, "epoch": 0.4064, "grad_norm": 1.5937124490737915, "kl": 0.056884765625, "learning_rate": 8.425e-07, "loss": 0.0006, "reward": 3.8375606536865234, "reward_std": 0.047878991812467575, "rewards/answer_entity_reward": 0.9553684592247009, "rewards/answer_wer_reward": 0.8867217302322388, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9954704642295837, "step": 127 }, { "completion_length": 214.1875, "epoch": 0.4096, "grad_norm": 1.3648440837860107, "kl": 0.0687255859375, "learning_rate": 8.4125e-07, "loss": 0.0007, "reward": 3.8555803298950195, "reward_std": 0.05176056548953056, "rewards/answer_entity_reward": 0.9823863804340363, "rewards/answer_wer_reward": 0.8972643911838531, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9759295582771301, "step": 128 }, { "completion_length": 215.90625, "epoch": 0.4128, "grad_norm": 1.4308183193206787, "kl": 0.0390625, "learning_rate": 8.399999999999999e-07, "loss": 0.0004, "reward": 3.8976725339889526, "reward_std": 0.016966319642961025, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9021182060241699, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997209906578064, "step": 129 }, { "completion_length": 189.1875, "epoch": 0.416, "grad_norm": 7.785026550292969, "kl": 0.0506591796875, "learning_rate": 8.387499999999999e-07, "loss": 0.0005, "reward": 3.7563494443893433, "reward_std": 0.12806903570890427, "rewards/answer_entity_reward": 0.9905131459236145, "rewards/answer_wer_reward": 0.8918424248695374, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8739938735961914, "step": 130 }, { "completion_length": 211.21875, "epoch": 0.4192, "grad_norm": 6.029291152954102, "kl": 0.0860595703125, "learning_rate": 8.375e-07, "loss": 0.0009, "reward": 3.7876737117767334, "reward_std": 0.07924951426684856, "rewards/answer_entity_reward": 0.9788058996200562, "rewards/answer_wer_reward": 0.903822124004364, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9050455689430237, "step": 131 }, { "completion_length": 197.9375, "epoch": 0.4224, "grad_norm": 1.5226598978042603, "kl": 0.0865478515625, "learning_rate": 8.3625e-07, "loss": 0.0009, "reward": 3.8618096113204956, "reward_std": 0.024674754589796066, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.9131532609462738, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9549693167209625, "step": 132 }, { "completion_length": 178.28125, "epoch": 0.4256, "grad_norm": 4.81843376159668, "kl": 0.1806640625, "learning_rate": 8.349999999999999e-07, "loss": 0.0018, "reward": 3.8692500591278076, "reward_std": 0.0898860078305006, "rewards/answer_entity_reward": 0.9539262652397156, "rewards/answer_wer_reward": 0.9164533317089081, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988704919815063, "step": 133 }, { "completion_length": 214.3125, "epoch": 0.4288, "grad_norm": 3.702409267425537, "kl": 0.100341796875, "learning_rate": 8.3375e-07, "loss": 0.001, "reward": 3.7666897773742676, "reward_std": 0.05854834243655205, "rewards/answer_entity_reward": 0.9739753007888794, "rewards/answer_wer_reward": 0.8456098437309265, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9471046626567841, "step": 134 }, { "completion_length": 230.3125, "epoch": 0.432, "grad_norm": 4.869428634643555, "kl": 0.109619140625, "learning_rate": 8.325e-07, "loss": 0.0011, "reward": 3.837371587753296, "reward_std": 0.07383839413523674, "rewards/answer_entity_reward": 0.9623282849788666, "rewards/answer_wer_reward": 0.8914425075054169, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9836008548736572, "step": 135 }, { "completion_length": 222.84375, "epoch": 0.4352, "grad_norm": 1.1195542812347412, "kl": 0.0875244140625, "learning_rate": 8.3125e-07, "loss": 0.0009, "reward": 3.800593137741089, "reward_std": 0.05516563355922699, "rewards/answer_entity_reward": 0.977182537317276, "rewards/answer_wer_reward": 0.8409056067466736, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9825051128864288, "step": 136 }, { "completion_length": 156.28125, "epoch": 0.4384, "grad_norm": 2.307365655899048, "kl": 0.0631103515625, "learning_rate": 8.299999999999999e-07, "loss": 0.0006, "reward": 3.803721785545349, "reward_std": 0.1857592761516571, "rewards/answer_entity_reward": 0.9582379460334778, "rewards/answer_wer_reward": 0.9269835352897644, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9185003936290741, "step": 137 }, { "completion_length": 230.3125, "epoch": 0.4416, "grad_norm": 1.0649584531784058, "kl": 0.0577392578125, "learning_rate": 8.287499999999999e-07, "loss": 0.0006, "reward": 3.8693535327911377, "reward_std": 0.10830429336056113, "rewards/answer_entity_reward": 0.9776785671710968, "rewards/answer_wer_reward": 0.8930677771568298, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986072480678558, "step": 138 }, { "completion_length": 220.5, "epoch": 0.4448, "grad_norm": 3.627920150756836, "kl": 0.0648193359375, "learning_rate": 8.275e-07, "loss": 0.0006, "reward": 3.779549479484558, "reward_std": 0.04976406879723072, "rewards/answer_entity_reward": 0.9892225861549377, "rewards/answer_wer_reward": 0.8991544246673584, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.891172468662262, "step": 139 }, { "completion_length": 214.375, "epoch": 0.448, "grad_norm": 1.0832712650299072, "kl": 0.0511474609375, "learning_rate": 8.2625e-07, "loss": 0.0005, "reward": 3.866790771484375, "reward_std": 0.03637353144586086, "rewards/answer_entity_reward": 0.9854166805744171, "rewards/answer_wer_reward": 0.8826328217983246, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987412095069885, "step": 140 }, { "completion_length": 215.0, "epoch": 0.4512, "grad_norm": 4.865916728973389, "kl": 0.080810546875, "learning_rate": 8.249999999999999e-07, "loss": 0.0008, "reward": 3.782729744911194, "reward_std": 0.05014876648783684, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9396264553070068, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8483479917049408, "step": 141 }, { "completion_length": 235.1875, "epoch": 0.4544, "grad_norm": 3.832350730895996, "kl": 0.0489501953125, "learning_rate": 8.2375e-07, "loss": 0.0005, "reward": 3.8454935550689697, "reward_std": 0.02625620225444436, "rewards/answer_entity_reward": 0.9856643378734589, "rewards/answer_wer_reward": 0.9073578715324402, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9524714052677155, "step": 142 }, { "completion_length": 193.5625, "epoch": 0.4576, "grad_norm": 1.5562162399291992, "kl": 0.0836181640625, "learning_rate": 8.225e-07, "loss": 0.0008, "reward": 3.8764915466308594, "reward_std": 0.02105938969179988, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9281685054302216, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.952489823102951, "step": 143 }, { "completion_length": 192.5, "epoch": 0.4608, "grad_norm": 4.00892448425293, "kl": 0.065185546875, "learning_rate": 8.2125e-07, "loss": 0.0007, "reward": 3.9131712913513184, "reward_std": 0.025579220615327358, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9231057167053223, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9945298135280609, "step": 144 }, { "completion_length": 222.5625, "epoch": 0.464, "grad_norm": 6.250589370727539, "kl": 0.0546875, "learning_rate": 8.199999999999999e-07, "loss": 0.0005, "reward": 3.8917945623397827, "reward_std": 0.04113447107374668, "rewards/answer_entity_reward": 0.9717775583267212, "rewards/answer_wer_reward": 0.926241010427475, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9937759339809418, "step": 145 }, { "completion_length": 183.90625, "epoch": 0.4672, "grad_norm": 2.7752954959869385, "kl": 0.0670166015625, "learning_rate": 8.187499999999999e-07, "loss": 0.0007, "reward": 3.860864043235779, "reward_std": 0.06173134222626686, "rewards/answer_entity_reward": 0.9583333432674408, "rewards/answer_wer_reward": 0.9120890200138092, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9904417395591736, "step": 146 }, { "completion_length": 220.78125, "epoch": 0.4704, "grad_norm": 3.0674679279327393, "kl": 0.09912109375, "learning_rate": 8.175e-07, "loss": 0.001, "reward": 3.84165620803833, "reward_std": 0.03327286522835493, "rewards/answer_entity_reward": 0.9452651739120483, "rewards/answer_wer_reward": 0.8996314704418182, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996759682893753, "step": 147 }, { "completion_length": 162.8125, "epoch": 0.4736, "grad_norm": 4.559942245483398, "kl": 0.116455078125, "learning_rate": 8.1625e-07, "loss": 0.0012, "reward": 3.833083748817444, "reward_std": 0.06737112812697887, "rewards/answer_entity_reward": 0.9923878014087677, "rewards/answer_wer_reward": 0.902847170829773, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9378487467765808, "step": 148 }, { "completion_length": 221.53125, "epoch": 0.4768, "grad_norm": 1.3157752752304077, "kl": 0.052978515625, "learning_rate": 8.149999999999999e-07, "loss": 0.0005, "reward": 3.8545873165130615, "reward_std": 0.019355260767042637, "rewards/answer_entity_reward": 0.9938696324825287, "rewards/answer_wer_reward": 0.8627510368824005, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979668259620667, "step": 149 }, { "completion_length": 233.1875, "epoch": 0.48, "grad_norm": 4.352514743804932, "kl": 0.053955078125, "learning_rate": 8.137499999999999e-07, "loss": 0.0005, "reward": 3.8025535345077515, "reward_std": 0.0806161779910326, "rewards/answer_entity_reward": 0.9906516969203949, "rewards/answer_wer_reward": 0.8636212348937988, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9482806921005249, "step": 150 }, { "completion_length": 210.03125, "epoch": 0.4832, "grad_norm": 1.3691778182983398, "kl": 0.05615234375, "learning_rate": 8.125e-07, "loss": 0.0006, "reward": 3.860105037689209, "reward_std": 0.034908443689346313, "rewards/answer_entity_reward": 0.9873737394809723, "rewards/answer_wer_reward": 0.9285348653793335, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9441964328289032, "step": 151 }, { "completion_length": 244.625, "epoch": 0.4864, "grad_norm": 1.9329304695129395, "kl": 0.058837890625, "learning_rate": 8.1125e-07, "loss": 0.0006, "reward": 3.849783182144165, "reward_std": 0.029241922311484814, "rewards/answer_entity_reward": 0.9856617450714111, "rewards/answer_wer_reward": 0.8688266575336456, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9952948093414307, "step": 152 }, { "completion_length": 174.375, "epoch": 0.4896, "grad_norm": 5.655167579650879, "kl": 0.067138671875, "learning_rate": 8.1e-07, "loss": 0.0007, "reward": 3.85835599899292, "reward_std": 0.1141166789457202, "rewards/answer_entity_reward": 0.9663461446762085, "rewards/answer_wer_reward": 0.9284006357192993, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9636091589927673, "step": 153 }, { "completion_length": 185.875, "epoch": 0.4928, "grad_norm": 4.543191432952881, "kl": 0.084716796875, "learning_rate": 8.087499999999999e-07, "loss": 0.0008, "reward": 3.851526975631714, "reward_std": 0.0990656241774559, "rewards/answer_entity_reward": 0.9646950364112854, "rewards/answer_wer_reward": 0.9213105142116547, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.965521514415741, "step": 154 }, { "completion_length": 209.0625, "epoch": 0.496, "grad_norm": 2.554072380065918, "kl": 0.0572509765625, "learning_rate": 8.075e-07, "loss": 0.0006, "reward": 3.790269613265991, "reward_std": 0.048579949885606766, "rewards/answer_entity_reward": 0.9870130121707916, "rewards/answer_wer_reward": 0.8052773177623749, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979791641235352, "step": 155 }, { "completion_length": 211.96875, "epoch": 0.4992, "grad_norm": 2.762598991394043, "kl": 0.0498046875, "learning_rate": 8.0625e-07, "loss": 0.0005, "reward": 3.9201120138168335, "reward_std": 0.014579844661056995, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9228614568710327, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9972506165504456, "step": 156 }, { "completion_length": 206.59375, "epoch": 0.5024, "grad_norm": 1.9372365474700928, "kl": 0.0621337890625, "learning_rate": 8.05e-07, "loss": 0.0006, "reward": 3.5673259496688843, "reward_std": 0.028257974423468113, "rewards/answer_entity_reward": 0.9902146458625793, "rewards/answer_wer_reward": 0.758561909198761, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.818549245595932, "step": 157 }, { "completion_length": 213.15625, "epoch": 0.5056, "grad_norm": 2.594701051712036, "kl": 0.08203125, "learning_rate": 8.037499999999999e-07, "loss": 0.0008, "reward": 3.8647842407226562, "reward_std": 0.029484061524271965, "rewards/answer_entity_reward": 0.9847756624221802, "rewards/answer_wer_reward": 0.8839923739433289, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9960162043571472, "step": 158 }, { "completion_length": 196.34375, "epoch": 0.5088, "grad_norm": 3.0164191722869873, "kl": 0.0526123046875, "learning_rate": 8.024999999999999e-07, "loss": 0.0005, "reward": 3.8759838342666626, "reward_std": 0.04202751815319061, "rewards/answer_entity_reward": 0.9789772629737854, "rewards/answer_wer_reward": 0.9108568131923676, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9861496686935425, "step": 159 }, { "completion_length": 198.0, "epoch": 0.512, "grad_norm": 5.223659515380859, "kl": 0.07177734375, "learning_rate": 8.0125e-07, "loss": 0.0007, "reward": 3.8265939950942993, "reward_std": 0.04291579592972994, "rewards/answer_entity_reward": 0.9890734255313873, "rewards/answer_wer_reward": 0.8892558217048645, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9482647478580475, "step": 160 }, { "completion_length": 184.0, "epoch": 0.5152, "grad_norm": 2.4279987812042236, "kl": 0.0914306640625, "learning_rate": 8e-07, "loss": 0.0009, "reward": 3.8738738298416138, "reward_std": 0.049739884212613106, "rewards/answer_entity_reward": 0.9671474397182465, "rewards/answer_wer_reward": 0.9240702688694, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9826560020446777, "step": 161 }, { "completion_length": 250.96875, "epoch": 0.5184, "grad_norm": 1.4533754587173462, "kl": 0.047607421875, "learning_rate": 7.9875e-07, "loss": 0.0005, "reward": 3.9009437561035156, "reward_std": 0.03131024446338415, "rewards/answer_entity_reward": 0.9899475276470184, "rewards/answer_wer_reward": 0.9109963178634644, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 162 }, { "completion_length": 206.5, "epoch": 0.5216, "grad_norm": 10.05416202545166, "kl": 0.1258544921875, "learning_rate": 7.975e-07, "loss": 0.0013, "reward": 3.6952139139175415, "reward_std": 0.08068067952990532, "rewards/answer_entity_reward": 0.9906517267227173, "rewards/answer_wer_reward": 0.9191368222236633, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.785425454378128, "step": 163 }, { "completion_length": 207.71875, "epoch": 0.5248, "grad_norm": 5.6498823165893555, "kl": 0.0572509765625, "learning_rate": 7.9625e-07, "loss": 0.0006, "reward": 3.862972855567932, "reward_std": 0.05051150266081095, "rewards/answer_entity_reward": 0.9871794879436493, "rewards/answer_wer_reward": 0.8966725766658783, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9791209697723389, "step": 164 }, { "completion_length": 231.8125, "epoch": 0.528, "grad_norm": 2.2680246829986572, "kl": 0.0731201171875, "learning_rate": 7.95e-07, "loss": 0.0007, "reward": 3.845450758934021, "reward_std": 0.04592973738908768, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.8566094040870667, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984567761421204, "step": 165 }, { "completion_length": 218.0, "epoch": 0.5312, "grad_norm": 1.194057583808899, "kl": 0.046630859375, "learning_rate": 7.937499999999999e-07, "loss": 0.0005, "reward": 3.900430679321289, "reward_std": 0.01787347625941038, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.907353401184082, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9930772483348846, "step": 166 }, { "completion_length": 212.25, "epoch": 0.5344, "grad_norm": 1.999778389930725, "kl": 0.07568359375, "learning_rate": 7.924999999999999e-07, "loss": 0.0008, "reward": 3.885169267654419, "reward_std": 0.01909848116338253, "rewards/answer_entity_reward": 0.9869123697280884, "rewards/answer_wer_reward": 0.8992542028427124, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990026652812958, "step": 167 }, { "completion_length": 222.65625, "epoch": 0.5376, "grad_norm": 1.8001956939697266, "kl": 0.03814697265625, "learning_rate": 7.912499999999999e-07, "loss": 0.0004, "reward": 3.8382192850112915, "reward_std": 0.12780769122764468, "rewards/answer_entity_reward": 0.9684826135635376, "rewards/answer_wer_reward": 0.8702490329742432, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994876980781555, "step": 168 }, { "completion_length": 181.28125, "epoch": 0.5408, "grad_norm": 1.3718982934951782, "kl": 0.072509765625, "learning_rate": 7.9e-07, "loss": 0.0007, "reward": 3.743025064468384, "reward_std": 0.02209018263965845, "rewards/answer_entity_reward": 0.9875437021255493, "rewards/answer_wer_reward": 0.8102038502693176, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9452773928642273, "step": 169 }, { "completion_length": 231.3125, "epoch": 0.544, "grad_norm": 3.8252196311950684, "kl": 0.087890625, "learning_rate": 7.8875e-07, "loss": 0.0009, "reward": 3.855069398880005, "reward_std": 0.12723926454782486, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8895151615142822, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9655542373657227, "step": 170 }, { "completion_length": 246.6875, "epoch": 0.5472, "grad_norm": 1.4238818883895874, "kl": 0.089599609375, "learning_rate": 7.875e-07, "loss": 0.0009, "reward": 3.8392333984375, "reward_std": 0.055684901773929596, "rewards/answer_entity_reward": 0.9753443002700806, "rewards/answer_wer_reward": 0.866324782371521, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975642561912537, "step": 171 }, { "completion_length": 239.09375, "epoch": 0.5504, "grad_norm": 2.5418362617492676, "kl": 0.07421875, "learning_rate": 7.8625e-07, "loss": 0.0007, "reward": 3.7556768655776978, "reward_std": 0.026184914633631706, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.8859277367591858, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8721528947353363, "step": 172 }, { "completion_length": 197.53125, "epoch": 0.5536, "grad_norm": 2.2901041507720947, "kl": 0.0523681640625, "learning_rate": 7.85e-07, "loss": 0.0005, "reward": 3.7119585275650024, "reward_std": 0.14428242854773998, "rewards/answer_entity_reward": 0.8789682686328888, "rewards/answer_wer_reward": 0.8524789810180664, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9805113673210144, "step": 173 }, { "completion_length": 271.65625, "epoch": 0.5568, "grad_norm": 1.5335708856582642, "kl": 0.048095703125, "learning_rate": 7.837499999999999e-07, "loss": 0.0005, "reward": 3.8789494037628174, "reward_std": 0.03688232973217964, "rewards/answer_entity_reward": 0.9724817276000977, "rewards/answer_wer_reward": 0.9107584953308105, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9957091808319092, "step": 174 }, { "completion_length": 197.40625, "epoch": 0.56, "grad_norm": 2.6528756618499756, "kl": 0.074462890625, "learning_rate": 7.824999999999999e-07, "loss": 0.0007, "reward": 3.857820510864258, "reward_std": 0.03826703131198883, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.8975639641284943, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9665697515010834, "step": 175 }, { "completion_length": 200.15625, "epoch": 0.5632, "grad_norm": 5.963916301727295, "kl": 0.054443359375, "learning_rate": 7.812499999999999e-07, "loss": 0.0005, "reward": 3.864750027656555, "reward_std": 0.028456556610763073, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9234411716461182, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9447809159755707, "step": 176 }, { "completion_length": 220.25, "epoch": 0.5664, "grad_norm": 1.086248517036438, "kl": 0.07421875, "learning_rate": 7.799999999999999e-07, "loss": 0.0007, "reward": 3.85122811794281, "reward_std": 0.02548269461840391, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.9126598238945007, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9444444477558136, "step": 177 }, { "completion_length": 235.6875, "epoch": 0.5696, "grad_norm": 3.8478362560272217, "kl": 0.080810546875, "learning_rate": 7.787500000000001e-07, "loss": 0.0008, "reward": 3.8555444478988647, "reward_std": 0.03297184593975544, "rewards/answer_entity_reward": 0.991346150636673, "rewards/answer_wer_reward": 0.8777507543563843, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9864475131034851, "step": 178 }, { "completion_length": 210.6875, "epoch": 0.5728, "grad_norm": 2.200871706008911, "kl": 0.096923828125, "learning_rate": 7.775e-07, "loss": 0.001, "reward": 3.8970987796783447, "reward_std": 0.029029657132923603, "rewards/answer_entity_reward": 0.9676088094711304, "rewards/answer_wer_reward": 0.9392231702804565, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9902668297290802, "step": 179 }, { "completion_length": 202.375, "epoch": 0.576, "grad_norm": 3.42965030670166, "kl": 0.080078125, "learning_rate": 7.7625e-07, "loss": 0.0008, "reward": 3.7469061613082886, "reward_std": 0.08900729566812515, "rewards/answer_entity_reward": 0.9832702279090881, "rewards/answer_wer_reward": 0.8798384070396423, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8837975263595581, "step": 180 }, { "completion_length": 215.875, "epoch": 0.5792, "grad_norm": 2.5457639694213867, "kl": 0.0595703125, "learning_rate": 7.75e-07, "loss": 0.0006, "reward": 3.8780597448349, "reward_std": 0.04192608781158924, "rewards/answer_entity_reward": 0.9845328330993652, "rewards/answer_wer_reward": 0.89576256275177, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977642893791199, "step": 181 }, { "completion_length": 203.875, "epoch": 0.5824, "grad_norm": 1.3624567985534668, "kl": 0.07177734375, "learning_rate": 7.7375e-07, "loss": 0.0007, "reward": 3.8805158138275146, "reward_std": 0.016396815422922373, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9136685729026794, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9692510068416595, "step": 182 }, { "completion_length": 215.90625, "epoch": 0.5856, "grad_norm": 1.270873785018921, "kl": 0.0543212890625, "learning_rate": 7.724999999999999e-07, "loss": 0.0005, "reward": 3.8749226331710815, "reward_std": 0.020629468373954296, "rewards/answer_entity_reward": 0.985921710729599, "rewards/answer_wer_reward": 0.8920559883117676, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969449043273926, "step": 183 }, { "completion_length": 230.34375, "epoch": 0.5888, "grad_norm": 5.295412063598633, "kl": 0.0489501953125, "learning_rate": 7.712499999999999e-07, "loss": 0.0005, "reward": 3.8914437294006348, "reward_std": 0.053787765093147755, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9137877225875854, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9811282753944397, "step": 184 }, { "completion_length": 238.03125, "epoch": 0.592, "grad_norm": 3.6382017135620117, "kl": 0.05126953125, "learning_rate": 7.699999999999999e-07, "loss": 0.0005, "reward": 3.80574893951416, "reward_std": 0.031003179028630257, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.8504349291324615, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9594806730747223, "step": 185 }, { "completion_length": 133.46875, "epoch": 0.5952, "grad_norm": 5.556273937225342, "kl": 0.06884765625, "learning_rate": 7.6875e-07, "loss": 0.0007, "reward": 3.875786066055298, "reward_std": 0.014059089124202728, "rewards/answer_entity_reward": 0.9772727489471436, "rewards/answer_wer_reward": 0.9379938840866089, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9605194628238678, "step": 186 }, { "completion_length": 233.96875, "epoch": 0.5984, "grad_norm": 1.1566299200057983, "kl": 0.0654296875, "learning_rate": 7.675e-07, "loss": 0.0007, "reward": 3.8272093534469604, "reward_std": 0.056231189519166946, "rewards/answer_entity_reward": 0.9821289777755737, "rewards/answer_wer_reward": 0.87700355052948, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9680766761302948, "step": 187 }, { "completion_length": 223.21875, "epoch": 0.6016, "grad_norm": 1.125300407409668, "kl": 0.0433349609375, "learning_rate": 7.6625e-07, "loss": 0.0004, "reward": 3.9091583490371704, "reward_std": 0.019687645137310028, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.917988508939743, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9946419894695282, "step": 188 }, { "completion_length": 213.03125, "epoch": 0.6048, "grad_norm": 1.806405782699585, "kl": 0.05859375, "learning_rate": 7.65e-07, "loss": 0.0006, "reward": 3.9139894247055054, "reward_std": 0.01741368416696787, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.916355162858963, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9976341724395752, "step": 189 }, { "completion_length": 246.6875, "epoch": 0.608, "grad_norm": 2.158470630645752, "kl": 0.05224609375, "learning_rate": 7.6375e-07, "loss": 0.0005, "reward": 3.9092923402786255, "reward_std": 0.019907254725694656, "rewards/answer_entity_reward": 0.9944444596767426, "rewards/answer_wer_reward": 0.9189584851264954, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9958893954753876, "step": 190 }, { "completion_length": 197.71875, "epoch": 0.6112, "grad_norm": 0.8463873863220215, "kl": 0.0526123046875, "learning_rate": 7.624999999999999e-07, "loss": 0.0005, "reward": 3.7934869527816772, "reward_std": 0.010684152133762836, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.8017330169677734, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989654421806335, "step": 191 }, { "completion_length": 253.03125, "epoch": 0.6144, "grad_norm": 0.95602947473526, "kl": 0.0577392578125, "learning_rate": 7.612499999999999e-07, "loss": 0.0006, "reward": 3.8714359998703003, "reward_std": 0.03730391897261143, "rewards/answer_entity_reward": 0.9679293036460876, "rewards/answer_wer_reward": 0.9067506790161133, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9967561364173889, "step": 192 }, { "completion_length": 260.875, "epoch": 0.6176, "grad_norm": 1.752991795539856, "kl": 0.1259765625, "learning_rate": 7.599999999999999e-07, "loss": 0.0013, "reward": 3.847132444381714, "reward_std": 0.03724599629640579, "rewards/answer_entity_reward": 0.9814560413360596, "rewards/answer_wer_reward": 0.877534031867981, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9881424605846405, "step": 193 }, { "completion_length": 224.8125, "epoch": 0.6208, "grad_norm": 5.3836283683776855, "kl": 0.0616455078125, "learning_rate": 7.5875e-07, "loss": 0.0006, "reward": 3.838170886039734, "reward_std": 0.043032409623265266, "rewards/answer_entity_reward": 0.9778589308261871, "rewards/answer_wer_reward": 0.8835411667823792, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9767708480358124, "step": 194 }, { "completion_length": 234.0, "epoch": 0.624, "grad_norm": 1.4531170129776, "kl": 0.082763671875, "learning_rate": 7.575e-07, "loss": 0.0008, "reward": 3.8195607662200928, "reward_std": 0.06634793058037758, "rewards/answer_entity_reward": 0.9759862422943115, "rewards/answer_wer_reward": 0.8854676187038422, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9581069052219391, "step": 195 }, { "completion_length": 228.875, "epoch": 0.6272, "grad_norm": 1.215409278869629, "kl": 0.0653076171875, "learning_rate": 7.5625e-07, "loss": 0.0006, "reward": 3.869178295135498, "reward_std": 0.018243765458464622, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9173910617828369, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9603100121021271, "step": 196 }, { "completion_length": 233.40625, "epoch": 0.6304, "grad_norm": 1.5224462747573853, "kl": 0.0479736328125, "learning_rate": 7.55e-07, "loss": 0.0005, "reward": 3.880965232849121, "reward_std": 0.030376747716218233, "rewards/answer_entity_reward": 0.9812500178813934, "rewards/answer_wer_reward": 0.903846025466919, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9958691298961639, "step": 197 }, { "completion_length": 159.75, "epoch": 0.6336, "grad_norm": 2.0013957023620605, "kl": 0.072021484375, "learning_rate": 7.5375e-07, "loss": 0.0007, "reward": 3.8514485359191895, "reward_std": 0.021021784283220768, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9317480027675629, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9282233119010925, "step": 198 }, { "completion_length": 200.125, "epoch": 0.6368, "grad_norm": 7.399294853210449, "kl": 0.0662841796875, "learning_rate": 7.524999999999999e-07, "loss": 0.0007, "reward": 3.9170095920562744, "reward_std": 0.03030287381261587, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.955333948135376, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9616756439208984, "step": 199 }, { "completion_length": 228.75, "epoch": 0.64, "grad_norm": 1.6671867370605469, "kl": 0.13623046875, "learning_rate": 7.512499999999999e-07, "loss": 0.0014, "reward": 3.848036050796509, "reward_std": 0.14389772480353713, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9240660667419434, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9665836989879608, "step": 200 }, { "completion_length": 209.90625, "epoch": 0.6432, "grad_norm": 1.2796622514724731, "kl": 0.05029296875, "learning_rate": 7.5e-07, "loss": 0.0005, "reward": 3.856316566467285, "reward_std": 0.025415225885808468, "rewards/answer_entity_reward": 0.9718458652496338, "rewards/answer_wer_reward": 0.8857261538505554, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987446367740631, "step": 201 }, { "completion_length": 203.1875, "epoch": 0.6464, "grad_norm": 6.9469380378723145, "kl": 0.05810546875, "learning_rate": 7.4875e-07, "loss": 0.0006, "reward": 3.7580385208129883, "reward_std": 0.0333370678126812, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.8357867002487183, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9279336631298065, "step": 202 }, { "completion_length": 211.5, "epoch": 0.6496, "grad_norm": 2.437093496322632, "kl": 0.0400390625, "learning_rate": 7.475e-07, "loss": 0.0004, "reward": 3.888434052467346, "reward_std": 0.04942548694089055, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.901074230670929, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977766275405884, "step": 203 }, { "completion_length": 220.3125, "epoch": 0.6528, "grad_norm": 9.914649963378906, "kl": 0.054443359375, "learning_rate": 7.4625e-07, "loss": 0.0005, "reward": 3.9074004888534546, "reward_std": 0.022341615986078978, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.924115002155304, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9861262738704681, "step": 204 }, { "completion_length": 190.28125, "epoch": 0.656, "grad_norm": 10.771315574645996, "kl": 0.0731201171875, "learning_rate": 7.45e-07, "loss": 0.0007, "reward": 3.8562848567962646, "reward_std": 0.05522243678569794, "rewards/answer_entity_reward": 0.9873949587345123, "rewards/answer_wer_reward": 0.9283336997032166, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9405562579631805, "step": 205 }, { "completion_length": 254.375, "epoch": 0.6592, "grad_norm": 1.2101417779922485, "kl": 0.054443359375, "learning_rate": 7.4375e-07, "loss": 0.0005, "reward": 3.9058661460876465, "reward_std": 0.015844878274947405, "rewards/answer_entity_reward": 0.9788995683193207, "rewards/answer_wer_reward": 0.9304846525192261, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9964818954467773, "step": 206 }, { "completion_length": 202.96875, "epoch": 0.6624, "grad_norm": 3.355869770050049, "kl": 0.0572509765625, "learning_rate": 7.425e-07, "loss": 0.0006, "reward": 3.8065719604492188, "reward_std": 0.19051394425332546, "rewards/answer_entity_reward": 0.9650735259056091, "rewards/answer_wer_reward": 0.8801510035991669, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9925974309444427, "step": 207 }, { "completion_length": 226.1875, "epoch": 0.6656, "grad_norm": 1.7292360067367554, "kl": 0.104248046875, "learning_rate": 7.412499999999999e-07, "loss": 0.001, "reward": 3.8113776445388794, "reward_std": 0.02462965715676546, "rewards/answer_entity_reward": 0.9770916700363159, "rewards/answer_wer_reward": 0.864607959985733, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9696778357028961, "step": 208 }, { "completion_length": 198.75, "epoch": 0.6688, "grad_norm": 4.215091705322266, "kl": 0.06640625, "learning_rate": 7.4e-07, "loss": 0.0007, "reward": 3.8144696950912476, "reward_std": 0.025187399238348007, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9298737645149231, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8845959007740021, "step": 209 }, { "completion_length": 200.1875, "epoch": 0.672, "grad_norm": 1.537361979484558, "kl": 0.049560546875, "learning_rate": 7.3875e-07, "loss": 0.0005, "reward": 3.9332594871520996, "reward_std": 0.011271146591752768, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.951434314250946, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9818252325057983, "step": 210 }, { "completion_length": 190.78125, "epoch": 0.6752, "grad_norm": 2.9701907634735107, "kl": 0.0654296875, "learning_rate": 7.375e-07, "loss": 0.0007, "reward": 3.8168801069259644, "reward_std": 0.024646650068461895, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9553571939468384, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8700454831123352, "step": 211 }, { "completion_length": 157.59375, "epoch": 0.6784, "grad_norm": 3.1656010150909424, "kl": 0.0611572265625, "learning_rate": 7.362499999999999e-07, "loss": 0.0006, "reward": 3.8838521242141724, "reward_std": 0.0407260712236166, "rewards/answer_entity_reward": 0.9767543971538544, "rewards/answer_wer_reward": 0.944227010011673, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9628707766532898, "step": 212 }, { "completion_length": 238.1875, "epoch": 0.6816, "grad_norm": 2.614816665649414, "kl": 0.0947265625, "learning_rate": 7.35e-07, "loss": 0.0009, "reward": 3.8542829751968384, "reward_std": 0.03231436479836702, "rewards/answer_entity_reward": 0.974577009677887, "rewards/answer_wer_reward": 0.8831658661365509, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9965401291847229, "step": 213 }, { "completion_length": 255.0, "epoch": 0.6848, "grad_norm": 1.8072490692138672, "kl": 0.048828125, "learning_rate": 7.3375e-07, "loss": 0.0005, "reward": 3.9139556884765625, "reward_std": 0.013969901017844677, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9155895113945007, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983660876750946, "step": 214 }, { "completion_length": 163.96875, "epoch": 0.688, "grad_norm": 3.6364543437957764, "kl": 0.082763671875, "learning_rate": 7.325e-07, "loss": 0.0008, "reward": 3.8950713872909546, "reward_std": 0.030674483627080917, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9427915513515472, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.959224134683609, "step": 215 }, { "completion_length": 211.90625, "epoch": 0.6912, "grad_norm": 1.4036628007888794, "kl": 0.0504150390625, "learning_rate": 7.312499999999999e-07, "loss": 0.0005, "reward": 3.90190052986145, "reward_std": 0.028614184819161892, "rewards/answer_entity_reward": 0.9636363685131073, "rewards/answer_wer_reward": 0.9445142149925232, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9937500059604645, "step": 216 }, { "completion_length": 226.875, "epoch": 0.6944, "grad_norm": 1.5664644241333008, "kl": 0.051025390625, "learning_rate": 7.3e-07, "loss": 0.0005, "reward": 3.9051342010498047, "reward_std": 0.023595476523041725, "rewards/answer_entity_reward": 0.994463324546814, "rewards/answer_wer_reward": 0.9128024578094482, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978685975074768, "step": 217 }, { "completion_length": 211.96875, "epoch": 0.6976, "grad_norm": 3.6565327644348145, "kl": 0.0567626953125, "learning_rate": 7.2875e-07, "loss": 0.0006, "reward": 3.920815348625183, "reward_std": 0.026728018186986446, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.9297977983951569, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973307251930237, "step": 218 }, { "completion_length": 226.90625, "epoch": 0.7008, "grad_norm": 5.147249221801758, "kl": 0.142333984375, "learning_rate": 7.275e-07, "loss": 0.0014, "reward": 3.887997627258301, "reward_std": 0.017563311383128166, "rewards/answer_entity_reward": 0.9923513829708099, "rewards/answer_wer_reward": 0.8966234028339386, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999022901058197, "step": 219 }, { "completion_length": 196.75, "epoch": 0.704, "grad_norm": 4.334951400756836, "kl": 0.07958984375, "learning_rate": 7.262499999999999e-07, "loss": 0.0008, "reward": 3.919954776763916, "reward_std": 0.020561310462653637, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.922648161649704, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997106492519379, "step": 220 }, { "completion_length": 208.15625, "epoch": 0.7072, "grad_norm": 4.896883964538574, "kl": 0.072509765625, "learning_rate": 7.249999999999999e-07, "loss": 0.0007, "reward": 3.8171916007995605, "reward_std": 0.044522007927298546, "rewards/answer_entity_reward": 0.9767857491970062, "rewards/answer_wer_reward": 0.9031675159931183, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9372382760047913, "step": 221 }, { "completion_length": 197.71875, "epoch": 0.7104, "grad_norm": 1.9743766784667969, "kl": 0.041259765625, "learning_rate": 7.2375e-07, "loss": 0.0004, "reward": 3.9599783420562744, "reward_std": 0.008235257118940353, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9602223634719849, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999755859375, "step": 222 }, { "completion_length": 181.84375, "epoch": 0.7136, "grad_norm": 6.57908296585083, "kl": 0.07421875, "learning_rate": 7.225e-07, "loss": 0.0007, "reward": 3.826643943786621, "reward_std": 0.06298277154564857, "rewards/answer_entity_reward": 0.9833333194255829, "rewards/answer_wer_reward": 0.9450017511844635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8983088135719299, "step": 223 }, { "completion_length": 177.84375, "epoch": 0.7168, "grad_norm": 13.744032859802246, "kl": 0.078369140625, "learning_rate": 7.212499999999999e-07, "loss": 0.0008, "reward": 3.852834939956665, "reward_std": 0.044052885845303535, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9374657571315765, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9177731275558472, "step": 224 }, { "completion_length": 249.3125, "epoch": 0.72, "grad_norm": 1.7395777702331543, "kl": 0.05712890625, "learning_rate": 7.2e-07, "loss": 0.0006, "reward": 3.8659743070602417, "reward_std": 0.03202287387102842, "rewards/answer_entity_reward": 0.9767628312110901, "rewards/answer_wer_reward": 0.8964802920818329, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9927313327789307, "step": 225 }, { "completion_length": 246.75, "epoch": 0.7232, "grad_norm": 1.1522554159164429, "kl": 0.05419921875, "learning_rate": 7.1875e-07, "loss": 0.0005, "reward": 3.868378758430481, "reward_std": 0.02125831786543131, "rewards/answer_entity_reward": 0.9791666567325592, "rewards/answer_wer_reward": 0.8927575647830963, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9964545369148254, "step": 226 }, { "completion_length": 213.6875, "epoch": 0.7264, "grad_norm": 1.6328908205032349, "kl": 0.0452880859375, "learning_rate": 7.175e-07, "loss": 0.0004, "reward": 3.9461253881454468, "reward_std": 0.017373798182234168, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9516011476516724, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989885687828064, "step": 227 }, { "completion_length": 180.25, "epoch": 0.7296, "grad_norm": 1.6245373487472534, "kl": 0.0810546875, "learning_rate": 7.1625e-07, "loss": 0.0008, "reward": 3.92253839969635, "reward_std": 0.009518959443084896, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9421058893203735, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9832733571529388, "step": 228 }, { "completion_length": 211.46875, "epoch": 0.7328, "grad_norm": 2.3507907390594482, "kl": 0.080078125, "learning_rate": 7.149999999999999e-07, "loss": 0.0008, "reward": 3.9085057973861694, "reward_std": 0.011625304818153381, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.91986945271492, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 229 }, { "completion_length": 189.78125, "epoch": 0.736, "grad_norm": 2.801975965499878, "kl": 0.068603515625, "learning_rate": 7.137499999999999e-07, "loss": 0.0007, "reward": 3.849338173866272, "reward_std": 0.04476720932871103, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9499310851097107, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.90181103348732, "step": 230 }, { "completion_length": 232.46875, "epoch": 0.7392, "grad_norm": 18.121028900146484, "kl": 0.065673828125, "learning_rate": 7.125e-07, "loss": 0.0007, "reward": 3.8422099351882935, "reward_std": 0.05234749615192413, "rewards/answer_entity_reward": 0.9829545617103577, "rewards/answer_wer_reward": 0.8842452466487885, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9750101864337921, "step": 231 }, { "completion_length": 230.90625, "epoch": 0.7424, "grad_norm": 1.374346375465393, "kl": 0.0440673828125, "learning_rate": 7.1125e-07, "loss": 0.0004, "reward": 3.9123170375823975, "reward_std": 0.025476250797510147, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9220384955406189, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9972230195999146, "step": 232 }, { "completion_length": 197.5625, "epoch": 0.7456, "grad_norm": 3.1081960201263428, "kl": 0.067138671875, "learning_rate": 7.1e-07, "loss": 0.0007, "reward": 3.921274781227112, "reward_std": 0.04291347204707563, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9490483999252319, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9722263813018799, "step": 233 }, { "completion_length": 198.90625, "epoch": 0.7488, "grad_norm": 2.3603627681732178, "kl": 0.0550537109375, "learning_rate": 7.0875e-07, "loss": 0.0005, "reward": 3.9125137329101562, "reward_std": 0.03855661302804947, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9429784715175629, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9747799634933472, "step": 234 }, { "completion_length": 220.71875, "epoch": 0.752, "grad_norm": 3.3247504234313965, "kl": 0.070068359375, "learning_rate": 7.075e-07, "loss": 0.0007, "reward": 3.877889394760132, "reward_std": 0.03429079055786133, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9119226932525635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9659668207168579, "step": 235 }, { "completion_length": 194.21875, "epoch": 0.7552, "grad_norm": 5.20084810256958, "kl": 0.067626953125, "learning_rate": 7.0625e-07, "loss": 0.0007, "reward": 3.918747305870056, "reward_std": 0.03475894033908844, "rewards/answer_entity_reward": 0.9929924309253693, "rewards/answer_wer_reward": 0.9448626041412354, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.980892151594162, "step": 236 }, { "completion_length": 222.53125, "epoch": 0.7584, "grad_norm": 3.0105435848236084, "kl": 0.07421875, "learning_rate": 7.049999999999999e-07, "loss": 0.0007, "reward": 3.9244236946105957, "reward_std": 0.010058181826025248, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9545913934707642, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9783551096916199, "step": 237 }, { "completion_length": 222.5625, "epoch": 0.7616, "grad_norm": 4.065408229827881, "kl": 0.1181640625, "learning_rate": 7.037499999999999e-07, "loss": 0.0012, "reward": 3.873254418373108, "reward_std": 0.0757724829018116, "rewards/answer_entity_reward": 0.9845328330993652, "rewards/answer_wer_reward": 0.936627209186554, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9520943462848663, "step": 238 }, { "completion_length": 184.21875, "epoch": 0.7648, "grad_norm": 1.1628284454345703, "kl": 0.0579833984375, "learning_rate": 7.024999999999999e-07, "loss": 0.0006, "reward": 3.9432320594787598, "reward_std": 0.010221295058727264, "rewards/answer_entity_reward": 0.9905790388584137, "rewards/answer_wer_reward": 0.953954666852951, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998698353767395, "step": 239 }, { "completion_length": 233.90625, "epoch": 0.768, "grad_norm": 1.4767858982086182, "kl": 0.079345703125, "learning_rate": 7.0125e-07, "loss": 0.0008, "reward": 3.8955001831054688, "reward_std": 0.03214742988348007, "rewards/answer_entity_reward": 0.9854603707790375, "rewards/answer_wer_reward": 0.9112924933433533, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987473487854004, "step": 240 }, { "completion_length": 176.53125, "epoch": 0.7712, "grad_norm": 5.655521869659424, "kl": 0.0872802734375, "learning_rate": 7e-07, "loss": 0.0009, "reward": 3.8957866430282593, "reward_std": 0.02847579075023532, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9654708206653595, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9399311542510986, "step": 241 }, { "completion_length": 252.71875, "epoch": 0.7744, "grad_norm": 3.268174886703491, "kl": 0.073486328125, "learning_rate": 6.9875e-07, "loss": 0.0007, "reward": 3.8414435386657715, "reward_std": 0.08019998762756586, "rewards/answer_entity_reward": 0.9822989404201508, "rewards/answer_wer_reward": 0.8909429609775543, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.968201756477356, "step": 242 }, { "completion_length": 234.875, "epoch": 0.7776, "grad_norm": 3.445681571960449, "kl": 0.15869140625, "learning_rate": 6.975e-07, "loss": 0.0016, "reward": 3.856196165084839, "reward_std": 0.0546736940741539, "rewards/answer_entity_reward": 0.9822468161582947, "rewards/answer_wer_reward": 0.9020899534225464, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9718593955039978, "step": 243 }, { "completion_length": 186.375, "epoch": 0.7808, "grad_norm": 3.4756290912628174, "kl": 0.109130859375, "learning_rate": 6.9625e-07, "loss": 0.0011, "reward": 3.878678798675537, "reward_std": 0.014406855218112469, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9121991693973541, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9693206548690796, "step": 244 }, { "completion_length": 224.46875, "epoch": 0.784, "grad_norm": 2.4778082370758057, "kl": 0.0618896484375, "learning_rate": 6.949999999999999e-07, "loss": 0.0006, "reward": 3.890427350997925, "reward_std": 0.013088527135550976, "rewards/answer_entity_reward": 0.9849699139595032, "rewards/answer_wer_reward": 0.9565823972225189, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9488749206066132, "step": 245 }, { "completion_length": 220.0625, "epoch": 0.7872, "grad_norm": 1.7784525156021118, "kl": 0.0592041015625, "learning_rate": 6.937499999999999e-07, "loss": 0.0006, "reward": 3.9208799600601196, "reward_std": 0.013537504710257053, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9380317628383636, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9924636483192444, "step": 246 }, { "completion_length": 215.03125, "epoch": 0.7904, "grad_norm": 1.7845004796981812, "kl": 0.087158203125, "learning_rate": 6.924999999999999e-07, "loss": 0.0009, "reward": 3.874635100364685, "reward_std": 0.047601671889424324, "rewards/answer_entity_reward": 0.9777146875858307, "rewards/answer_wer_reward": 0.9114454984664917, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9854750335216522, "step": 247 }, { "completion_length": 237.0, "epoch": 0.7936, "grad_norm": 1.9031370878219604, "kl": 0.0665283203125, "learning_rate": 6.9125e-07, "loss": 0.0007, "reward": 3.8799991607666016, "reward_std": 0.040791427716612816, "rewards/answer_entity_reward": 0.9725233018398285, "rewards/answer_wer_reward": 0.9113976061344147, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9960784018039703, "step": 248 }, { "completion_length": 247.625, "epoch": 0.7968, "grad_norm": 6.799812316894531, "kl": 0.5244140625, "learning_rate": 6.9e-07, "loss": 0.0052, "reward": 3.9148751497268677, "reward_std": 0.012524784076958895, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9155747294425964, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993003606796265, "step": 249 }, { "completion_length": 202.78125, "epoch": 0.8, "grad_norm": 2.9497642517089844, "kl": 0.108642578125, "learning_rate": 6.8875e-07, "loss": 0.0011, "reward": 3.88541841506958, "reward_std": 0.05846460163593292, "rewards/answer_entity_reward": 0.9898538887500763, "rewards/answer_wer_reward": 0.9265855848789215, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.968979001045227, "step": 250 }, { "completion_length": 230.03125, "epoch": 0.8032, "grad_norm": 3.021209478378296, "kl": 0.064453125, "learning_rate": 6.875e-07, "loss": 0.0006, "reward": 3.9006909132003784, "reward_std": 0.02151984628289938, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9085462689399719, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9956169128417969, "step": 251 }, { "completion_length": 202.0625, "epoch": 0.8064, "grad_norm": 3.288858413696289, "kl": 0.0810546875, "learning_rate": 6.8625e-07, "loss": 0.0008, "reward": 3.9228957891464233, "reward_std": 0.012390648480504751, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9330424964427948, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9898532032966614, "step": 252 }, { "completion_length": 202.84375, "epoch": 0.8096, "grad_norm": 2.384650468826294, "kl": 0.084228515625, "learning_rate": 6.85e-07, "loss": 0.0009, "reward": 3.8598722219467163, "reward_std": 0.03435686323791742, "rewards/answer_entity_reward": 0.9775519669055939, "rewards/answer_wer_reward": 0.9159774780273438, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9663428068161011, "step": 253 }, { "completion_length": 235.15625, "epoch": 0.8128, "grad_norm": 3.9519598484039307, "kl": 0.061767578125, "learning_rate": 6.837499999999999e-07, "loss": 0.0006, "reward": 3.8161985874176025, "reward_std": 0.06573762744665146, "rewards/answer_entity_reward": 0.9905131459236145, "rewards/answer_wer_reward": 0.8475149571895599, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9781705141067505, "step": 254 }, { "completion_length": 241.125, "epoch": 0.816, "grad_norm": 3.464174509048462, "kl": 0.077392578125, "learning_rate": 6.824999999999999e-07, "loss": 0.0008, "reward": 3.894362449645996, "reward_std": 0.025215300731360912, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.9064654111862183, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998313695192337, "step": 255 }, { "completion_length": 177.59375, "epoch": 0.8192, "grad_norm": 1.5625709295272827, "kl": 0.0986328125, "learning_rate": 6.8125e-07, "loss": 0.001, "reward": 3.9517083168029785, "reward_std": 0.01383261731825769, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9637933671474457, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9927226006984711, "step": 256 }, { "completion_length": 191.625, "epoch": 0.8224, "grad_norm": 1.4757704734802246, "kl": 0.0791015625, "learning_rate": 6.800000000000001e-07, "loss": 0.0008, "reward": 3.8987783193588257, "reward_std": 0.016407988965511322, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9312387406826019, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9699434041976929, "step": 257 }, { "completion_length": 191.65625, "epoch": 0.8256, "grad_norm": 3.355372428894043, "kl": 0.09033203125, "learning_rate": 6.7875e-07, "loss": 0.0009, "reward": 3.9129350185394287, "reward_std": 0.015536424703896046, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9205312728881836, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979183673858643, "step": 258 }, { "completion_length": 188.65625, "epoch": 0.8288, "grad_norm": 1.917312741279602, "kl": 0.086669921875, "learning_rate": 6.775e-07, "loss": 0.0009, "reward": 3.918121814727783, "reward_std": 0.0268348827958107, "rewards/answer_entity_reward": 0.9890183508396149, "rewards/answer_wer_reward": 0.9294547438621521, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996488690376282, "step": 259 }, { "completion_length": 234.0, "epoch": 0.832, "grad_norm": 1.334208369255066, "kl": 0.0635986328125, "learning_rate": 6.7625e-07, "loss": 0.0006, "reward": 3.924370527267456, "reward_std": 0.02556901052594185, "rewards/answer_entity_reward": 0.980710506439209, "rewards/answer_wer_reward": 0.9436598718166351, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 260 }, { "completion_length": 157.09375, "epoch": 0.8352, "grad_norm": 3.0484063625335693, "kl": 0.093017578125, "learning_rate": 6.75e-07, "loss": 0.0009, "reward": 3.928007483482361, "reward_std": 0.01636551646515727, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9610774517059326, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.969333827495575, "step": 261 }, { "completion_length": 222.6875, "epoch": 0.8384, "grad_norm": 1.5266326665878296, "kl": 0.110595703125, "learning_rate": 6.737499999999999e-07, "loss": 0.0011, "reward": 3.826764225959778, "reward_std": 0.014424358261749148, "rewards/answer_entity_reward": 0.875, "rewards/answer_wer_reward": 0.9528080821037292, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989560842514038, "step": 262 }, { "completion_length": 245.96875, "epoch": 0.8416, "grad_norm": 2.332728624343872, "kl": 0.0777587890625, "learning_rate": 6.724999999999999e-07, "loss": 0.0008, "reward": 3.84222412109375, "reward_std": 0.018232629168778658, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.8854961693286896, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9591319561004639, "step": 263 }, { "completion_length": 155.6875, "epoch": 0.8448, "grad_norm": 7.505854606628418, "kl": 0.101806640625, "learning_rate": 6.7125e-07, "loss": 0.001, "reward": 3.875036120414734, "reward_std": 0.07785245403647423, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9335145354270935, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9436048865318298, "step": 264 }, { "completion_length": 249.75, "epoch": 0.848, "grad_norm": 2.8738133907318115, "kl": 0.0516357421875, "learning_rate": 6.7e-07, "loss": 0.0005, "reward": 3.903374195098877, "reward_std": 0.014860059600323439, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9094418883323669, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987399280071259, "step": 265 }, { "completion_length": 199.34375, "epoch": 0.8512, "grad_norm": 8.186075210571289, "kl": 0.074462890625, "learning_rate": 6.6875e-07, "loss": 0.0007, "reward": 3.8564417362213135, "reward_std": 0.06331180594861507, "rewards/answer_entity_reward": 0.9917200803756714, "rewards/answer_wer_reward": 0.9368169605731964, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9279046654701233, "step": 266 }, { "completion_length": 207.75, "epoch": 0.8544, "grad_norm": 1.7668160200119019, "kl": 0.191650390625, "learning_rate": 6.675e-07, "loss": 0.0019, "reward": 3.791893243789673, "reward_std": 0.21384014189243317, "rewards/answer_entity_reward": 0.9642857313156128, "rewards/answer_wer_reward": 0.8976732790470123, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9611842036247253, "step": 267 }, { "completion_length": 232.78125, "epoch": 0.8576, "grad_norm": 3.357858180999756, "kl": 0.0655517578125, "learning_rate": 6.6625e-07, "loss": 0.0006, "reward": 3.849023461341858, "reward_std": 0.07564813643693924, "rewards/answer_entity_reward": 0.981249988079071, "rewards/answer_wer_reward": 0.9172319173812866, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9505417346954346, "step": 268 }, { "completion_length": 159.90625, "epoch": 0.8608, "grad_norm": 8.665388107299805, "kl": 0.083740234375, "learning_rate": 6.65e-07, "loss": 0.0008, "reward": 3.8619388341903687, "reward_std": 0.03842100687325001, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9294092357158661, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9369938969612122, "step": 269 }, { "completion_length": 199.09375, "epoch": 0.864, "grad_norm": 2.6412887573242188, "kl": 0.24951171875, "learning_rate": 6.637499999999999e-07, "loss": 0.0025, "reward": 3.92287015914917, "reward_std": 0.04514491464942694, "rewards/answer_entity_reward": 0.9867424070835114, "rewards/answer_wer_reward": 0.948787659406662, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9873401820659637, "step": 270 }, { "completion_length": 138.625, "epoch": 0.8672, "grad_norm": 5.494461536407471, "kl": 0.1064453125, "learning_rate": 6.624999999999999e-07, "loss": 0.0011, "reward": 3.80997896194458, "reward_std": 0.10453111864626408, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9460954964160919, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8638834655284882, "step": 271 }, { "completion_length": 207.0625, "epoch": 0.8704, "grad_norm": 6.705058574676514, "kl": 0.0904541015625, "learning_rate": 6.6125e-07, "loss": 0.0009, "reward": 3.918370246887207, "reward_std": 0.016086122021079063, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.94427290558815, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9765011072158813, "step": 272 }, { "completion_length": 193.46875, "epoch": 0.8736, "grad_norm": 3.6274845600128174, "kl": 0.16259765625, "learning_rate": 6.6e-07, "loss": 0.0016, "reward": 3.8420186042785645, "reward_std": 0.042743777856230736, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.88405841588974, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9579601883888245, "step": 273 }, { "completion_length": 238.03125, "epoch": 0.8768, "grad_norm": 39.40747833251953, "kl": 0.064453125, "learning_rate": 6.587499999999999e-07, "loss": 0.0006, "reward": 3.8922038078308105, "reward_std": 0.08438011445105076, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8997087776660919, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9924951493740082, "step": 274 }, { "completion_length": 215.03125, "epoch": 0.88, "grad_norm": 3.786466360092163, "kl": 0.073974609375, "learning_rate": 6.575e-07, "loss": 0.0007, "reward": 3.936691641807556, "reward_std": 0.013240452855825424, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9393938779830933, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993811845779419, "step": 275 }, { "completion_length": 171.84375, "epoch": 0.8832, "grad_norm": 6.402861595153809, "kl": 0.09619140625, "learning_rate": 6.5625e-07, "loss": 0.001, "reward": 3.8171043395996094, "reward_std": 0.07490862905979156, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9142147600650787, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.902889609336853, "step": 276 }, { "completion_length": 202.90625, "epoch": 0.8864, "grad_norm": 1.9027079343795776, "kl": 0.07958984375, "learning_rate": 6.55e-07, "loss": 0.0008, "reward": 3.910063624382019, "reward_std": 0.014503994956612587, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9100635945796967, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 277 }, { "completion_length": 194.90625, "epoch": 0.8896, "grad_norm": 3.430772304534912, "kl": 0.10107421875, "learning_rate": 6.5375e-07, "loss": 0.001, "reward": 3.9086241722106934, "reward_std": 0.011167994700372219, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9395906329154968, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9690335094928741, "step": 278 }, { "completion_length": 214.21875, "epoch": 0.8928, "grad_norm": 1.209375262260437, "kl": 0.07763671875, "learning_rate": 6.524999999999999e-07, "loss": 0.0008, "reward": 3.934818387031555, "reward_std": 0.013630851171910763, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9348185062408447, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 279 }, { "completion_length": 210.375, "epoch": 0.896, "grad_norm": 3.4542951583862305, "kl": 0.09619140625, "learning_rate": 6.5125e-07, "loss": 0.001, "reward": 3.8483023643493652, "reward_std": 0.022013184614479542, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9127626419067383, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.935539722442627, "step": 280 }, { "completion_length": 171.96875, "epoch": 0.8992, "grad_norm": 5.6723761558532715, "kl": 0.138671875, "learning_rate": 6.5e-07, "loss": 0.0014, "reward": 3.894706964492798, "reward_std": 0.01279338588938117, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9379555583000183, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9567514657974243, "step": 281 }, { "completion_length": 111.78125, "epoch": 0.9024, "grad_norm": 4.6447954177856445, "kl": 0.1376953125, "learning_rate": 6.4875e-07, "loss": 0.0014, "reward": 3.901338577270508, "reward_std": 0.019952512811869383, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.978780597448349, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9253990054130554, "step": 282 }, { "completion_length": 244.96875, "epoch": 0.9056, "grad_norm": 2.825244665145874, "kl": 0.0611572265625, "learning_rate": 6.474999999999999e-07, "loss": 0.0006, "reward": 3.9182543754577637, "reward_std": 0.02383749559521675, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9259287714958191, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995370507240295, "step": 283 }, { "completion_length": 218.09375, "epoch": 0.9088, "grad_norm": 2.9246108531951904, "kl": 0.0736083984375, "learning_rate": 6.4625e-07, "loss": 0.0007, "reward": 3.9247629642486572, "reward_std": 0.019582282286137342, "rewards/answer_entity_reward": 0.9866071343421936, "rewards/answer_wer_reward": 0.9388971030712128, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992588460445404, "step": 284 }, { "completion_length": 174.8125, "epoch": 0.912, "grad_norm": 1.4176238775253296, "kl": 0.115478515625, "learning_rate": 6.45e-07, "loss": 0.0012, "reward": 3.9359350204467773, "reward_std": 0.01886278996244073, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9649160206317902, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9731022417545319, "step": 285 }, { "completion_length": 152.40625, "epoch": 0.9152, "grad_norm": 4.273341178894043, "kl": 0.176025390625, "learning_rate": 6.4375e-07, "loss": 0.0018, "reward": 3.850113034248352, "reward_std": 0.07313014380633831, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9544805884361267, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8991046845912933, "step": 286 }, { "completion_length": 221.90625, "epoch": 0.9184, "grad_norm": 3.1975696086883545, "kl": 0.083984375, "learning_rate": 6.424999999999999e-07, "loss": 0.0008, "reward": 3.8276385068893433, "reward_std": 0.019742398988455534, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.8953758776187897, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9379445612430573, "step": 287 }, { "completion_length": 203.1875, "epoch": 0.9216, "grad_norm": 4.396200180053711, "kl": 0.1318359375, "learning_rate": 6.4125e-07, "loss": 0.0013, "reward": 3.9295929670333862, "reward_std": 0.022352089174091816, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9394927024841309, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973118305206299, "step": 288 }, { "completion_length": 188.34375, "epoch": 0.9248, "grad_norm": 23.72756004333496, "kl": 0.098388671875, "learning_rate": 6.4e-07, "loss": 0.001, "reward": 3.7452211380004883, "reward_std": 0.12425664439797401, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8480645418167114, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8971565663814545, "step": 289 }, { "completion_length": 233.5625, "epoch": 0.928, "grad_norm": 1.2391304969787598, "kl": 0.068603515625, "learning_rate": 6.3875e-07, "loss": 0.0007, "reward": 3.8707345724105835, "reward_std": 0.03127638017758727, "rewards/answer_entity_reward": 0.989980161190033, "rewards/answer_wer_reward": 0.8822586238384247, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984957575798035, "step": 290 }, { "completion_length": 176.8125, "epoch": 0.9312, "grad_norm": 3.8803555965423584, "kl": 0.14697265625, "learning_rate": 6.374999999999999e-07, "loss": 0.0015, "reward": 3.890373468399048, "reward_std": 0.01580220554023981, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9229053854942322, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9674679934978485, "step": 291 }, { "completion_length": 249.9375, "epoch": 0.9344, "grad_norm": 1.001364827156067, "kl": 0.08447265625, "learning_rate": 6.362499999999999e-07, "loss": 0.0008, "reward": 3.8967798948287964, "reward_std": 0.015075822360813618, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.8994384407997131, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994248449802399, "step": 292 }, { "completion_length": 192.125, "epoch": 0.9376, "grad_norm": 7.706722736358643, "kl": 0.12255859375, "learning_rate": 6.35e-07, "loss": 0.0012, "reward": 3.92827308177948, "reward_std": 0.02050976036116481, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9391875863075256, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9890855252742767, "step": 293 }, { "completion_length": 235.125, "epoch": 0.9408, "grad_norm": 1.723900556564331, "kl": 0.0587158203125, "learning_rate": 6.3375e-07, "loss": 0.0006, "reward": 3.9498140811920166, "reward_std": 0.012220169650390744, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9532225430011749, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99842968583107, "step": 294 }, { "completion_length": 205.65625, "epoch": 0.944, "grad_norm": 5.019091606140137, "kl": 0.092041015625, "learning_rate": 6.324999999999999e-07, "loss": 0.0009, "reward": 3.72371768951416, "reward_std": 0.03362658293917775, "rewards/answer_entity_reward": 0.988194465637207, "rewards/answer_wer_reward": 0.8201212882995605, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9154019355773926, "step": 295 }, { "completion_length": 229.78125, "epoch": 0.9472, "grad_norm": 2.4262614250183105, "kl": 0.07763671875, "learning_rate": 6.3125e-07, "loss": 0.0008, "reward": 3.9112552404403687, "reward_std": 0.02215595170855522, "rewards/answer_entity_reward": 0.9932383000850677, "rewards/answer_wer_reward": 0.9202675223350525, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977494478225708, "step": 296 }, { "completion_length": 201.4375, "epoch": 0.9504, "grad_norm": 15.131966590881348, "kl": 1.363037109375, "learning_rate": 6.3e-07, "loss": 0.0136, "reward": 3.8845863342285156, "reward_std": 0.025053692050278187, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9146546125411987, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9747393429279327, "step": 297 }, { "completion_length": 172.9375, "epoch": 0.9536, "grad_norm": 0.7034117579460144, "kl": 0.114501953125, "learning_rate": 6.2875e-07, "loss": 0.0011, "reward": 3.9505850076675415, "reward_std": 0.004406077787280083, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9516552090644836, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989297986030579, "step": 298 }, { "completion_length": 226.03125, "epoch": 0.9568, "grad_norm": 10.005863189697266, "kl": 0.099853515625, "learning_rate": 6.274999999999999e-07, "loss": 0.001, "reward": 3.78713595867157, "reward_std": 0.118343286216259, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.8108388781547546, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9807842969894409, "step": 299 }, { "completion_length": 183.0, "epoch": 0.96, "grad_norm": 12.267927169799805, "kl": 0.142578125, "learning_rate": 6.262499999999999e-07, "loss": 0.0014, "reward": 3.7959177494049072, "reward_std": 0.09426255617290735, "rewards/answer_entity_reward": 0.9763257503509521, "rewards/answer_wer_reward": 0.963774561882019, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8558174073696136, "step": 300 }, { "completion_length": 255.28125, "epoch": 0.9632, "grad_norm": 1.5198532342910767, "kl": 0.0638427734375, "learning_rate": 6.249999999999999e-07, "loss": 0.0006, "reward": 3.8590621948242188, "reward_std": 0.05621089227497578, "rewards/answer_entity_reward": 0.9652777910232544, "rewards/answer_wer_reward": 0.8950084447860718, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987758696079254, "step": 301 }, { "completion_length": 231.09375, "epoch": 0.9664, "grad_norm": 2.063969135284424, "kl": 0.0770263671875, "learning_rate": 6.2375e-07, "loss": 0.0008, "reward": 3.8598477840423584, "reward_std": 0.04335158132016659, "rewards/answer_entity_reward": 0.9843385815620422, "rewards/answer_wer_reward": 0.8991816341876984, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.976327657699585, "step": 302 }, { "completion_length": 214.1875, "epoch": 0.9696, "grad_norm": 4.762388706207275, "kl": 0.09765625, "learning_rate": 6.225000000000001e-07, "loss": 0.001, "reward": 3.86174213886261, "reward_std": 0.03313549840822816, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9338361918926239, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9313782155513763, "step": 303 }, { "completion_length": 232.5625, "epoch": 0.9728, "grad_norm": 2.811995506286621, "kl": 0.10595703125, "learning_rate": 6.2125e-07, "loss": 0.0011, "reward": 3.732570767402649, "reward_std": 0.14181919861584902, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9169972240924835, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.846823513507843, "step": 304 }, { "completion_length": 221.96875, "epoch": 0.976, "grad_norm": 2.424633741378784, "kl": 0.0677490234375, "learning_rate": 6.2e-07, "loss": 0.0007, "reward": 3.9095277786254883, "reward_std": 0.047814636724069715, "rewards/answer_entity_reward": 0.9927884638309479, "rewards/answer_wer_reward": 0.93398517370224, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9827540516853333, "step": 305 }, { "completion_length": 273.8125, "epoch": 0.9792, "grad_norm": 1.3363338708877563, "kl": 0.0654296875, "learning_rate": 6.1875e-07, "loss": 0.0007, "reward": 3.8615630865097046, "reward_std": 0.029406235553324223, "rewards/answer_entity_reward": 0.9869123697280884, "rewards/answer_wer_reward": 0.8774734139442444, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9971773624420166, "step": 306 }, { "completion_length": 243.34375, "epoch": 0.9824, "grad_norm": 3.1950275897979736, "kl": 0.05810546875, "learning_rate": 6.175e-07, "loss": 0.0006, "reward": 3.898465633392334, "reward_std": 0.022021150682121515, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.903068333864212, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998238205909729, "step": 307 }, { "completion_length": 230.375, "epoch": 0.9856, "grad_norm": 1.1819887161254883, "kl": 0.075927734375, "learning_rate": 6.162499999999999e-07, "loss": 0.0008, "reward": 3.9233819246292114, "reward_std": 0.01652457471936941, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9294087886810303, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968140125274658, "step": 308 }, { "completion_length": 193.78125, "epoch": 0.9888, "grad_norm": 3.613255739212036, "kl": 0.089111328125, "learning_rate": 6.149999999999999e-07, "loss": 0.0009, "reward": 3.9530293941497803, "reward_std": 0.013143055606633425, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9536189138889313, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994103908538818, "step": 309 }, { "completion_length": 223.1875, "epoch": 0.992, "grad_norm": 2.9832558631896973, "kl": 0.076904296875, "learning_rate": 6.1375e-07, "loss": 0.0008, "reward": 3.9074047803878784, "reward_std": 0.03526896797120571, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9134717583656311, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9939330518245697, "step": 310 }, { "completion_length": 202.78125, "epoch": 0.9952, "grad_norm": 1.6509346961975098, "kl": 0.100830078125, "learning_rate": 6.125000000000001e-07, "loss": 0.001, "reward": 3.897627115249634, "reward_std": 0.025366032496094704, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9004680216312408, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 311 }, { "completion_length": 227.75, "epoch": 0.9984, "grad_norm": 2.9892170429229736, "kl": 0.091064453125, "learning_rate": 6.1125e-07, "loss": 0.0009, "reward": 3.879219174385071, "reward_std": 0.04558245837688446, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9074902236461639, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9802517294883728, "step": 312 }, { "completion_length": 165.125, "epoch": 1.0, "grad_norm": 1.1831876039505005, "kl": 0.09814453125, "learning_rate": 6.1e-07, "loss": 0.0005, "reward": 3.956197738647461, "reward_std": 0.047231610864400864, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.985044002532959, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9711538553237915, "step": 313 }, { "completion_length": 194.875, "epoch": 1.0032, "grad_norm": 1.1336063146591187, "kl": 0.10302734375, "learning_rate": 6.0875e-07, "loss": 0.001, "reward": 3.955459713935852, "reward_std": 0.010184567421674728, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9638065993785858, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9964607954025269, "step": 314 }, { "completion_length": 172.1875, "epoch": 1.0064, "grad_norm": 7.745497226715088, "kl": 0.099609375, "learning_rate": 6.075e-07, "loss": 0.001, "reward": 3.9203338623046875, "reward_std": 0.005493420176208019, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9503339529037476, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9699999988079071, "step": 315 }, { "completion_length": 216.90625, "epoch": 1.0096, "grad_norm": 5.326587200164795, "kl": 0.076904296875, "learning_rate": 6.062499999999999e-07, "loss": 0.0008, "reward": 3.8242450952529907, "reward_std": 0.04496973566710949, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9261577427387238, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8980873227119446, "step": 316 }, { "completion_length": 179.59375, "epoch": 1.0128, "grad_norm": 1.887527346611023, "kl": 0.0675048828125, "learning_rate": 6.049999999999999e-07, "loss": 0.0007, "reward": 3.9317299127578735, "reward_std": 0.023447751067578793, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9398273527622223, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963668584823608, "step": 317 }, { "completion_length": 215.1875, "epoch": 1.016, "grad_norm": 2.478510618209839, "kl": 0.060791015625, "learning_rate": 6.037499999999999e-07, "loss": 0.0006, "reward": 3.8788411617279053, "reward_std": 0.020661167800426483, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.8995265662670135, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9862589836120605, "step": 318 }, { "completion_length": 205.5, "epoch": 1.0192, "grad_norm": 1.7058178186416626, "kl": 0.0830078125, "learning_rate": 6.025000000000001e-07, "loss": 0.0008, "reward": 3.807918906211853, "reward_std": 0.04822289012372494, "rewards/answer_entity_reward": 0.9788461625576019, "rewards/answer_wer_reward": 0.8715765476226807, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9574962258338928, "step": 319 }, { "completion_length": 242.46875, "epoch": 1.0224, "grad_norm": 1.7695921659469604, "kl": 0.0859375, "learning_rate": 6.0125e-07, "loss": 0.0009, "reward": 3.9255610704421997, "reward_std": 0.019923360086977482, "rewards/answer_entity_reward": 0.9906516969203949, "rewards/answer_wer_reward": 0.9401695132255554, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9947398900985718, "step": 320 }, { "completion_length": 175.40625, "epoch": 1.0256, "grad_norm": 2.60329270362854, "kl": 0.085693359375, "learning_rate": 6e-07, "loss": 0.0009, "reward": 3.9218677282333374, "reward_std": 0.008750536944717169, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9223886132240295, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994791746139526, "step": 321 }, { "completion_length": 157.875, "epoch": 1.0288, "grad_norm": 5.270680904388428, "kl": 0.120361328125, "learning_rate": 5.9875e-07, "loss": 0.0012, "reward": 3.8664562702178955, "reward_std": 0.03370736539363861, "rewards/answer_entity_reward": 0.9868055582046509, "rewards/answer_wer_reward": 0.9486467838287354, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9310039281845093, "step": 322 }, { "completion_length": 202.15625, "epoch": 1.032, "grad_norm": 0.9677954316139221, "kl": 0.072998046875, "learning_rate": 5.975e-07, "loss": 0.0007, "reward": 3.9512887001037598, "reward_std": 0.008498450508341193, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9516439437866211, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996448755264282, "step": 323 }, { "completion_length": 183.21875, "epoch": 1.0352, "grad_norm": 8.04370403289795, "kl": 0.0908203125, "learning_rate": 5.962499999999999e-07, "loss": 0.0009, "reward": 3.810960531234741, "reward_std": 0.017052859999239445, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9431954920291901, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8816540241241455, "step": 324 }, { "completion_length": 209.125, "epoch": 1.0384, "grad_norm": 1.1835105419158936, "kl": 0.09326171875, "learning_rate": 5.949999999999999e-07, "loss": 0.0009, "reward": 3.9159555435180664, "reward_std": 0.02768123522400856, "rewards/answer_entity_reward": 0.9866695702075958, "rewards/answer_wer_reward": 0.930209755897522, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990763068199158, "step": 325 }, { "completion_length": 202.15625, "epoch": 1.0416, "grad_norm": 1.198609471321106, "kl": 0.0748291015625, "learning_rate": 5.937499999999999e-07, "loss": 0.0007, "reward": 3.85296094417572, "reward_std": 0.19228698359802365, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9154608845710754, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.96875, "step": 326 }, { "completion_length": 187.9375, "epoch": 1.0448, "grad_norm": 3.9246749877929688, "kl": 0.08740234375, "learning_rate": 5.925e-07, "loss": 0.0009, "reward": 3.8706984519958496, "reward_std": 0.046023860573768616, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9316051602363586, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9443379342556, "step": 327 }, { "completion_length": 206.5625, "epoch": 1.048, "grad_norm": 2.1665873527526855, "kl": 0.111083984375, "learning_rate": 5.912500000000001e-07, "loss": 0.0011, "reward": 3.8563778400421143, "reward_std": 0.02296618465334177, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9032285511493683, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9531491994857788, "step": 328 }, { "completion_length": 202.71875, "epoch": 1.0512, "grad_norm": 2.493177890777588, "kl": 0.087646484375, "learning_rate": 5.9e-07, "loss": 0.0009, "reward": 3.8221092224121094, "reward_std": 0.13764610793441534, "rewards/answer_entity_reward": 0.9418402910232544, "rewards/answer_wer_reward": 0.8825558722019196, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977130889892578, "step": 329 }, { "completion_length": 200.0, "epoch": 1.0544, "grad_norm": 1.2568529844284058, "kl": 0.114013671875, "learning_rate": 5.8875e-07, "loss": 0.0011, "reward": 3.934491515159607, "reward_std": 0.012761063873767853, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9390542805194855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9954372346401215, "step": 330 }, { "completion_length": 212.71875, "epoch": 1.0576, "grad_norm": 1.3623089790344238, "kl": 0.086669921875, "learning_rate": 5.875e-07, "loss": 0.0009, "reward": 3.8928335905075073, "reward_std": 0.03161040600389242, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.8996903300285339, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994563460350037, "step": 331 }, { "completion_length": 240.3125, "epoch": 1.0608, "grad_norm": 1.2754676342010498, "kl": 0.0615234375, "learning_rate": 5.8625e-07, "loss": 0.0006, "reward": 3.925002932548523, "reward_std": 0.0067287166602909565, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9267281293869019, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982748329639435, "step": 332 }, { "completion_length": 218.6875, "epoch": 1.064, "grad_norm": 1.989392638206482, "kl": 0.073486328125, "learning_rate": 5.849999999999999e-07, "loss": 0.0007, "reward": 3.9305100440979004, "reward_std": 0.014313624240458012, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9314764738082886, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999033510684967, "step": 333 }, { "completion_length": 187.125, "epoch": 1.0672, "grad_norm": 4.332698822021484, "kl": 0.11474609375, "learning_rate": 5.837499999999999e-07, "loss": 0.0011, "reward": 3.9111961126327515, "reward_std": 0.017924371175467968, "rewards/answer_entity_reward": 0.9967105388641357, "rewards/answer_wer_reward": 0.9153991043567657, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990865290164948, "step": 334 }, { "completion_length": 240.8125, "epoch": 1.0704, "grad_norm": 0.991020143032074, "kl": 0.0609130859375, "learning_rate": 5.825e-07, "loss": 0.0006, "reward": 3.9502662420272827, "reward_std": 0.006167408544570208, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9529542922973633, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973118305206299, "step": 335 }, { "completion_length": 250.625, "epoch": 1.0735999999999999, "grad_norm": 2.3996546268463135, "kl": 0.06396484375, "learning_rate": 5.8125e-07, "loss": 0.0006, "reward": 3.899760365486145, "reward_std": 0.02179525839164853, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9130350351333618, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9867254495620728, "step": 336 }, { "completion_length": 193.9375, "epoch": 1.0768, "grad_norm": 3.6998724937438965, "kl": 0.090576171875, "learning_rate": 5.8e-07, "loss": 0.0009, "reward": 3.8309794664382935, "reward_std": 0.01553899934515357, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9471099972724915, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.895233154296875, "step": 337 }, { "completion_length": 221.90625, "epoch": 1.08, "grad_norm": 1.1334843635559082, "kl": 0.0587158203125, "learning_rate": 5.7875e-07, "loss": 0.0006, "reward": 3.936136484146118, "reward_std": 0.012863298412412405, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9376117587089539, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985246956348419, "step": 338 }, { "completion_length": 242.125, "epoch": 1.0832, "grad_norm": 1.0358681678771973, "kl": 0.0643310546875, "learning_rate": 5.775e-07, "loss": 0.0007, "reward": 3.887587547302246, "reward_std": 0.0230812830850482, "rewards/answer_entity_reward": 0.9798610806465149, "rewards/answer_wer_reward": 0.9077264070510864, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 339 }, { "completion_length": 213.9375, "epoch": 1.0864, "grad_norm": 24.39422035217285, "kl": 0.080078125, "learning_rate": 5.7625e-07, "loss": 0.0008, "reward": 3.887939691543579, "reward_std": 0.014108296483755112, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8971990048885345, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9907407462596893, "step": 340 }, { "completion_length": 205.375, "epoch": 1.0896, "grad_norm": 1.204923152923584, "kl": 0.1015625, "learning_rate": 5.749999999999999e-07, "loss": 0.001, "reward": 3.819010019302368, "reward_std": 0.24664557841606438, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9135412275791168, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.936718761920929, "step": 341 }, { "completion_length": 231.09375, "epoch": 1.0928, "grad_norm": 0.831721842288971, "kl": 0.06884765625, "learning_rate": 5.737499999999999e-07, "loss": 0.0007, "reward": 3.9083417654037476, "reward_std": 0.023847888689488173, "rewards/answer_entity_reward": 0.9902146458625793, "rewards/answer_wer_reward": 0.9184364974498749, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999690592288971, "step": 342 }, { "completion_length": 225.28125, "epoch": 1.096, "grad_norm": 1.239318609237671, "kl": 0.070068359375, "learning_rate": 5.725e-07, "loss": 0.0007, "reward": 3.8802337646484375, "reward_std": 0.019388118293136358, "rewards/answer_entity_reward": 0.9895833730697632, "rewards/answer_wer_reward": 0.8906503319740295, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 343 }, { "completion_length": 182.25, "epoch": 1.0992, "grad_norm": 2.810415267944336, "kl": 0.08349609375, "learning_rate": 5.7125e-07, "loss": 0.0008, "reward": 3.8992663621902466, "reward_std": 0.017442656215280294, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9195939302444458, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9825133979320526, "step": 344 }, { "completion_length": 228.9375, "epoch": 1.1024, "grad_norm": 2.4584133625030518, "kl": 0.11376953125, "learning_rate": 5.699999999999999e-07, "loss": 0.0011, "reward": 3.893067240715027, "reward_std": 0.024248626083135605, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9001834988594055, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963560402393341, "step": 345 }, { "completion_length": 154.46875, "epoch": 1.1056, "grad_norm": 2.5888006687164307, "kl": 0.1025390625, "learning_rate": 5.6875e-07, "loss": 0.001, "reward": 3.8254867792129517, "reward_std": 0.031096864491701126, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9297608137130737, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8957259356975555, "step": 346 }, { "completion_length": 174.0625, "epoch": 1.1088, "grad_norm": 2.087509870529175, "kl": 0.12158203125, "learning_rate": 5.675e-07, "loss": 0.0012, "reward": 3.920476198196411, "reward_std": 0.017223183065652847, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9334003627300262, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9870758056640625, "step": 347 }, { "completion_length": 209.3125, "epoch": 1.112, "grad_norm": 1.5391756296157837, "kl": 0.105712890625, "learning_rate": 5.6625e-07, "loss": 0.0011, "reward": 3.9325058460235596, "reward_std": 0.011998760513961315, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9345271587371826, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979787170886993, "step": 348 }, { "completion_length": 211.9375, "epoch": 1.1152, "grad_norm": 2.1449012756347656, "kl": 0.072021484375, "learning_rate": 5.649999999999999e-07, "loss": 0.0007, "reward": 3.887805461883545, "reward_std": 0.01465547364205122, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.8914407789707184, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998202919960022, "step": 349 }, { "completion_length": 219.875, "epoch": 1.1184, "grad_norm": 2.7394628524780273, "kl": 0.065185546875, "learning_rate": 5.637499999999999e-07, "loss": 0.0007, "reward": 3.905122399330139, "reward_std": 0.014080648310482502, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9162788391113281, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9888435006141663, "step": 350 }, { "completion_length": 191.28125, "epoch": 1.1216, "grad_norm": 2.381448745727539, "kl": 0.0721435546875, "learning_rate": 5.625e-07, "loss": 0.0007, "reward": 3.8880510330200195, "reward_std": 0.04133735504001379, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9348196983337402, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9576955437660217, "step": 351 }, { "completion_length": 263.5625, "epoch": 1.1248, "grad_norm": 1.0376274585723877, "kl": 0.0584716796875, "learning_rate": 5.6125e-07, "loss": 0.0006, "reward": 3.8982614278793335, "reward_std": 0.012545288074761629, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9007040560245514, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999395489692688, "step": 352 }, { "completion_length": 218.59375, "epoch": 1.1280000000000001, "grad_norm": 1.5081944465637207, "kl": 0.10009765625, "learning_rate": 5.6e-07, "loss": 0.001, "reward": 3.9146311283111572, "reward_std": 0.021717723459005356, "rewards/answer_entity_reward": 0.9917200803756714, "rewards/answer_wer_reward": 0.9235903024673462, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993206560611725, "step": 353 }, { "completion_length": 226.84375, "epoch": 1.1312, "grad_norm": 1.0990034341812134, "kl": 0.063720703125, "learning_rate": 5.587499999999999e-07, "loss": 0.0006, "reward": 3.9005931615829468, "reward_std": 0.018239760771393776, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9203313589096069, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9874734580516815, "step": 354 }, { "completion_length": 238.59375, "epoch": 1.1344, "grad_norm": 10.765813827514648, "kl": 0.056884765625, "learning_rate": 5.575e-07, "loss": 0.0006, "reward": 3.9274662733078003, "reward_std": 0.016329116653651, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9380079507827759, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9894582629203796, "step": 355 }, { "completion_length": 216.1875, "epoch": 1.1376, "grad_norm": 6.097777843475342, "kl": 0.43701171875, "learning_rate": 5.5625e-07, "loss": 0.0044, "reward": 3.6753621101379395, "reward_std": 0.09127287194132805, "rewards/answer_entity_reward": 0.9843385517597198, "rewards/answer_wer_reward": 0.9279595017433167, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.7630640268325806, "step": 356 }, { "completion_length": 233.25, "epoch": 1.1408, "grad_norm": 1.9484727382659912, "kl": 0.07470703125, "learning_rate": 5.55e-07, "loss": 0.0007, "reward": 3.8734058141708374, "reward_std": 0.026476514525711536, "rewards/answer_entity_reward": 0.9829545617103577, "rewards/answer_wer_reward": 0.906408816576004, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9840425550937653, "step": 357 }, { "completion_length": 224.125, "epoch": 1.144, "grad_norm": 1.650207757949829, "kl": 0.071533203125, "learning_rate": 5.5375e-07, "loss": 0.0007, "reward": 3.9309768676757812, "reward_std": 0.016152822878211737, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9357885122299194, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986605942249298, "step": 358 }, { "completion_length": 202.8125, "epoch": 1.1472, "grad_norm": 2.33708119392395, "kl": 0.102294921875, "learning_rate": 5.525e-07, "loss": 0.001, "reward": 3.901100993156433, "reward_std": 0.06198639050126076, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9566735327243805, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9444274306297302, "step": 359 }, { "completion_length": 231.03125, "epoch": 1.1504, "grad_norm": 2.603564977645874, "kl": 0.0662841796875, "learning_rate": 5.5125e-07, "loss": 0.0007, "reward": 3.8539780378341675, "reward_std": 0.04134450480341911, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.8810023069381714, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.975816547870636, "step": 360 }, { "completion_length": 176.8125, "epoch": 1.1536, "grad_norm": 1.9730738401412964, "kl": 0.0673828125, "learning_rate": 5.5e-07, "loss": 0.0007, "reward": 3.946772813796997, "reward_std": 0.007931779837235808, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9499374032020569, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968354403972626, "step": 361 }, { "completion_length": 205.28125, "epoch": 1.1568, "grad_norm": 2.6627304553985596, "kl": 0.0997314453125, "learning_rate": 5.487499999999999e-07, "loss": 0.001, "reward": 3.914576292037964, "reward_std": 0.015826540999114513, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9475591778755188, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9670170843601227, "step": 362 }, { "completion_length": 199.9375, "epoch": 1.16, "grad_norm": 2.073272466659546, "kl": 0.091064453125, "learning_rate": 5.474999999999999e-07, "loss": 0.0009, "reward": 3.89456570148468, "reward_std": 0.008259527385234833, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9333997070789337, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9611659348011017, "step": 363 }, { "completion_length": 222.0625, "epoch": 1.1632, "grad_norm": 1.7804555892944336, "kl": 0.1220703125, "learning_rate": 5.4625e-07, "loss": 0.0012, "reward": 3.847594380378723, "reward_std": 0.09885499440133572, "rewards/answer_entity_reward": 0.9692708849906921, "rewards/answer_wer_reward": 0.8783235251903534, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 364 }, { "completion_length": 206.71875, "epoch": 1.1663999999999999, "grad_norm": 1.6756658554077148, "kl": 0.097900390625, "learning_rate": 5.45e-07, "loss": 0.001, "reward": 3.866326928138733, "reward_std": 0.027653913479298353, "rewards/answer_entity_reward": 0.990705132484436, "rewards/answer_wer_reward": 0.9324296712875366, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9431920945644379, "step": 365 }, { "completion_length": 187.125, "epoch": 1.1696, "grad_norm": 1.6528626680374146, "kl": 0.075439453125, "learning_rate": 5.4375e-07, "loss": 0.0008, "reward": 3.821729063987732, "reward_std": 0.14681637566536665, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.896637350320816, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9611493647098541, "step": 366 }, { "completion_length": 180.5625, "epoch": 1.1728, "grad_norm": 2.211965560913086, "kl": 0.10302734375, "learning_rate": 5.425e-07, "loss": 0.001, "reward": 3.857783317565918, "reward_std": 0.13934296648949385, "rewards/answer_entity_reward": 0.9847756326198578, "rewards/answer_wer_reward": 0.9358752965927124, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9683823585510254, "step": 367 }, { "completion_length": 208.21875, "epoch": 1.176, "grad_norm": 2.522264242172241, "kl": 0.060546875, "learning_rate": 5.4125e-07, "loss": 0.0006, "reward": 3.8018884658813477, "reward_std": 0.07955996971577406, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.8091042637825012, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962564706802368, "step": 368 }, { "completion_length": 192.84375, "epoch": 1.1792, "grad_norm": 1.4488089084625244, "kl": 0.0791015625, "learning_rate": 5.4e-07, "loss": 0.0008, "reward": 3.940070152282715, "reward_std": 0.008247917518019676, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9410351514816284, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999035120010376, "step": 369 }, { "completion_length": 244.96875, "epoch": 1.1824, "grad_norm": 5.085299968719482, "kl": 0.109130859375, "learning_rate": 5.387499999999999e-07, "loss": 0.0011, "reward": 3.834069848060608, "reward_std": 0.027521015144884586, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.907810240983963, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9319414496421814, "step": 370 }, { "completion_length": 223.25, "epoch": 1.1856, "grad_norm": 2.248169183731079, "kl": 0.1083984375, "learning_rate": 5.374999999999999e-07, "loss": 0.0011, "reward": 3.9311490058898926, "reward_std": 0.011384843150153756, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.931148886680603, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 371 }, { "completion_length": 237.3125, "epoch": 1.1888, "grad_norm": 1.0549304485321045, "kl": 0.05419921875, "learning_rate": 5.3625e-07, "loss": 0.0005, "reward": 3.890028476715088, "reward_std": 0.012344780378043652, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8915461599826813, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984822571277618, "step": 372 }, { "completion_length": 216.0625, "epoch": 1.192, "grad_norm": 1.3054077625274658, "kl": 0.0694580078125, "learning_rate": 5.35e-07, "loss": 0.0007, "reward": 3.8679678440093994, "reward_std": 0.016808426938951015, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8875625133514404, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.980405330657959, "step": 373 }, { "completion_length": 222.34375, "epoch": 1.1952, "grad_norm": 10.381876945495605, "kl": 0.067626953125, "learning_rate": 5.3375e-07, "loss": 0.0007, "reward": 3.946020483970642, "reward_std": 0.016021378338336945, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9507038593292236, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977203905582428, "step": 374 }, { "completion_length": 208.875, "epoch": 1.1984, "grad_norm": 2.7493553161621094, "kl": 0.13525390625, "learning_rate": 5.325e-07, "loss": 0.0014, "reward": 3.942535161972046, "reward_std": 0.01458098879083991, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.9490944147109985, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997539520263672, "step": 375 }, { "completion_length": 252.09375, "epoch": 1.2016, "grad_norm": 1.9127050638198853, "kl": 0.079345703125, "learning_rate": 5.3125e-07, "loss": 0.0008, "reward": 3.8897405862808228, "reward_std": 0.015877339988946915, "rewards/answer_entity_reward": 0.9888257682323456, "rewards/answer_wer_reward": 0.9078421294689178, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9930725991725922, "step": 376 }, { "completion_length": 186.375, "epoch": 1.2048, "grad_norm": 1.832676887512207, "kl": 0.096435546875, "learning_rate": 5.3e-07, "loss": 0.001, "reward": 3.9009323120117188, "reward_std": 0.013205710332840681, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.940411388874054, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9653284549713135, "step": 377 }, { "completion_length": 224.46875, "epoch": 1.208, "grad_norm": 1.1020106077194214, "kl": 0.0638427734375, "learning_rate": 5.2875e-07, "loss": 0.0006, "reward": 3.9539231061935425, "reward_std": 0.005315458634868264, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9545543491840363, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993686676025391, "step": 378 }, { "completion_length": 158.25, "epoch": 1.2112, "grad_norm": 2.493016481399536, "kl": 0.123779296875, "learning_rate": 5.274999999999999e-07, "loss": 0.0012, "reward": 3.921034097671509, "reward_std": 0.009559540543705225, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9532065689563751, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9678275287151337, "step": 379 }, { "completion_length": 253.4375, "epoch": 1.2144, "grad_norm": 1.1055541038513184, "kl": 0.067626953125, "learning_rate": 5.262499999999999e-07, "loss": 0.0007, "reward": 3.8998262882232666, "reward_std": 0.021630683913826942, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9026672542095184, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 380 }, { "completion_length": 211.28125, "epoch": 1.2176, "grad_norm": 2.4898200035095215, "kl": 0.072998046875, "learning_rate": 5.25e-07, "loss": 0.0007, "reward": 3.8961129188537598, "reward_std": 0.02530479012057185, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9441809356212616, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9547730088233948, "step": 381 }, { "completion_length": 241.34375, "epoch": 1.2208, "grad_norm": 1.5863702297210693, "kl": 0.09033203125, "learning_rate": 5.237500000000001e-07, "loss": 0.0009, "reward": 3.9048832654953003, "reward_std": 0.02675863727927208, "rewards/answer_entity_reward": 0.9836346209049225, "rewards/answer_wer_reward": 0.9218496978282928, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993990361690521, "step": 382 }, { "completion_length": 244.90625, "epoch": 1.224, "grad_norm": 1.3265018463134766, "kl": 0.08984375, "learning_rate": 5.225e-07, "loss": 0.0009, "reward": 3.9047261476516724, "reward_std": 0.013275579549372196, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9047262072563171, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 383 }, { "completion_length": 192.3125, "epoch": 1.2272, "grad_norm": 2.3593811988830566, "kl": 0.09521484375, "learning_rate": 5.2125e-07, "loss": 0.0009, "reward": 3.893195629119873, "reward_std": 0.03080725111067295, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9534947872161865, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9441652297973633, "step": 384 }, { "completion_length": 218.65625, "epoch": 1.2304, "grad_norm": 2.7099356651306152, "kl": 0.06982421875, "learning_rate": 5.2e-07, "loss": 0.0007, "reward": 3.8559422492980957, "reward_std": 0.0489511676132679, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9215229749679565, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9344193935394287, "step": 385 }, { "completion_length": 168.375, "epoch": 1.2336, "grad_norm": 3.930095672607422, "kl": 0.109130859375, "learning_rate": 5.1875e-07, "loss": 0.0011, "reward": 3.848017930984497, "reward_std": 0.043564099818468094, "rewards/answer_entity_reward": 0.96875, "rewards/answer_wer_reward": 0.9368032217025757, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9424647688865662, "step": 386 }, { "completion_length": 183.1875, "epoch": 1.2368000000000001, "grad_norm": 7.302414894104004, "kl": 0.1279296875, "learning_rate": 5.174999999999999e-07, "loss": 0.0013, "reward": 3.7856842279434204, "reward_std": 0.026621405966579914, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9465668201446533, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8460619151592255, "step": 387 }, { "completion_length": 246.0, "epoch": 1.24, "grad_norm": 1.0175095796585083, "kl": 0.06591796875, "learning_rate": 5.162499999999999e-07, "loss": 0.0007, "reward": 3.923374652862549, "reward_std": 0.011706824880093336, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9246262907981873, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987484514713287, "step": 388 }, { "completion_length": 214.34375, "epoch": 1.2432, "grad_norm": 0.9391213655471802, "kl": 0.072021484375, "learning_rate": 5.149999999999999e-07, "loss": 0.0007, "reward": 3.9639917612075806, "reward_std": 0.009625846752896905, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9695361256599426, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992632865905762, "step": 389 }, { "completion_length": 237.8125, "epoch": 1.2464, "grad_norm": 1.1664483547210693, "kl": 0.07568359375, "learning_rate": 5.137500000000001e-07, "loss": 0.0008, "reward": 3.935038685798645, "reward_std": 0.018754366785287857, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9395028948783875, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 390 }, { "completion_length": 221.09375, "epoch": 1.2496, "grad_norm": 1.0274744033813477, "kl": 0.06591796875, "learning_rate": 5.125e-07, "loss": 0.0007, "reward": 3.9391175508499146, "reward_std": 0.008871730417013168, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9511756002902985, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993055462837219, "step": 391 }, { "completion_length": 216.53125, "epoch": 1.2528000000000001, "grad_norm": 1.4062410593032837, "kl": 0.0712890625, "learning_rate": 5.1125e-07, "loss": 0.0007, "reward": 3.8631064891815186, "reward_std": 0.02681769710034132, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.9219101965427399, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9516128897666931, "step": 392 }, { "completion_length": 141.9375, "epoch": 1.256, "grad_norm": 9.963582038879395, "kl": 0.12841796875, "learning_rate": 5.1e-07, "loss": 0.0013, "reward": 3.886857271194458, "reward_std": 0.011839461978524923, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9376890957355499, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9520089328289032, "step": 393 }, { "completion_length": 223.75, "epoch": 1.2591999999999999, "grad_norm": 3.129469156265259, "kl": 0.06201171875, "learning_rate": 5.0875e-07, "loss": 0.0006, "reward": 3.8934308290481567, "reward_std": 0.04124835692346096, "rewards/answer_entity_reward": 0.9847756326198578, "rewards/answer_wer_reward": 0.9095006585121155, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991544783115387, "step": 394 }, { "completion_length": 194.21875, "epoch": 1.2624, "grad_norm": 8.187355995178223, "kl": 0.0849609375, "learning_rate": 5.074999999999999e-07, "loss": 0.0008, "reward": 3.8118830919265747, "reward_std": 0.03198861540295184, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8248356580734253, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9870474934577942, "step": 395 }, { "completion_length": 216.0625, "epoch": 1.2656, "grad_norm": 1.9981720447540283, "kl": 0.08349609375, "learning_rate": 5.062499999999999e-07, "loss": 0.0008, "reward": 3.876628041267395, "reward_std": 0.030061259865760803, "rewards/answer_entity_reward": 0.9899572730064392, "rewards/answer_wer_reward": 0.9325210154056549, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9541498124599457, "step": 396 }, { "completion_length": 244.9375, "epoch": 1.2688, "grad_norm": 1.46060311794281, "kl": 0.08740234375, "learning_rate": 5.049999999999999e-07, "loss": 0.0009, "reward": 3.9221689701080322, "reward_std": 0.016801749356091022, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9224453568458557, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997234344482422, "step": 397 }, { "completion_length": 171.78125, "epoch": 1.272, "grad_norm": 2.054922342300415, "kl": 0.116455078125, "learning_rate": 5.0375e-07, "loss": 0.0012, "reward": 3.922398328781128, "reward_std": 0.015158042311668396, "rewards/answer_entity_reward": 0.9818181991577148, "rewards/answer_wer_reward": 0.9412411153316498, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993391633033752, "step": 398 }, { "completion_length": 223.6875, "epoch": 1.2752, "grad_norm": 4.638472557067871, "kl": 0.0859375, "learning_rate": 5.025e-07, "loss": 0.0009, "reward": 3.928803563117981, "reward_std": 0.015867930836975574, "rewards/answer_entity_reward": 0.9790209829807281, "rewards/answer_wer_reward": 0.9508891105651855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988935589790344, "step": 399 }, { "completion_length": 180.40625, "epoch": 1.2784, "grad_norm": 17.943954467773438, "kl": 0.09228515625, "learning_rate": 5.0125e-07, "loss": 0.0009, "reward": 3.918807029724121, "reward_std": 0.010303683578968048, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9271402955055237, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 400 }, { "completion_length": 208.59375, "epoch": 1.2816, "grad_norm": 2.634068489074707, "kl": 0.10400390625, "learning_rate": 5e-07, "loss": 0.001, "reward": 3.825340986251831, "reward_std": 0.0303196981549263, "rewards/answer_entity_reward": 0.9871430397033691, "rewards/answer_wer_reward": 0.9115504324436188, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9266475439071655, "step": 401 }, { "completion_length": 203.25, "epoch": 1.2848, "grad_norm": 1.149072289466858, "kl": 0.066162109375, "learning_rate": 4.9875e-07, "loss": 0.0007, "reward": 3.9346535205841064, "reward_std": 0.009479325264692307, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9361503720283508, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985032677650452, "step": 402 }, { "completion_length": 214.875, "epoch": 1.288, "grad_norm": 1.2013689279556274, "kl": 0.08447265625, "learning_rate": 4.975e-07, "loss": 0.0008, "reward": 3.8625338077545166, "reward_std": 0.012592533603310585, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9030886590480804, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9677785038948059, "step": 403 }, { "completion_length": 252.78125, "epoch": 1.2912, "grad_norm": 1.6769248247146606, "kl": 0.066162109375, "learning_rate": 4.9625e-07, "loss": 0.0007, "reward": 3.8860517740249634, "reward_std": 0.034614769741892815, "rewards/answer_entity_reward": 0.9836356937885284, "rewards/answer_wer_reward": 0.9036450088024139, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987711310386658, "step": 404 }, { "completion_length": 213.96875, "epoch": 1.2944, "grad_norm": 1.5894328355789185, "kl": 0.069580078125, "learning_rate": 4.95e-07, "loss": 0.0007, "reward": 3.921362280845642, "reward_std": 0.014703459106385708, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.92447629570961, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992897808551788, "step": 405 }, { "completion_length": 242.53125, "epoch": 1.2976, "grad_norm": 3.458373785018921, "kl": 0.1416015625, "learning_rate": 4.9375e-07, "loss": 0.0014, "reward": 3.7037495374679565, "reward_std": 0.1908966824412346, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.8811471164226532, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.85972860455513, "step": 406 }, { "completion_length": 187.53125, "epoch": 1.3008, "grad_norm": 7.737911224365234, "kl": 0.107666015625, "learning_rate": 4.924999999999999e-07, "loss": 0.0011, "reward": 3.9244754314422607, "reward_std": 0.021069620735943317, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9345695376396179, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9933781623840332, "step": 407 }, { "completion_length": 208.4375, "epoch": 1.304, "grad_norm": 2.0846338272094727, "kl": 0.165771484375, "learning_rate": 4.9125e-07, "loss": 0.0017, "reward": 3.9408286809921265, "reward_std": 0.011463565286248922, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9510546028614044, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9897740483283997, "step": 408 }, { "completion_length": 198.28125, "epoch": 1.3072, "grad_norm": 2.9788646697998047, "kl": 0.089111328125, "learning_rate": 4.9e-07, "loss": 0.0009, "reward": 3.900764584541321, "reward_std": 0.03450075723230839, "rewards/answer_entity_reward": 0.9874475002288818, "rewards/answer_wer_reward": 0.9153684377670288, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979486465454102, "step": 409 }, { "completion_length": 176.28125, "epoch": 1.3104, "grad_norm": 2.856952667236328, "kl": 0.09228515625, "learning_rate": 4.8875e-07, "loss": 0.0009, "reward": 3.9486716985702515, "reward_std": 0.021902556531131268, "rewards/answer_entity_reward": 0.9912830293178558, "rewards/answer_wer_reward": 0.9610228836536407, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963657557964325, "step": 410 }, { "completion_length": 209.28125, "epoch": 1.3136, "grad_norm": 2.0441436767578125, "kl": 0.08642578125, "learning_rate": 4.875e-07, "loss": 0.0009, "reward": 3.916486144065857, "reward_std": 0.018760663457214832, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9207533895969391, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994091391563416, "step": 411 }, { "completion_length": 232.21875, "epoch": 1.3168, "grad_norm": 1.071075201034546, "kl": 0.0633544921875, "learning_rate": 4.8625e-07, "loss": 0.0006, "reward": 3.9308619499206543, "reward_std": 0.018531675916165113, "rewards/answer_entity_reward": 0.9893162250518799, "rewards/answer_wer_reward": 0.943993479013443, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975521564483643, "step": 412 }, { "completion_length": 210.0625, "epoch": 1.32, "grad_norm": 3.82405686378479, "kl": 0.09326171875, "learning_rate": 4.85e-07, "loss": 0.0009, "reward": 3.889458179473877, "reward_std": 0.02208129083737731, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9473488032817841, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9449501633644104, "step": 413 }, { "completion_length": 197.15625, "epoch": 1.3232, "grad_norm": 1.4103983640670776, "kl": 0.0849609375, "learning_rate": 4.8375e-07, "loss": 0.0009, "reward": 3.9459102153778076, "reward_std": 0.014464881271123886, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9481469988822937, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977630972862244, "step": 414 }, { "completion_length": 239.9375, "epoch": 1.3264, "grad_norm": 1.4598060846328735, "kl": 0.06982421875, "learning_rate": 4.824999999999999e-07, "loss": 0.0007, "reward": 3.862109899520874, "reward_std": 0.07382148411124945, "rewards/answer_entity_reward": 0.9833333194255829, "rewards/answer_wer_reward": 0.9100264310836792, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.96875, "step": 415 }, { "completion_length": 184.3125, "epoch": 1.3296000000000001, "grad_norm": 0.6735196709632874, "kl": 0.063720703125, "learning_rate": 4.812499999999999e-07, "loss": 0.0006, "reward": 3.8697965145111084, "reward_std": 0.18503482337109745, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.932296484708786, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.96875, "step": 416 }, { "completion_length": 172.1875, "epoch": 1.3328, "grad_norm": 1.7613649368286133, "kl": 0.11962890625, "learning_rate": 4.8e-07, "loss": 0.0012, "reward": 3.938371181488037, "reward_std": 0.020422414876520634, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9546558260917664, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.988179475069046, "step": 417 }, { "completion_length": 223.09375, "epoch": 1.336, "grad_norm": 3.332552671432495, "kl": 0.12841796875, "learning_rate": 4.7875e-07, "loss": 0.0013, "reward": 3.9398679733276367, "reward_std": 0.018179779406636953, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9447188973426819, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979900419712067, "step": 418 }, { "completion_length": 226.5, "epoch": 1.3392, "grad_norm": 1.4374769926071167, "kl": 0.083984375, "learning_rate": 4.775e-07, "loss": 0.0008, "reward": 3.891066312789917, "reward_std": 0.02610717061907053, "rewards/answer_entity_reward": 0.9841079115867615, "rewards/answer_wer_reward": 0.9078442752361298, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991140365600586, "step": 419 }, { "completion_length": 196.84375, "epoch": 1.3424, "grad_norm": 1.7055010795593262, "kl": 0.100830078125, "learning_rate": 4.7625e-07, "loss": 0.001, "reward": 3.899353504180908, "reward_std": 0.022911718115210533, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9394311308860779, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9684451520442963, "step": 420 }, { "completion_length": 223.34375, "epoch": 1.3456000000000001, "grad_norm": 2.624370574951172, "kl": 0.13427734375, "learning_rate": 4.7499999999999995e-07, "loss": 0.0013, "reward": 3.8897502422332764, "reward_std": 0.06373783992603421, "rewards/answer_entity_reward": 0.9921875, "rewards/answer_wer_reward": 0.9388905465602875, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9586721360683441, "step": 421 }, { "completion_length": 209.0, "epoch": 1.3488, "grad_norm": 2.2683520317077637, "kl": 0.102294921875, "learning_rate": 4.7374999999999996e-07, "loss": 0.001, "reward": 3.960143804550171, "reward_std": 0.006363062420859933, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9613305628299713, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988133013248444, "step": 422 }, { "completion_length": 187.6875, "epoch": 1.3519999999999999, "grad_norm": 1.426279067993164, "kl": 0.130859375, "learning_rate": 4.725e-07, "loss": 0.0013, "reward": 3.904189109802246, "reward_std": 0.017666546627879143, "rewards/answer_entity_reward": 0.9875437021255493, "rewards/answer_wer_reward": 0.9480733275413513, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9685720801353455, "step": 423 }, { "completion_length": 227.96875, "epoch": 1.3552, "grad_norm": 2.3656458854675293, "kl": 0.202880859375, "learning_rate": 4.7125e-07, "loss": 0.002, "reward": 3.8170067071914673, "reward_std": 0.15287955617532134, "rewards/answer_entity_reward": 0.993697464466095, "rewards/answer_wer_reward": 0.9000802934169769, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9544788599014282, "step": 424 }, { "completion_length": 260.375, "epoch": 1.3584, "grad_norm": 19.11045265197754, "kl": 0.0771484375, "learning_rate": 4.6999999999999995e-07, "loss": 0.0008, "reward": 3.9173004627227783, "reward_std": 0.02492327243089676, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9262253046035767, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9955623149871826, "step": 425 }, { "completion_length": 244.0625, "epoch": 1.3616, "grad_norm": 3.7110118865966797, "kl": 0.0650634765625, "learning_rate": 4.6874999999999996e-07, "loss": 0.0007, "reward": 3.912764072418213, "reward_std": 0.022814412601292133, "rewards/answer_entity_reward": 0.9910714328289032, "rewards/answer_wer_reward": 0.9220215976238251, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996710419654846, "step": 426 }, { "completion_length": 206.0625, "epoch": 1.3648, "grad_norm": 7.218249797821045, "kl": 0.0869140625, "learning_rate": 4.675e-07, "loss": 0.0009, "reward": 3.8915610313415527, "reward_std": 0.020747858565300703, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.913354367017746, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9816789925098419, "step": 427 }, { "completion_length": 229.21875, "epoch": 1.3679999999999999, "grad_norm": 6.419763088226318, "kl": 0.078857421875, "learning_rate": 4.6625e-07, "loss": 0.0008, "reward": 3.7964917421340942, "reward_std": 0.03975658491253853, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9177364408969879, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8901189565658569, "step": 428 }, { "completion_length": 252.96875, "epoch": 1.3712, "grad_norm": 6.5345025062561035, "kl": 0.0782470703125, "learning_rate": 4.65e-07, "loss": 0.0008, "reward": 3.903268814086914, "reward_std": 0.016737705329433084, "rewards/answer_entity_reward": 0.9764957129955292, "rewards/answer_wer_reward": 0.92976513504982, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970079958438873, "step": 429 }, { "completion_length": 240.15625, "epoch": 1.3744, "grad_norm": 2.109302043914795, "kl": 0.078857421875, "learning_rate": 4.6374999999999995e-07, "loss": 0.0008, "reward": 3.935005784034729, "reward_std": 0.035214878618717194, "rewards/answer_entity_reward": 0.9908459782600403, "rewards/answer_wer_reward": 0.9483801424503326, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9957797825336456, "step": 430 }, { "completion_length": 204.21875, "epoch": 1.3776, "grad_norm": 2.1557323932647705, "kl": 0.0986328125, "learning_rate": 4.625e-07, "loss": 0.001, "reward": 3.895322561264038, "reward_std": 0.00989355193451047, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9249120354652405, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9704104661941528, "step": 431 }, { "completion_length": 222.375, "epoch": 1.3808, "grad_norm": 1.1159002780914307, "kl": 0.139892578125, "learning_rate": 4.6125e-07, "loss": 0.0014, "reward": 3.909332513809204, "reward_std": 0.02693999744951725, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9155605435371399, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985795617103577, "step": 432 }, { "completion_length": 203.8125, "epoch": 1.384, "grad_norm": 1.4166613817214966, "kl": 0.1220703125, "learning_rate": 4.6e-07, "loss": 0.0012, "reward": 3.8192185163497925, "reward_std": 0.20739353261888027, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.8859462738037109, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9673629999160767, "step": 433 }, { "completion_length": 253.25, "epoch": 1.3872, "grad_norm": 2.674269437789917, "kl": 0.0657958984375, "learning_rate": 4.5874999999999995e-07, "loss": 0.0007, "reward": 3.88591992855072, "reward_std": 0.02829979732632637, "rewards/answer_entity_reward": 0.9763771891593933, "rewards/answer_wer_reward": 0.9098401963710785, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997023940086365, "step": 434 }, { "completion_length": 216.15625, "epoch": 1.3904, "grad_norm": 2.3317995071411133, "kl": 0.1495361328125, "learning_rate": 4.575e-07, "loss": 0.0015, "reward": 3.8120020627975464, "reward_std": 0.0887885820120573, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9127777814865112, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9077470898628235, "step": 435 }, { "completion_length": 210.5625, "epoch": 1.3936, "grad_norm": 8.527549743652344, "kl": 0.13818359375, "learning_rate": 4.5624999999999997e-07, "loss": 0.0014, "reward": 3.802919387817383, "reward_std": 0.015426071360707283, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9406470954418182, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.862272173166275, "step": 436 }, { "completion_length": 194.21875, "epoch": 1.3968, "grad_norm": 3.6950721740722656, "kl": 0.098388671875, "learning_rate": 4.55e-07, "loss": 0.001, "reward": 3.9119696617126465, "reward_std": 0.025569402612745762, "rewards/answer_entity_reward": 0.9852430522441864, "rewards/answer_wer_reward": 0.9274449944496155, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992816150188446, "step": 437 }, { "completion_length": 218.0, "epoch": 1.4, "grad_norm": 2.0461039543151855, "kl": 0.09130859375, "learning_rate": 4.5374999999999994e-07, "loss": 0.0009, "reward": 3.9378126859664917, "reward_std": 0.023795679211616516, "rewards/answer_entity_reward": 0.9909090995788574, "rewards/answer_wer_reward": 0.9559187889099121, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9909848570823669, "step": 438 }, { "completion_length": 166.21875, "epoch": 1.4032, "grad_norm": 6.606758117675781, "kl": 0.099853515625, "learning_rate": 4.525e-07, "loss": 0.001, "reward": 3.7634676694869995, "reward_std": 0.12013816519174725, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9638259708881378, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8079750537872314, "step": 439 }, { "completion_length": 199.0625, "epoch": 1.4064, "grad_norm": 2.7103731632232666, "kl": 0.107666015625, "learning_rate": 4.5124999999999997e-07, "loss": 0.0011, "reward": 3.88293993473053, "reward_std": 0.030841628089547157, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9417436718940735, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9411962330341339, "step": 440 }, { "completion_length": 228.5, "epoch": 1.4096, "grad_norm": 16.007980346679688, "kl": 0.090576171875, "learning_rate": 4.5e-07, "loss": 0.0009, "reward": 3.8373541831970215, "reward_std": 0.07324423175305128, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.8474734723567963, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994959831237793, "step": 441 }, { "completion_length": 198.0, "epoch": 1.4128, "grad_norm": 1.5419743061065674, "kl": 0.090576171875, "learning_rate": 4.4874999999999994e-07, "loss": 0.0009, "reward": 3.9394867420196533, "reward_std": 0.020834744907915592, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9435902535915375, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993686676025391, "step": 442 }, { "completion_length": 212.9375, "epoch": 1.416, "grad_norm": 2.2846686840057373, "kl": 0.09375, "learning_rate": 4.475e-07, "loss": 0.0009, "reward": 3.9014971256256104, "reward_std": 0.05675862170755863, "rewards/answer_entity_reward": 0.9917200803756714, "rewards/answer_wer_reward": 0.9370100498199463, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9727669358253479, "step": 443 }, { "completion_length": 228.78125, "epoch": 1.4192, "grad_norm": 1.8499493598937988, "kl": 0.0723876953125, "learning_rate": 4.4624999999999996e-07, "loss": 0.0007, "reward": 3.967541456222534, "reward_std": 0.005963538307696581, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9685240089893341, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990174472332001, "step": 444 }, { "completion_length": 224.09375, "epoch": 1.4224, "grad_norm": 5.091113567352295, "kl": 0.0704345703125, "learning_rate": 4.45e-07, "loss": 0.0007, "reward": 3.886088252067566, "reward_std": 0.04133851733058691, "rewards/answer_entity_reward": 0.9886092245578766, "rewards/answer_wer_reward": 0.9384825825691223, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9589964747428894, "step": 445 }, { "completion_length": 228.34375, "epoch": 1.4256, "grad_norm": 57.5860710144043, "kl": 0.142822265625, "learning_rate": 4.4374999999999993e-07, "loss": 0.0014, "reward": 3.9096713066101074, "reward_std": 0.01603887975215912, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9126511812210083, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988585412502289, "step": 446 }, { "completion_length": 202.34375, "epoch": 1.4288, "grad_norm": 2.211174964904785, "kl": 0.056640625, "learning_rate": 4.425e-07, "loss": 0.0006, "reward": 3.893475890159607, "reward_std": 0.046710459515452385, "rewards/answer_entity_reward": 0.9659091234207153, "rewards/answer_wer_reward": 0.9309596717357635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9966072142124176, "step": 447 }, { "completion_length": 186.375, "epoch": 1.432, "grad_norm": 3.053344249725342, "kl": 0.07666015625, "learning_rate": 4.4124999999999996e-07, "loss": 0.0008, "reward": 3.941947340965271, "reward_std": 0.009567510336637497, "rewards/answer_entity_reward": 0.9883012771606445, "rewards/answer_wer_reward": 0.964864045381546, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9887820482254028, "step": 448 }, { "completion_length": 170.65625, "epoch": 1.4352, "grad_norm": 2.5118942260742188, "kl": 0.086181640625, "learning_rate": 4.3999999999999997e-07, "loss": 0.0009, "reward": 3.8258646726608276, "reward_std": 0.011685115285217762, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8330351114273071, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9928297400474548, "step": 449 }, { "completion_length": 199.8125, "epoch": 1.4384000000000001, "grad_norm": 3.3471686840057373, "kl": 0.12109375, "learning_rate": 4.3874999999999993e-07, "loss": 0.0012, "reward": 3.768259644508362, "reward_std": 0.061878617852926254, "rewards/answer_entity_reward": 0.9866071343421936, "rewards/answer_wer_reward": 0.8001611828804016, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9814914762973785, "step": 450 }, { "completion_length": 179.625, "epoch": 1.4416, "grad_norm": 6.58098840713501, "kl": 0.13671875, "learning_rate": 4.375e-07, "loss": 0.0014, "reward": 3.9213969707489014, "reward_std": 0.007897446397691965, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.954677164554596, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.966719776391983, "step": 451 }, { "completion_length": 201.53125, "epoch": 1.4447999999999999, "grad_norm": 2.6606149673461914, "kl": 0.07275390625, "learning_rate": 4.3625e-07, "loss": 0.0007, "reward": 3.930065631866455, "reward_std": 0.016306706704199314, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9411978721618652, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9888678193092346, "step": 452 }, { "completion_length": 218.71875, "epoch": 1.448, "grad_norm": 2.720804452896118, "kl": 0.068115234375, "learning_rate": 4.3499999999999996e-07, "loss": 0.0007, "reward": 3.9184677600860596, "reward_std": 0.018319842871278524, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9235129654407501, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994419813156128, "step": 453 }, { "completion_length": 207.65625, "epoch": 1.4512, "grad_norm": 3.4664785861968994, "kl": 0.153564453125, "learning_rate": 4.3375000000000003e-07, "loss": 0.0015, "reward": 3.9119069576263428, "reward_std": 0.017484096810221672, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9505945444107056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9613124430179596, "step": 454 }, { "completion_length": 212.125, "epoch": 1.4544000000000001, "grad_norm": 1.4592719078063965, "kl": 0.081298828125, "learning_rate": 4.325e-07, "loss": 0.0008, "reward": 3.9381325244903564, "reward_std": 0.015622157603502274, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.94671231508255, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938240349292755, "step": 455 }, { "completion_length": 237.8125, "epoch": 1.4576, "grad_norm": 1.2292534112930298, "kl": 0.089111328125, "learning_rate": 4.3125e-07, "loss": 0.0009, "reward": 3.940074920654297, "reward_std": 0.013516389299184084, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.943013072013855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970619082450867, "step": 456 }, { "completion_length": 206.90625, "epoch": 1.4607999999999999, "grad_norm": 2.4139420986175537, "kl": 0.08837890625, "learning_rate": 4.2999999999999996e-07, "loss": 0.0009, "reward": 3.9423701763153076, "reward_std": 0.017034863587468863, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.942692369222641, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999677836894989, "step": 457 }, { "completion_length": 200.21875, "epoch": 1.464, "grad_norm": 1.0297181606292725, "kl": 0.11083984375, "learning_rate": 4.2875e-07, "loss": 0.0011, "reward": 3.9459547996520996, "reward_std": 0.014651869423687458, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9467397332191467, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999215304851532, "step": 458 }, { "completion_length": 219.75, "epoch": 1.4672, "grad_norm": 1.3148033618927002, "kl": 0.10546875, "learning_rate": 4.275e-07, "loss": 0.0011, "reward": 3.9567900896072388, "reward_std": 0.011340227210894227, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9586590230464935, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99813112616539, "step": 459 }, { "completion_length": 181.0625, "epoch": 1.4704, "grad_norm": 2.4274115562438965, "kl": 0.09814453125, "learning_rate": 4.2625e-07, "loss": 0.001, "reward": 3.9310171604156494, "reward_std": 0.017811311408877373, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9543968439102173, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9766204059123993, "step": 460 }, { "completion_length": 205.625, "epoch": 1.4736, "grad_norm": 2.885746717453003, "kl": 0.12548828125, "learning_rate": 4.2499999999999995e-07, "loss": 0.0013, "reward": 3.878596305847168, "reward_std": 0.02481621317565441, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9378580451011658, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9464203119277954, "step": 461 }, { "completion_length": 205.4375, "epoch": 1.4768, "grad_norm": 2.366044521331787, "kl": 0.10595703125, "learning_rate": 4.2375e-07, "loss": 0.0011, "reward": 3.940233826637268, "reward_std": 0.013500516302883625, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9461761116981506, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997395873069763, "step": 462 }, { "completion_length": 229.46875, "epoch": 1.48, "grad_norm": 2.4469070434570312, "kl": 0.078369140625, "learning_rate": 4.225e-07, "loss": 0.0008, "reward": 3.92271089553833, "reward_std": 0.022854273673146963, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9247944056987762, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979166686534882, "step": 463 }, { "completion_length": 225.96875, "epoch": 1.4832, "grad_norm": 11.768393516540527, "kl": 0.1123046875, "learning_rate": 4.2125e-07, "loss": 0.0011, "reward": 3.9518144130706787, "reward_std": 0.010446197353303432, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9525662660598755, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992480874061584, "step": 464 }, { "completion_length": 149.0, "epoch": 1.4864, "grad_norm": 6.672958850860596, "kl": 0.185791015625, "learning_rate": 4.1999999999999995e-07, "loss": 0.0019, "reward": 3.944068431854248, "reward_std": 0.02685389667749405, "rewards/answer_entity_reward": 0.9774305522441864, "rewards/answer_wer_reward": 0.9714455008506775, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.995192289352417, "step": 465 }, { "completion_length": 249.625, "epoch": 1.4896, "grad_norm": 1.7048887014389038, "kl": 0.10986328125, "learning_rate": 4.1875e-07, "loss": 0.0011, "reward": 3.902083158493042, "reward_std": 0.011234605684876442, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9024596214294434, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996234774589539, "step": 466 }, { "completion_length": 181.8125, "epoch": 1.4928, "grad_norm": 2.429704189300537, "kl": 0.112548828125, "learning_rate": 4.1749999999999997e-07, "loss": 0.0011, "reward": 3.9163752794265747, "reward_std": 0.014369658660143614, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9364789724349976, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9798963963985443, "step": 467 }, { "completion_length": 201.5625, "epoch": 1.496, "grad_norm": 3.5214920043945312, "kl": 0.09912109375, "learning_rate": 4.1625e-07, "loss": 0.001, "reward": 3.9279046058654785, "reward_std": 0.016232089139521122, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9468095898628235, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9827397167682648, "step": 468 }, { "completion_length": 180.09375, "epoch": 1.4992, "grad_norm": 2.471404790878296, "kl": 0.10693359375, "learning_rate": 4.1499999999999994e-07, "loss": 0.0011, "reward": 3.860212564468384, "reward_std": 0.02879812940955162, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9390608966350555, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9296742975711823, "step": 469 }, { "completion_length": 208.1875, "epoch": 1.5024, "grad_norm": 0.9673317074775696, "kl": 0.104248046875, "learning_rate": 4.1375e-07, "loss": 0.001, "reward": 3.944485664367676, "reward_std": 0.01182422018609941, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9444854557514191, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 470 }, { "completion_length": 194.0625, "epoch": 1.5056, "grad_norm": 1.0823942422866821, "kl": 0.096435546875, "learning_rate": 4.1249999999999997e-07, "loss": 0.001, "reward": 3.9105581045150757, "reward_std": 0.015555873978883028, "rewards/answer_entity_reward": 0.9869916439056396, "rewards/answer_wer_reward": 0.9253619015216827, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982046484947205, "step": 471 }, { "completion_length": 221.5625, "epoch": 1.5088, "grad_norm": 4.074758052825928, "kl": 0.077880859375, "learning_rate": 4.1125e-07, "loss": 0.0008, "reward": 3.933529496192932, "reward_std": 0.019466498168185353, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9344994425773621, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990300834178925, "step": 472 }, { "completion_length": 167.3125, "epoch": 1.512, "grad_norm": 2.003244400024414, "kl": 0.10888671875, "learning_rate": 4.0999999999999994e-07, "loss": 0.0011, "reward": 3.927189588546753, "reward_std": 0.00937123317271471, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9625618755817413, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9646276533603668, "step": 473 }, { "completion_length": 253.59375, "epoch": 1.5152, "grad_norm": 1.7125921249389648, "kl": 0.1005859375, "learning_rate": 4.0875e-07, "loss": 0.001, "reward": 3.9120967388153076, "reward_std": 0.020000137854367495, "rewards/answer_entity_reward": 0.9910256266593933, "rewards/answer_wer_reward": 0.9225968718528748, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984740912914276, "step": 474 }, { "completion_length": 168.5625, "epoch": 1.5184, "grad_norm": 2.8377087116241455, "kl": 0.176513671875, "learning_rate": 4.0749999999999996e-07, "loss": 0.0018, "reward": 3.8111839294433594, "reward_std": 0.015397761948406696, "rewards/answer_entity_reward": 0.9882352948188782, "rewards/answer_wer_reward": 0.9508877992630005, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8720609545707703, "step": 475 }, { "completion_length": 180.0625, "epoch": 1.5215999999999998, "grad_norm": 1.8417869806289673, "kl": 0.09814453125, "learning_rate": 4.0625e-07, "loss": 0.001, "reward": 3.9401214122772217, "reward_std": 0.017564786598086357, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.95186448097229, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9924235343933105, "step": 476 }, { "completion_length": 259.5625, "epoch": 1.5248, "grad_norm": 3.1482410430908203, "kl": 0.065673828125, "learning_rate": 4.05e-07, "loss": 0.0007, "reward": 3.8720295429229736, "reward_std": 0.05017535015940666, "rewards/answer_entity_reward": 0.9819904267787933, "rewards/answer_wer_reward": 0.9132304787635803, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9768086373806, "step": 477 }, { "completion_length": 224.25, "epoch": 1.528, "grad_norm": 1.309258222579956, "kl": 0.151123046875, "learning_rate": 4.0375e-07, "loss": 0.0015, "reward": 3.9491400718688965, "reward_std": 0.015128562692552805, "rewards/answer_entity_reward": 0.9905637204647064, "rewards/answer_wer_reward": 0.9590685665607452, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999507874250412, "step": 478 }, { "completion_length": 195.03125, "epoch": 1.5312000000000001, "grad_norm": 2.627673864364624, "kl": 0.10205078125, "learning_rate": 4.025e-07, "loss": 0.001, "reward": 3.8748838901519775, "reward_std": 0.03435908444225788, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.931220144033432, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9471359848976135, "step": 479 }, { "completion_length": 214.09375, "epoch": 1.5344, "grad_norm": 1.328961730003357, "kl": 0.09375, "learning_rate": 4.0124999999999997e-07, "loss": 0.0009, "reward": 3.895302414894104, "reward_std": 0.05907848384231329, "rewards/answer_entity_reward": 0.9810912609100342, "rewards/answer_wer_reward": 0.9161643385887146, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998046875, "step": 480 }, { "completion_length": 238.34375, "epoch": 1.5375999999999999, "grad_norm": 1.2219247817993164, "kl": 0.073486328125, "learning_rate": 4e-07, "loss": 0.0007, "reward": 3.9036080837249756, "reward_std": 0.039926802739501, "rewards/answer_entity_reward": 0.9823717474937439, "rewards/answer_wer_reward": 0.9258527159690857, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9953835308551788, "step": 481 }, { "completion_length": 197.90625, "epoch": 1.5408, "grad_norm": 1.6363537311553955, "kl": 0.14306640625, "learning_rate": 3.9875e-07, "loss": 0.0014, "reward": 3.948188543319702, "reward_std": 0.010867676697671413, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9528435170650482, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99951171875, "step": 482 }, { "completion_length": 248.84375, "epoch": 1.544, "grad_norm": 1.3250434398651123, "kl": 0.07421875, "learning_rate": 3.975e-07, "loss": 0.0007, "reward": 3.9251530170440674, "reward_std": 0.011389322113245726, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9254424273967743, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997106492519379, "step": 483 }, { "completion_length": 183.03125, "epoch": 1.5472000000000001, "grad_norm": 1.3042057752609253, "kl": 0.10107421875, "learning_rate": 3.9624999999999996e-07, "loss": 0.001, "reward": 3.8853487968444824, "reward_std": 0.06827400880865753, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9366248250007629, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9487238228321075, "step": 484 }, { "completion_length": 192.0625, "epoch": 1.5504, "grad_norm": 2.232529640197754, "kl": 0.11279296875, "learning_rate": 3.95e-07, "loss": 0.0011, "reward": 3.9339704513549805, "reward_std": 0.011435477063059807, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.935157060623169, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988133013248444, "step": 485 }, { "completion_length": 237.28125, "epoch": 1.5535999999999999, "grad_norm": 1.1462312936782837, "kl": 0.098876953125, "learning_rate": 3.9375e-07, "loss": 0.001, "reward": 3.953871250152588, "reward_std": 0.007947361096739769, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9546429216861725, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992283880710602, "step": 486 }, { "completion_length": 167.8125, "epoch": 1.5568, "grad_norm": 2.324936628341675, "kl": 0.1357421875, "learning_rate": 3.925e-07, "loss": 0.0014, "reward": 3.858751654624939, "reward_std": 0.14167471043765545, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9401543736457825, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9567916989326477, "step": 487 }, { "completion_length": 245.09375, "epoch": 1.56, "grad_norm": 4.512195587158203, "kl": 0.076171875, "learning_rate": 3.9124999999999996e-07, "loss": 0.0008, "reward": 3.920499563217163, "reward_std": 0.03615456819534302, "rewards/answer_entity_reward": 0.9908459782600403, "rewards/answer_wer_reward": 0.9309035241603851, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987500607967377, "step": 488 }, { "completion_length": 227.84375, "epoch": 1.5632000000000001, "grad_norm": 10.537569046020508, "kl": 0.0859375, "learning_rate": 3.8999999999999997e-07, "loss": 0.0009, "reward": 3.9345154762268066, "reward_std": 0.0299052600748837, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9527814090251923, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9817341864109039, "step": 489 }, { "completion_length": 228.78125, "epoch": 1.5664, "grad_norm": 1.635452151298523, "kl": 0.125, "learning_rate": 3.8875e-07, "loss": 0.0013, "reward": 3.944974184036255, "reward_std": 0.019456470385193825, "rewards/answer_entity_reward": 0.9919143319129944, "rewards/answer_wer_reward": 0.9538533091545105, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99920654296875, "step": 490 }, { "completion_length": 146.78125, "epoch": 1.5695999999999999, "grad_norm": 3.557502031326294, "kl": 0.256103515625, "learning_rate": 3.875e-07, "loss": 0.0026, "reward": 3.865471601486206, "reward_std": 0.05972531996667385, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.972651481628418, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8928201496601105, "step": 491 }, { "completion_length": 207.28125, "epoch": 1.5728, "grad_norm": 1.0813632011413574, "kl": 0.11767578125, "learning_rate": 3.8624999999999995e-07, "loss": 0.0012, "reward": 3.91045343875885, "reward_std": 0.01970634702593088, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9153560400009155, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979382753372192, "step": 492 }, { "completion_length": 219.4375, "epoch": 1.576, "grad_norm": 9.319220542907715, "kl": 0.11181640625, "learning_rate": 3.8499999999999997e-07, "loss": 0.0011, "reward": 3.876826047897339, "reward_std": 0.02829575538635254, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9400706589221954, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9436996281147003, "step": 493 }, { "completion_length": 205.34375, "epoch": 1.5792000000000002, "grad_norm": 1.0891739130020142, "kl": 0.091796875, "learning_rate": 3.8375e-07, "loss": 0.0009, "reward": 3.9375252723693848, "reward_std": 0.01566324196755886, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9394660592079163, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980591833591461, "step": 494 }, { "completion_length": 247.4375, "epoch": 1.5824, "grad_norm": 1.313225507736206, "kl": 0.120361328125, "learning_rate": 3.825e-07, "loss": 0.0012, "reward": 3.9278939962387085, "reward_std": 0.01758108288049698, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9288604855537415, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999033510684967, "step": 495 }, { "completion_length": 190.28125, "epoch": 1.5856, "grad_norm": 9.440873146057129, "kl": 0.19677734375, "learning_rate": 3.8124999999999995e-07, "loss": 0.002, "reward": 3.7597657442092896, "reward_std": 0.05097449291497469, "rewards/answer_entity_reward": 0.9879385828971863, "rewards/answer_wer_reward": 0.9360098242759705, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8358173370361328, "step": 496 }, { "completion_length": 165.09375, "epoch": 1.5888, "grad_norm": 3.4180030822753906, "kl": 0.111572265625, "learning_rate": 3.7999999999999996e-07, "loss": 0.0011, "reward": 3.87969434261322, "reward_std": 0.058905988931655884, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9649176299571991, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9147767424583435, "step": 497 }, { "completion_length": 188.96875, "epoch": 1.592, "grad_norm": 5.278741359710693, "kl": 0.072998046875, "learning_rate": 3.7875e-07, "loss": 0.0007, "reward": 3.9229685068130493, "reward_std": 0.03557159844785929, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9312180578708649, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938337206840515, "step": 498 }, { "completion_length": 223.125, "epoch": 1.5952, "grad_norm": 1.8821698427200317, "kl": 0.100341796875, "learning_rate": 3.775e-07, "loss": 0.001, "reward": 3.85340416431427, "reward_std": 0.1366682257503271, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9250198900699615, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9596343636512756, "step": 499 }, { "completion_length": 206.125, "epoch": 1.5984, "grad_norm": 2.528049945831299, "kl": 0.089111328125, "learning_rate": 3.7624999999999994e-07, "loss": 0.0009, "reward": 3.9384653568267822, "reward_std": 0.011245439760386944, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9415569603443146, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985532462596893, "step": 500 }, { "completion_length": 229.46875, "epoch": 1.6016, "grad_norm": 1.4024198055267334, "kl": 0.082763671875, "learning_rate": 3.75e-07, "loss": 0.0008, "reward": 3.9224425554275513, "reward_std": 0.012164951767772436, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9341387450695038, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996675550937653, "step": 501 }, { "completion_length": 183.09375, "epoch": 1.6048, "grad_norm": 2.642270088195801, "kl": 0.0888671875, "learning_rate": 3.7375e-07, "loss": 0.0009, "reward": 3.872815251350403, "reward_std": 0.04407367669045925, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.953230619430542, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9243922829627991, "step": 502 }, { "completion_length": 242.8125, "epoch": 1.608, "grad_norm": 3.0733675956726074, "kl": 0.1044921875, "learning_rate": 3.725e-07, "loss": 0.001, "reward": 3.9133812189102173, "reward_std": 0.017343452665954828, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9236075580120087, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982964098453522, "step": 503 }, { "completion_length": 234.34375, "epoch": 1.6112, "grad_norm": 1.4146682024002075, "kl": 0.1064453125, "learning_rate": 3.7125e-07, "loss": 0.0011, "reward": 3.941379427909851, "reward_std": 0.011062228586524725, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9433701932430267, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980092346668243, "step": 504 }, { "completion_length": 252.84375, "epoch": 1.6143999999999998, "grad_norm": 1.9019030332565308, "kl": 0.101318359375, "learning_rate": 3.7e-07, "loss": 0.001, "reward": 3.87961208820343, "reward_std": 0.02180068287998438, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8812887370586395, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983234107494354, "step": 505 }, { "completion_length": 184.34375, "epoch": 1.6176, "grad_norm": 3.1965742111206055, "kl": 0.114501953125, "learning_rate": 3.6875e-07, "loss": 0.0011, "reward": 3.650223731994629, "reward_std": 0.16780234314501286, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9241631031036377, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.7573105096817017, "step": 506 }, { "completion_length": 224.1875, "epoch": 1.6208, "grad_norm": 1.8885560035705566, "kl": 0.115966796875, "learning_rate": 3.675e-07, "loss": 0.0012, "reward": 3.9122270345687866, "reward_std": 0.04261860717087984, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9476769864559174, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9697948098182678, "step": 507 }, { "completion_length": 220.25, "epoch": 1.624, "grad_norm": 3.2499520778656006, "kl": 0.1044921875, "learning_rate": 3.6625e-07, "loss": 0.0011, "reward": 3.927606225013733, "reward_std": 0.023842450696974993, "rewards/answer_entity_reward": 0.9879376590251923, "rewards/answer_wer_reward": 0.9439153373241425, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9957531988620758, "step": 508 }, { "completion_length": 227.96875, "epoch": 1.6272, "grad_norm": 2.6528868675231934, "kl": 0.087890625, "learning_rate": 3.65e-07, "loss": 0.0009, "reward": 3.928350806236267, "reward_std": 0.01731124660000205, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9420913755893707, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9976231157779694, "step": 509 }, { "completion_length": 243.5, "epoch": 1.6303999999999998, "grad_norm": 1.618895411491394, "kl": 0.09814453125, "learning_rate": 3.6375e-07, "loss": 0.001, "reward": 3.9476526975631714, "reward_std": 0.011007866356521845, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9514043629169464, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962483048439026, "step": 510 }, { "completion_length": 263.4375, "epoch": 1.6336, "grad_norm": 2.8576741218566895, "kl": 0.104736328125, "learning_rate": 3.6249999999999997e-07, "loss": 0.001, "reward": 3.9101955890655518, "reward_std": 0.01921992190182209, "rewards/answer_entity_reward": 0.9935776889324188, "rewards/answer_wer_reward": 0.9185610413551331, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980569183826447, "step": 511 }, { "completion_length": 184.5625, "epoch": 1.6368, "grad_norm": 6.8555908203125, "kl": 0.113525390625, "learning_rate": 3.6125e-07, "loss": 0.0011, "reward": 3.8663313388824463, "reward_std": 0.10157291498035192, "rewards/answer_entity_reward": 0.9823717772960663, "rewards/answer_wer_reward": 0.9564132988452911, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9275462925434113, "step": 512 }, { "completion_length": 218.25, "epoch": 1.6400000000000001, "grad_norm": 1.9482468366622925, "kl": 0.091064453125, "learning_rate": 3.6e-07, "loss": 0.0009, "reward": 3.8723970651626587, "reward_std": 0.07238492835313082, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9418750703334808, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9362037181854248, "step": 513 }, { "completion_length": 240.90625, "epoch": 1.6432, "grad_norm": 1.2296831607818604, "kl": 0.079345703125, "learning_rate": 3.5875e-07, "loss": 0.0008, "reward": 3.9039018154144287, "reward_std": 0.09914317354559898, "rewards/answer_entity_reward": 0.984375, "rewards/answer_wer_reward": 0.9198593199253082, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996675550937653, "step": 514 }, { "completion_length": 234.4375, "epoch": 1.6463999999999999, "grad_norm": 1.4495328664779663, "kl": 0.1455078125, "learning_rate": 3.5749999999999997e-07, "loss": 0.0015, "reward": 3.9163358211517334, "reward_std": 0.013342800550162792, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9182494282722473, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980863332748413, "step": 515 }, { "completion_length": 251.5, "epoch": 1.6496, "grad_norm": 1.5324357748031616, "kl": 0.085693359375, "learning_rate": 3.5625e-07, "loss": 0.0009, "reward": 3.8444111347198486, "reward_std": 0.19724943954497576, "rewards/answer_entity_reward": 0.9548611044883728, "rewards/answer_wer_reward": 0.9235136210918427, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9972864091396332, "step": 516 }, { "completion_length": 242.40625, "epoch": 1.6528, "grad_norm": 33.44215774536133, "kl": 0.107177734375, "learning_rate": 3.55e-07, "loss": 0.0011, "reward": 3.8674838542938232, "reward_std": 0.024256199598312378, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.931198239326477, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9362856149673462, "step": 517 }, { "completion_length": 222.375, "epoch": 1.6560000000000001, "grad_norm": 2.1098077297210693, "kl": 0.119140625, "learning_rate": 3.5375e-07, "loss": 0.0012, "reward": 3.916640877723694, "reward_std": 0.012934736907482147, "rewards/answer_entity_reward": 0.9841346144676208, "rewards/answer_wer_reward": 0.9368312060832977, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99567511677742, "step": 518 }, { "completion_length": 187.09375, "epoch": 1.6592, "grad_norm": 5.296720504760742, "kl": 0.1220703125, "learning_rate": 3.5249999999999996e-07, "loss": 0.0012, "reward": 3.9440935850143433, "reward_std": 0.02182569820433855, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9536486864089966, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9939171075820923, "step": 519 }, { "completion_length": 199.1875, "epoch": 1.6623999999999999, "grad_norm": 2.8992345333099365, "kl": 0.1083984375, "learning_rate": 3.5124999999999997e-07, "loss": 0.0011, "reward": 3.868250846862793, "reward_std": 0.01035462855361402, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.952102780342102, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9196203351020813, "step": 520 }, { "completion_length": 201.03125, "epoch": 1.6656, "grad_norm": 2.3841094970703125, "kl": 0.176025390625, "learning_rate": 3.5e-07, "loss": 0.0018, "reward": 3.8405520915985107, "reward_std": 0.020799917168915272, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9267003536224365, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.913851797580719, "step": 521 }, { "completion_length": 212.84375, "epoch": 1.6688, "grad_norm": 2.3912744522094727, "kl": 0.126953125, "learning_rate": 3.4875e-07, "loss": 0.0013, "reward": 3.894093632698059, "reward_std": 0.027726877480745316, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.935745120048523, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9697121679782867, "step": 522 }, { "completion_length": 235.875, "epoch": 1.6720000000000002, "grad_norm": 3.050795078277588, "kl": 0.109130859375, "learning_rate": 3.4749999999999996e-07, "loss": 0.0011, "reward": 3.8923540115356445, "reward_std": 0.01905027125030756, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9099950790405273, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9851997494697571, "step": 523 }, { "completion_length": 228.25, "epoch": 1.6752, "grad_norm": 1.20732843875885, "kl": 0.09375, "learning_rate": 3.4624999999999997e-07, "loss": 0.0009, "reward": 3.936145067214966, "reward_std": 0.009886496467515826, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9416597485542297, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 524 }, { "completion_length": 225.78125, "epoch": 1.6784, "grad_norm": 8.249052047729492, "kl": 0.09326171875, "learning_rate": 3.45e-07, "loss": 0.0009, "reward": 3.922656536102295, "reward_std": 0.030036092270165682, "rewards/answer_entity_reward": 0.9934523701667786, "rewards/answer_wer_reward": 0.9322790205478668, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969250857830048, "step": 525 }, { "completion_length": 181.78125, "epoch": 1.6816, "grad_norm": 3.0338377952575684, "kl": 0.42138671875, "learning_rate": 3.4375e-07, "loss": 0.0042, "reward": 3.9170188903808594, "reward_std": 0.02494343649595976, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9231892824172974, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962334036827087, "step": 526 }, { "completion_length": 197.0625, "epoch": 1.6848, "grad_norm": 1.7836970090866089, "kl": 0.2099609375, "learning_rate": 3.425e-07, "loss": 0.0021, "reward": 3.9194570779800415, "reward_std": 0.03800513781607151, "rewards/answer_entity_reward": 0.9902680516242981, "rewards/answer_wer_reward": 0.9317581951618195, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974307715892792, "step": 527 }, { "completion_length": 210.59375, "epoch": 1.688, "grad_norm": 2.595771074295044, "kl": 0.1103515625, "learning_rate": 3.4124999999999996e-07, "loss": 0.0011, "reward": 3.8902900218963623, "reward_std": 0.03382246592082083, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.959803968667984, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9328898191452026, "step": 528 }, { "completion_length": 220.0, "epoch": 1.6912, "grad_norm": 1.7138639688491821, "kl": 0.1044921875, "learning_rate": 3.4000000000000003e-07, "loss": 0.001, "reward": 3.9327096939086914, "reward_std": 0.02261860202997923, "rewards/answer_entity_reward": 0.9938696324825287, "rewards/answer_wer_reward": 0.9420961737632751, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9967438876628876, "step": 529 }, { "completion_length": 196.75, "epoch": 1.6944, "grad_norm": 11.008087158203125, "kl": 0.25732421875, "learning_rate": 3.3875e-07, "loss": 0.0026, "reward": 3.9551256895065308, "reward_std": 0.013849829090759158, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9695225656032562, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9884441494941711, "step": 530 }, { "completion_length": 206.25, "epoch": 1.6976, "grad_norm": 3.295365810394287, "kl": 0.17822265625, "learning_rate": 3.375e-07, "loss": 0.0018, "reward": 3.8593257665634155, "reward_std": 0.03199449460953474, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9447747468948364, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9249675273895264, "step": 531 }, { "completion_length": 202.9375, "epoch": 1.7008, "grad_norm": 1.3525906801223755, "kl": 0.1484375, "learning_rate": 3.3624999999999996e-07, "loss": 0.0015, "reward": 3.9375537633895874, "reward_std": 0.017243665643036366, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9407197833061218, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992378056049347, "step": 532 }, { "completion_length": 243.09375, "epoch": 1.704, "grad_norm": 3.5387661457061768, "kl": 0.074951171875, "learning_rate": 3.35e-07, "loss": 0.0007, "reward": 3.907800793647766, "reward_std": 0.019072275608778, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9087632894515991, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999037504196167, "step": 533 }, { "completion_length": 235.15625, "epoch": 1.7072, "grad_norm": 2.016521453857422, "kl": 0.09326171875, "learning_rate": 3.3375e-07, "loss": 0.0009, "reward": 3.8281819820404053, "reward_std": 0.021804995834827423, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9325411021709442, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9018907845020294, "step": 534 }, { "completion_length": 228.25, "epoch": 1.7104, "grad_norm": 2.274576187133789, "kl": 0.090087890625, "learning_rate": 3.325e-07, "loss": 0.0009, "reward": 3.9243232011795044, "reward_std": 0.02412506751716137, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9507229626178741, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9764412045478821, "step": 535 }, { "completion_length": 224.53125, "epoch": 1.7136, "grad_norm": 2.6043360233306885, "kl": 0.10400390625, "learning_rate": 3.3124999999999995e-07, "loss": 0.001, "reward": 3.876230835914612, "reward_std": 0.07055234862491488, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9505043029785156, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9329380691051483, "step": 536 }, { "completion_length": 225.75, "epoch": 1.7168, "grad_norm": 2.8599207401275635, "kl": 0.09814453125, "learning_rate": 3.3e-07, "loss": 0.001, "reward": 3.8414641618728638, "reward_std": 0.05350587982684374, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9369199872016907, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9061886966228485, "step": 537 }, { "completion_length": 148.6875, "epoch": 1.72, "grad_norm": 1.6326717138290405, "kl": 0.10009765625, "learning_rate": 3.2875e-07, "loss": 0.001, "reward": 3.9361575841903687, "reward_std": 0.004058501799590886, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9676616787910461, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9684959352016449, "step": 538 }, { "completion_length": 195.0625, "epoch": 1.7231999999999998, "grad_norm": 1.9592961072921753, "kl": 0.12841796875, "learning_rate": 3.275e-07, "loss": 0.0013, "reward": 3.772740364074707, "reward_std": 0.1297362227924168, "rewards/answer_entity_reward": 0.8774839639663696, "rewards/answer_wer_reward": 0.953325480222702, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9419309496879578, "step": 539 }, { "completion_length": 234.75, "epoch": 1.7264, "grad_norm": 2.8339364528656006, "kl": 0.09130859375, "learning_rate": 3.2624999999999995e-07, "loss": 0.0009, "reward": 3.9273258447647095, "reward_std": 0.019230290316045284, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.929772675037384, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996366202831268, "step": 540 }, { "completion_length": 202.21875, "epoch": 1.7296, "grad_norm": 1.428126335144043, "kl": 0.11083984375, "learning_rate": 3.25e-07, "loss": 0.0011, "reward": 3.8019243478775024, "reward_std": 0.012322985101491213, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9284610748291016, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8734632432460785, "step": 541 }, { "completion_length": 204.875, "epoch": 1.7328000000000001, "grad_norm": 2.1539251804351807, "kl": 0.11376953125, "learning_rate": 3.2374999999999997e-07, "loss": 0.0011, "reward": 3.9308128356933594, "reward_std": 0.03895580768585205, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9500284790992737, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9877286553382874, "step": 542 }, { "completion_length": 237.40625, "epoch": 1.736, "grad_norm": 2.9644949436187744, "kl": 0.091796875, "learning_rate": 3.225e-07, "loss": 0.0009, "reward": 3.8919214010238647, "reward_std": 0.025371606461703777, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9108140766620636, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9883188605308533, "step": 543 }, { "completion_length": 173.5, "epoch": 1.7391999999999999, "grad_norm": 1.8892700672149658, "kl": 0.11376953125, "learning_rate": 3.2124999999999994e-07, "loss": 0.0011, "reward": 3.816041350364685, "reward_std": 0.021231804974377155, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.8221178352832794, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973958432674408, "step": 544 }, { "completion_length": 196.90625, "epoch": 1.7424, "grad_norm": 1.6765927076339722, "kl": 0.103271484375, "learning_rate": 3.2e-07, "loss": 0.001, "reward": 3.825459599494934, "reward_std": 0.1512175016105175, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9306570887565613, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.928135871887207, "step": 545 }, { "completion_length": 245.15625, "epoch": 1.7456, "grad_norm": 2.408535957336426, "kl": 0.100830078125, "learning_rate": 3.1874999999999997e-07, "loss": 0.001, "reward": 3.904157519340515, "reward_std": 0.024684349074959755, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9129303097724915, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999750018119812, "step": 546 }, { "completion_length": 192.15625, "epoch": 1.7488000000000001, "grad_norm": 1.3466379642486572, "kl": 0.1162109375, "learning_rate": 3.175e-07, "loss": 0.0012, "reward": 3.8801496028900146, "reward_std": 0.028854741947725415, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.9211397469043732, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9715098142623901, "step": 547 }, { "completion_length": 199.5, "epoch": 1.752, "grad_norm": 1.6798815727233887, "kl": 0.14404296875, "learning_rate": 3.1624999999999994e-07, "loss": 0.0014, "reward": 3.9099488258361816, "reward_std": 0.01651060301810503, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9137388169765472, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986139535903931, "step": 548 }, { "completion_length": 244.9375, "epoch": 1.7551999999999999, "grad_norm": 1.4050216674804688, "kl": 0.12451171875, "learning_rate": 3.15e-07, "loss": 0.0012, "reward": 3.9373788833618164, "reward_std": 0.015202231705188751, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9471401572227478, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987614154815674, "step": 549 }, { "completion_length": 248.25, "epoch": 1.7584, "grad_norm": 1.4261935949325562, "kl": 0.06884765625, "learning_rate": 3.1374999999999996e-07, "loss": 0.0007, "reward": 3.8940224647521973, "reward_std": 0.02191777713596821, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.8968429565429688, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995833337306976, "step": 550 }, { "completion_length": 201.96875, "epoch": 1.7616, "grad_norm": 3.3936564922332764, "kl": 0.10986328125, "learning_rate": 3.1249999999999997e-07, "loss": 0.0011, "reward": 3.8392233848571777, "reward_std": 0.054989127907902, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9420890212059021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8999753296375275, "step": 551 }, { "completion_length": 224.28125, "epoch": 1.7648000000000001, "grad_norm": 3.447808027267456, "kl": 0.1162109375, "learning_rate": 3.1125000000000004e-07, "loss": 0.0012, "reward": 3.928855776786804, "reward_std": 0.03860421013087034, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9465460479259491, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9871174991130829, "step": 552 }, { "completion_length": 239.0625, "epoch": 1.768, "grad_norm": 0.9099166989326477, "kl": 0.09228515625, "learning_rate": 3.1e-07, "loss": 0.0008, "reward": 3.946284055709839, "reward_std": 0.0096789482049644, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9500063955783844, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962775409221649, "step": 553 }, { "completion_length": 225.0625, "epoch": 1.7711999999999999, "grad_norm": 5.470230579376221, "kl": 0.0791015625, "learning_rate": 3.0875e-07, "loss": 0.0008, "reward": 3.919348955154419, "reward_std": 0.03945630043745041, "rewards/answer_entity_reward": 0.9685782790184021, "rewards/answer_wer_reward": 0.9539141952991486, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968563914299011, "step": 554 }, { "completion_length": 221.46875, "epoch": 1.7744, "grad_norm": 2.4623939990997314, "kl": 0.091064453125, "learning_rate": 3.0749999999999997e-07, "loss": 0.0009, "reward": 3.9319478273391724, "reward_std": 0.020772571209818125, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9341712892055511, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994212985038757, "step": 555 }, { "completion_length": 195.28125, "epoch": 1.7776, "grad_norm": 3.2428677082061768, "kl": 0.1201171875, "learning_rate": 3.0625000000000003e-07, "loss": 0.0012, "reward": 3.8943945169448853, "reward_std": 0.03664180589839816, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9439602494239807, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9561159908771515, "step": 556 }, { "completion_length": 181.4375, "epoch": 1.7808000000000002, "grad_norm": 3.0905327796936035, "kl": 0.1103515625, "learning_rate": 3.05e-07, "loss": 0.0011, "reward": 3.761397957801819, "reward_std": 0.21460139192640781, "rewards/answer_entity_reward": 0.9930555522441864, "rewards/answer_wer_reward": 0.928047776222229, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8715444803237915, "step": 557 }, { "completion_length": 221.625, "epoch": 1.784, "grad_norm": 1.951019525527954, "kl": 0.075927734375, "learning_rate": 3.0375e-07, "loss": 0.0008, "reward": 3.77008855342865, "reward_std": 0.32161275763064623, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8999943733215332, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9325942993164062, "step": 558 }, { "completion_length": 205.28125, "epoch": 1.7872, "grad_norm": 3.277336359024048, "kl": 0.219482421875, "learning_rate": 3.0249999999999996e-07, "loss": 0.0022, "reward": 3.934972047805786, "reward_std": 0.0279585188254714, "rewards/answer_entity_reward": 0.9919143319129944, "rewards/answer_wer_reward": 0.944387674331665, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998670220375061, "step": 559 }, { "completion_length": 228.5625, "epoch": 1.7904, "grad_norm": 1.3801170587539673, "kl": 0.090576171875, "learning_rate": 3.0125000000000003e-07, "loss": 0.0009, "reward": 3.93076229095459, "reward_std": 0.018667958676815033, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9448211789131165, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9887820482254028, "step": 560 }, { "completion_length": 210.71875, "epoch": 1.7936, "grad_norm": 1.791351556777954, "kl": 0.109130859375, "learning_rate": 3e-07, "loss": 0.0011, "reward": 3.883724331855774, "reward_std": 0.061979083344340324, "rewards/answer_entity_reward": 0.9836647808551788, "rewards/answer_wer_reward": 0.9013588726520538, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987007081508636, "step": 561 }, { "completion_length": 205.125, "epoch": 1.7968, "grad_norm": 2.168004274368286, "kl": 0.103271484375, "learning_rate": 2.9875e-07, "loss": 0.001, "reward": 3.8556606769561768, "reward_std": 0.08509537391364574, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9325578808784485, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9279104173183441, "step": 562 }, { "completion_length": 206.78125, "epoch": 1.8, "grad_norm": 1.8020058870315552, "kl": 0.112548828125, "learning_rate": 2.9749999999999996e-07, "loss": 0.0011, "reward": 3.9098552465438843, "reward_std": 0.027897534891963005, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9327702820301056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9818926453590393, "step": 563 }, { "completion_length": 199.84375, "epoch": 1.8032, "grad_norm": 2.1101276874542236, "kl": 0.08056640625, "learning_rate": 2.9625e-07, "loss": 0.0008, "reward": 3.928394079208374, "reward_std": 0.013759741093963385, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9306167364120483, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977773427963257, "step": 564 }, { "completion_length": 198.34375, "epoch": 1.8064, "grad_norm": 1.7468022108078003, "kl": 0.110595703125, "learning_rate": 2.95e-07, "loss": 0.0011, "reward": 3.8688454627990723, "reward_std": 0.01723374053835869, "rewards/answer_entity_reward": 0.9888257682323456, "rewards/answer_wer_reward": 0.9232835471630096, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9567362368106842, "step": 565 }, { "completion_length": 166.21875, "epoch": 1.8096, "grad_norm": 3.4565577507019043, "kl": 0.12451171875, "learning_rate": 2.9375e-07, "loss": 0.0012, "reward": 3.8460001945495605, "reward_std": 0.12010016990825534, "rewards/answer_entity_reward": 0.9685245454311371, "rewards/answer_wer_reward": 0.905397355556488, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9720781445503235, "step": 566 }, { "completion_length": 204.375, "epoch": 1.8128, "grad_norm": 2.109642267227173, "kl": 0.1474609375, "learning_rate": 2.9249999999999995e-07, "loss": 0.0015, "reward": 3.9155898094177246, "reward_std": 0.021943609230220318, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9553306102752686, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9650669693946838, "step": 567 }, { "completion_length": 231.5625, "epoch": 1.8159999999999998, "grad_norm": 1.4336498975753784, "kl": 0.101318359375, "learning_rate": 2.9125e-07, "loss": 0.001, "reward": 3.9311413764953613, "reward_std": 0.012714509852230549, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9397754371166229, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968807101249695, "step": 568 }, { "completion_length": 204.8125, "epoch": 1.8192, "grad_norm": 2.3991148471832275, "kl": 0.0830078125, "learning_rate": 2.9e-07, "loss": 0.0008, "reward": 3.843847155570984, "reward_std": 0.1981589295901358, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9109402298927307, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9641570150852203, "step": 569 }, { "completion_length": 232.46875, "epoch": 1.8224, "grad_norm": 1.6885050535202026, "kl": 0.085693359375, "learning_rate": 2.8875e-07, "loss": 0.0009, "reward": 3.8632709980010986, "reward_std": 0.08977647870779037, "rewards/answer_entity_reward": 0.9838541746139526, "rewards/answer_wer_reward": 0.9250127673149109, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9544041156768799, "step": 570 }, { "completion_length": 277.78125, "epoch": 1.8256000000000001, "grad_norm": 1.6569448709487915, "kl": 0.09619140625, "learning_rate": 2.8749999999999995e-07, "loss": 0.001, "reward": 3.70079243183136, "reward_std": 0.15860513970255852, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.901878148317337, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.8614143133163452, "step": 571 }, { "completion_length": 248.0625, "epoch": 1.8288, "grad_norm": 1.72274649143219, "kl": 0.08251953125, "learning_rate": 2.8625e-07, "loss": 0.0008, "reward": 3.8939234018325806, "reward_std": 0.016017161309719086, "rewards/answer_entity_reward": 0.9803321659564972, "rewards/answer_wer_reward": 0.9138159155845642, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997751712799072, "step": 572 }, { "completion_length": 222.1875, "epoch": 1.8319999999999999, "grad_norm": 5.086897373199463, "kl": 0.3359375, "learning_rate": 2.8499999999999997e-07, "loss": 0.0034, "reward": 3.931598663330078, "reward_std": 0.02384120598435402, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9370801150798798, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997632503509521, "step": 573 }, { "completion_length": 216.3125, "epoch": 1.8352, "grad_norm": 1.5536502599716187, "kl": 0.089599609375, "learning_rate": 2.8375e-07, "loss": 0.0009, "reward": 3.941322922706604, "reward_std": 0.018783860839903355, "rewards/answer_entity_reward": 0.9833333194255829, "rewards/answer_wer_reward": 0.9589866697788239, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990028738975525, "step": 574 }, { "completion_length": 232.9375, "epoch": 1.8384, "grad_norm": 1.4012224674224854, "kl": 0.09033203125, "learning_rate": 2.8249999999999994e-07, "loss": 0.0009, "reward": 3.9058892726898193, "reward_std": 0.05868656514212489, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.9166894555091858, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996165633201599, "step": 575 }, { "completion_length": 202.125, "epoch": 1.8416000000000001, "grad_norm": 3.967221260070801, "kl": 0.098388671875, "learning_rate": 2.8125e-07, "loss": 0.001, "reward": 3.936561346054077, "reward_std": 0.028190571581944823, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9553852677345276, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9811758995056152, "step": 576 }, { "completion_length": 248.25, "epoch": 1.8448, "grad_norm": 3.581430673599243, "kl": 0.19970703125, "learning_rate": 2.8e-07, "loss": 0.002, "reward": 3.854965329170227, "reward_std": 0.08418525848537683, "rewards/answer_entity_reward": 0.9715560376644135, "rewards/answer_wer_reward": 0.9045931100845337, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9788161218166351, "step": 577 }, { "completion_length": 199.25, "epoch": 1.8479999999999999, "grad_norm": 3.7948851585388184, "kl": 0.147705078125, "learning_rate": 2.7875e-07, "loss": 0.0015, "reward": 3.923743486404419, "reward_std": 0.042667020577937365, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9548681676387787, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9688753485679626, "step": 578 }, { "completion_length": 193.34375, "epoch": 1.8512, "grad_norm": 2.4876842498779297, "kl": 0.102294921875, "learning_rate": 2.775e-07, "loss": 0.001, "reward": 3.97040331363678, "reward_std": 0.01486315974034369, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.973244309425354, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 579 }, { "completion_length": 223.3125, "epoch": 1.8544, "grad_norm": 4.710970878601074, "kl": 0.14501953125, "learning_rate": 2.7625e-07, "loss": 0.0015, "reward": 3.9073562622070312, "reward_std": 0.040397388860583305, "rewards/answer_entity_reward": 0.9927884638309479, "rewards/answer_wer_reward": 0.9182255864143372, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963421821594238, "step": 580 }, { "completion_length": 240.75, "epoch": 1.8576000000000001, "grad_norm": 1.0144391059875488, "kl": 0.089599609375, "learning_rate": 2.75e-07, "loss": 0.0009, "reward": 3.8770586252212524, "reward_std": 0.030949956737458706, "rewards/answer_entity_reward": 0.9654052257537842, "rewards/answer_wer_reward": 0.9127066433429718, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989466071128845, "step": 581 }, { "completion_length": 240.46875, "epoch": 1.8608, "grad_norm": 33.290077209472656, "kl": 0.14111328125, "learning_rate": 2.7374999999999997e-07, "loss": 0.0014, "reward": 3.931227445602417, "reward_std": 0.017369844019412994, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9346133172512054, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990180134773254, "step": 582 }, { "completion_length": 247.1875, "epoch": 1.8639999999999999, "grad_norm": 1.7812319993972778, "kl": 0.083984375, "learning_rate": 2.725e-07, "loss": 0.0008, "reward": 3.885915160179138, "reward_std": 0.0845849048346281, "rewards/answer_entity_reward": 0.9893162548542023, "rewards/answer_wer_reward": 0.9280518591403961, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9685470759868622, "step": 583 }, { "completion_length": 208.0625, "epoch": 1.8672, "grad_norm": 3.882129192352295, "kl": 0.133544921875, "learning_rate": 2.7125e-07, "loss": 0.0013, "reward": 3.8892873525619507, "reward_std": 0.04396933689713478, "rewards/answer_entity_reward": 0.9919143319129944, "rewards/answer_wer_reward": 0.9468680024147034, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9505050778388977, "step": 584 }, { "completion_length": 254.6875, "epoch": 1.8704, "grad_norm": 1.1797689199447632, "kl": 0.09619140625, "learning_rate": 2.7e-07, "loss": 0.001, "reward": 3.9246060848236084, "reward_std": 0.021186589263379574, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9278402030467987, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991695880889893, "step": 585 }, { "completion_length": 225.9375, "epoch": 1.8736000000000002, "grad_norm": 1.4846960306167603, "kl": 0.1142578125, "learning_rate": 2.6874999999999997e-07, "loss": 0.0011, "reward": 3.9609856605529785, "reward_std": 0.024455342907458544, "rewards/answer_entity_reward": 0.9909090995788574, "rewards/answer_wer_reward": 0.9731672704219818, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969093799591064, "step": 586 }, { "completion_length": 214.03125, "epoch": 1.8768, "grad_norm": 1.4108463525772095, "kl": 0.2236328125, "learning_rate": 2.675e-07, "loss": 0.0022, "reward": 3.9008651971817017, "reward_std": 0.023720702156424522, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9427990317344666, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9615384638309479, "step": 587 }, { "completion_length": 257.34375, "epoch": 1.88, "grad_norm": 2.2120485305786133, "kl": 0.085693359375, "learning_rate": 2.6625e-07, "loss": 0.0009, "reward": 3.905014157295227, "reward_std": 0.02011673618108034, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9055063128471375, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999507874250412, "step": 588 }, { "completion_length": 217.21875, "epoch": 1.8832, "grad_norm": 2.6982715129852295, "kl": 0.08935546875, "learning_rate": 2.65e-07, "loss": 0.0009, "reward": 3.9260659217834473, "reward_std": 0.02971976064145565, "rewards/answer_entity_reward": 0.9871068000793457, "rewards/answer_wer_reward": 0.9389589130878448, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 589 }, { "completion_length": 253.65625, "epoch": 1.8864, "grad_norm": 1.0963667631149292, "kl": 0.081298828125, "learning_rate": 2.6374999999999996e-07, "loss": 0.0008, "reward": 3.9269603490829468, "reward_std": 0.02615117933601141, "rewards/answer_entity_reward": 0.9908459782600403, "rewards/answer_wer_reward": 0.9372480809688568, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988662898540497, "step": 590 }, { "completion_length": 204.09375, "epoch": 1.8896, "grad_norm": 2.6849443912506104, "kl": 0.128173828125, "learning_rate": 2.625e-07, "loss": 0.0013, "reward": 3.912359118461609, "reward_std": 0.025913351215422153, "rewards/answer_entity_reward": 0.9798610806465149, "rewards/answer_wer_reward": 0.9567141532897949, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9757837951183319, "step": 591 }, { "completion_length": 229.75, "epoch": 1.8928, "grad_norm": 12.276920318603516, "kl": 0.524169921875, "learning_rate": 2.6125e-07, "loss": 0.0052, "reward": 3.8932021856307983, "reward_std": 0.014225118793547153, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9339624643325806, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9610779881477356, "step": 592 }, { "completion_length": 172.0, "epoch": 1.896, "grad_norm": 3.136312961578369, "kl": 0.197509765625, "learning_rate": 2.6e-07, "loss": 0.002, "reward": 3.927412748336792, "reward_std": 0.02914919052273035, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9748775362968445, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9525350630283356, "step": 593 }, { "completion_length": 231.625, "epoch": 1.8992, "grad_norm": 1.4952311515808105, "kl": 0.0966796875, "learning_rate": 2.5874999999999996e-07, "loss": 0.001, "reward": 3.920572519302368, "reward_std": 0.023940533865243196, "rewards/answer_entity_reward": 0.9852676391601562, "rewards/answer_wer_reward": 0.9361679553985596, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991368651390076, "step": 594 }, { "completion_length": 222.71875, "epoch": 1.9024, "grad_norm": 1.1621572971343994, "kl": 0.10302734375, "learning_rate": 2.5749999999999997e-07, "loss": 0.001, "reward": 3.8978298902511597, "reward_std": 0.07608090154826641, "rewards/answer_entity_reward": 0.9649057686328888, "rewards/answer_wer_reward": 0.9345213770866394, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984027147293091, "step": 595 }, { "completion_length": 254.21875, "epoch": 1.9056, "grad_norm": 0.9472298622131348, "kl": 0.092041015625, "learning_rate": 2.5625e-07, "loss": 0.0009, "reward": 3.916618824005127, "reward_std": 0.015597880817949772, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9208222925662994, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978796541690826, "step": 596 }, { "completion_length": 247.84375, "epoch": 1.9088, "grad_norm": 1.7148473262786865, "kl": 0.106201171875, "learning_rate": 2.55e-07, "loss": 0.0011, "reward": 3.9292455911636353, "reward_std": 0.007026449544355273, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9444275796413422, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987069070339203, "step": 597 }, { "completion_length": 183.21875, "epoch": 1.912, "grad_norm": 1.6502317190170288, "kl": 0.119140625, "learning_rate": 2.5374999999999995e-07, "loss": 0.0012, "reward": 3.9383983612060547, "reward_std": 0.03170687519013882, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9614830911159515, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.981379508972168, "step": 598 }, { "completion_length": 167.625, "epoch": 1.9152, "grad_norm": 1.1803314685821533, "kl": 0.137939453125, "learning_rate": 2.5249999999999996e-07, "loss": 0.0014, "reward": 3.9067423343658447, "reward_std": 0.013731301296502352, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.949960470199585, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9567819237709045, "step": 599 }, { "completion_length": 199.75, "epoch": 1.9184, "grad_norm": 1.3902597427368164, "kl": 0.080322265625, "learning_rate": 2.5125e-07, "loss": 0.0008, "reward": 3.927718758583069, "reward_std": 0.02047483716160059, "rewards/answer_entity_reward": 0.9840544760227203, "rewards/answer_wer_reward": 0.9543785154819489, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9892857074737549, "step": 600 }, { "completion_length": 210.375, "epoch": 1.9216, "grad_norm": 1.122063159942627, "kl": 0.120361328125, "learning_rate": 2.5e-07, "loss": 0.0012, "reward": 3.942714214324951, "reward_std": 0.027352871373295784, "rewards/answer_entity_reward": 0.9909090995788574, "rewards/answer_wer_reward": 0.9525187313556671, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992863833904266, "step": 601 }, { "completion_length": 191.28125, "epoch": 1.9247999999999998, "grad_norm": 1.9480561017990112, "kl": 0.092529296875, "learning_rate": 2.4875e-07, "loss": 0.0009, "reward": 3.8946096897125244, "reward_std": 0.07258242554962635, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9430054724216461, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9629679620265961, "step": 602 }, { "completion_length": 210.0, "epoch": 1.928, "grad_norm": 1.522335171699524, "kl": 0.08203125, "learning_rate": 2.475e-07, "loss": 0.0008, "reward": 3.9419585466384888, "reward_std": 0.02094284538179636, "rewards/answer_entity_reward": 0.9960784316062927, "rewards/answer_wer_reward": 0.9462659358978271, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996141791343689, "step": 603 }, { "completion_length": 194.375, "epoch": 1.9312, "grad_norm": 1.9648785591125488, "kl": 0.2880859375, "learning_rate": 2.4624999999999997e-07, "loss": 0.0029, "reward": 3.9480878114700317, "reward_std": 0.015357580035924911, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9483262896537781, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997614622116089, "step": 604 }, { "completion_length": 220.75, "epoch": 1.9344000000000001, "grad_norm": 2.2600207328796387, "kl": 0.09814453125, "learning_rate": 2.45e-07, "loss": 0.001, "reward": 3.92673122882843, "reward_std": 0.025613561272621155, "rewards/answer_entity_reward": 0.9899475276470184, "rewards/answer_wer_reward": 0.9408612251281738, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9959224164485931, "step": 605 }, { "completion_length": 159.125, "epoch": 1.9376, "grad_norm": 3.3259623050689697, "kl": 0.15869140625, "learning_rate": 2.4375e-07, "loss": 0.0016, "reward": 3.9267284870147705, "reward_std": 0.024998134351335466, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.9395100474357605, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997184872627258, "step": 606 }, { "completion_length": 250.0625, "epoch": 1.9407999999999999, "grad_norm": 1.4518193006515503, "kl": 0.1357421875, "learning_rate": 2.425e-07, "loss": 0.0014, "reward": 3.825323224067688, "reward_std": 0.11214365810155869, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.9456245005130768, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9172618985176086, "step": 607 }, { "completion_length": 219.53125, "epoch": 1.944, "grad_norm": 1.2852040529251099, "kl": 0.081298828125, "learning_rate": 2.4124999999999997e-07, "loss": 0.0008, "reward": 3.9620739221572876, "reward_std": 0.00826547248288989, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9646617472171783, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974121451377869, "step": 608 }, { "completion_length": 232.875, "epoch": 1.9472, "grad_norm": 4.870666027069092, "kl": 0.111328125, "learning_rate": 2.4e-07, "loss": 0.0011, "reward": 3.9366871118545532, "reward_std": 0.022448008647188544, "rewards/answer_entity_reward": 0.9916141629219055, "rewards/answer_wer_reward": 0.9457239210605621, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993489384651184, "step": 609 }, { "completion_length": 241.625, "epoch": 1.9504000000000001, "grad_norm": 1.8101410865783691, "kl": 0.096435546875, "learning_rate": 2.3875e-07, "loss": 0.001, "reward": 3.9368724822998047, "reward_std": 0.022243991494178772, "rewards/answer_entity_reward": 0.9929924309253693, "rewards/answer_wer_reward": 0.943880021572113, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 610 }, { "completion_length": 223.0, "epoch": 1.9536, "grad_norm": 0.8068660497665405, "kl": 0.099609375, "learning_rate": 2.3749999999999998e-07, "loss": 0.001, "reward": 3.9270153045654297, "reward_std": 0.01834964146837592, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.931628555059433, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988590180873871, "step": 611 }, { "completion_length": 259.59375, "epoch": 1.9567999999999999, "grad_norm": 1.522141695022583, "kl": 0.08203125, "learning_rate": 2.3625e-07, "loss": 0.0008, "reward": 3.9466216564178467, "reward_std": 0.015272928401827812, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9516074061393738, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984864592552185, "step": 612 }, { "completion_length": 211.6875, "epoch": 1.96, "grad_norm": 5.929853916168213, "kl": 0.10205078125, "learning_rate": 2.3499999999999997e-07, "loss": 0.001, "reward": 3.912535309791565, "reward_std": 0.026867160573601723, "rewards/answer_entity_reward": 0.9908565580844879, "rewards/answer_wer_reward": 0.9229700565338135, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987086653709412, "step": 613 }, { "completion_length": 222.40625, "epoch": 1.9632, "grad_norm": 2.7727534770965576, "kl": 0.106689453125, "learning_rate": 2.3375e-07, "loss": 0.0011, "reward": 3.9305132627487183, "reward_std": 0.07138971472159028, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9646386206150055, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9658747315406799, "step": 614 }, { "completion_length": 222.34375, "epoch": 1.9664000000000001, "grad_norm": 1.5660823583602905, "kl": 0.09912109375, "learning_rate": 2.325e-07, "loss": 0.001, "reward": 3.935341477394104, "reward_std": 0.01785436598584056, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9401703774929047, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997254341840744, "step": 615 }, { "completion_length": 236.34375, "epoch": 1.9696, "grad_norm": 1.94826340675354, "kl": 0.078857421875, "learning_rate": 2.3125e-07, "loss": 0.0008, "reward": 3.958570718765259, "reward_std": 0.008256069151684642, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9587988257408142, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997718930244446, "step": 616 }, { "completion_length": 143.5, "epoch": 1.9727999999999999, "grad_norm": 2.0813863277435303, "kl": 0.121337890625, "learning_rate": 2.3e-07, "loss": 0.0012, "reward": 3.886753797531128, "reward_std": 0.027786132879555225, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9573519229888916, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9294019639492035, "step": 617 }, { "completion_length": 227.75, "epoch": 1.976, "grad_norm": 8.525589942932129, "kl": 0.097900390625, "learning_rate": 2.2875e-07, "loss": 0.001, "reward": 3.857698917388916, "reward_std": 0.07474052533507347, "rewards/answer_entity_reward": 0.9985119104385376, "rewards/answer_wer_reward": 0.9551934599876404, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9039934873580933, "step": 618 }, { "completion_length": 233.46875, "epoch": 1.9792, "grad_norm": 1.966539978981018, "kl": 0.082763671875, "learning_rate": 2.275e-07, "loss": 0.0008, "reward": 3.9442771673202515, "reward_std": 0.018204713938757777, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9551240801811218, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9948348999023438, "step": 619 }, { "completion_length": 254.59375, "epoch": 1.9824000000000002, "grad_norm": 2.300699234008789, "kl": 0.338134765625, "learning_rate": 2.2625e-07, "loss": 0.0034, "reward": 3.9195804595947266, "reward_std": 0.014696986880153418, "rewards/answer_entity_reward": 0.974116176366806, "rewards/answer_wer_reward": 0.9479033648967743, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975608885288239, "step": 620 }, { "completion_length": 239.71875, "epoch": 1.9856, "grad_norm": 20.98819923400879, "kl": 0.16015625, "learning_rate": 2.25e-07, "loss": 0.0016, "reward": 3.858734607696533, "reward_std": 0.14412511140108109, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9278987050056458, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9683361053466797, "step": 621 }, { "completion_length": 225.1875, "epoch": 1.9888, "grad_norm": 2.4856221675872803, "kl": 0.1181640625, "learning_rate": 2.2375e-07, "loss": 0.0012, "reward": 3.92032527923584, "reward_std": 0.030348293483257294, "rewards/answer_entity_reward": 0.9947916567325592, "rewards/answer_wer_reward": 0.9266910254955292, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988425970077515, "step": 622 }, { "completion_length": 209.4375, "epoch": 1.992, "grad_norm": 2.2857937812805176, "kl": 0.109619140625, "learning_rate": 2.225e-07, "loss": 0.0011, "reward": 3.790956974029541, "reward_std": 0.07298576645553112, "rewards/answer_entity_reward": 0.993697464466095, "rewards/answer_wer_reward": 0.93813356757164, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8591260015964508, "step": 623 }, { "completion_length": 182.90625, "epoch": 1.9952, "grad_norm": 4.463064670562744, "kl": 0.126953125, "learning_rate": 2.2125e-07, "loss": 0.0013, "reward": 3.906226873397827, "reward_std": 0.0698380870744586, "rewards/answer_entity_reward": 0.992799699306488, "rewards/answer_wer_reward": 0.9589782953262329, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9544488191604614, "step": 624 }, { "completion_length": 232.875, "epoch": 1.9984, "grad_norm": 1.20980966091156, "kl": 0.1044921875, "learning_rate": 2.1999999999999998e-07, "loss": 0.001, "reward": 3.9167827367782593, "reward_std": 0.01762760616838932, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9183346629142761, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984481334686279, "step": 625 }, { "completion_length": 177.75, "epoch": 2.0, "grad_norm": 2.3776934146881104, "kl": 0.11865234375, "learning_rate": 2.1875e-07, "loss": 0.0006, "reward": 3.8532142639160156, "reward_std": 0.018374208360910416, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9639798402786255, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8929109573364258, "step": 626 }, { "completion_length": 234.6875, "epoch": 2.0032, "grad_norm": 1.611534833908081, "kl": 0.13134765625, "learning_rate": 2.1749999999999998e-07, "loss": 0.0013, "reward": 3.9422988891601562, "reward_std": 0.020460932981222868, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9433672726154327, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989316165447235, "step": 627 }, { "completion_length": 174.4375, "epoch": 2.0064, "grad_norm": 3.054837942123413, "kl": 0.1240234375, "learning_rate": 2.1625e-07, "loss": 0.0012, "reward": 3.958639144897461, "reward_std": 0.009486648719757795, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9633738994598389, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9952651560306549, "step": 628 }, { "completion_length": 222.15625, "epoch": 2.0096, "grad_norm": 4.340118885040283, "kl": 0.093017578125, "learning_rate": 2.1499999999999998e-07, "loss": 0.0009, "reward": 3.8726435899734497, "reward_std": 0.033597009256482124, "rewards/answer_entity_reward": 0.9919143319129944, "rewards/answer_wer_reward": 0.9492302238941193, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9314990937709808, "step": 629 }, { "completion_length": 207.8125, "epoch": 2.0128, "grad_norm": 2.162853717803955, "kl": 0.109130859375, "learning_rate": 2.1375e-07, "loss": 0.0011, "reward": 3.9318206310272217, "reward_std": 0.01979170460253954, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9489758312702179, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9863169491291046, "step": 630 }, { "completion_length": 239.90625, "epoch": 2.016, "grad_norm": 1.676960825920105, "kl": 0.182861328125, "learning_rate": 2.1249999999999998e-07, "loss": 0.0018, "reward": 3.937206506729126, "reward_std": 0.014235546346753836, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9372064471244812, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 631 }, { "completion_length": 194.28125, "epoch": 2.0192, "grad_norm": 5.123164176940918, "kl": 0.110107421875, "learning_rate": 2.1125e-07, "loss": 0.0011, "reward": 3.7548152208328247, "reward_std": 0.10348369181156158, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.8692809343338013, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8912160098552704, "step": 632 }, { "completion_length": 214.5625, "epoch": 2.0224, "grad_norm": 1.3529505729675293, "kl": 0.106689453125, "learning_rate": 2.0999999999999997e-07, "loss": 0.0011, "reward": 3.8920629024505615, "reward_std": 0.01197694381698966, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9270462095737457, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9650167524814606, "step": 633 }, { "completion_length": 194.25, "epoch": 2.0256, "grad_norm": 1.6181930303573608, "kl": 0.109130859375, "learning_rate": 2.0874999999999999e-07, "loss": 0.0011, "reward": 3.9565629959106445, "reward_std": 0.021904858760535717, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9613706469535828, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 634 }, { "completion_length": 206.59375, "epoch": 2.0288, "grad_norm": 2.353773832321167, "kl": 0.0986328125, "learning_rate": 2.0749999999999997e-07, "loss": 0.001, "reward": 3.919626474380493, "reward_std": 0.02727056946605444, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.9333742260932922, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987522065639496, "step": 635 }, { "completion_length": 189.71875, "epoch": 2.032, "grad_norm": 1.6075130701065063, "kl": 0.13720703125, "learning_rate": 2.0624999999999998e-07, "loss": 0.0014, "reward": 3.9046106338500977, "reward_std": 0.025621079374104738, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9407951831817627, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9694972634315491, "step": 636 }, { "completion_length": 230.46875, "epoch": 2.0352, "grad_norm": 5.240235805511475, "kl": 0.10302734375, "learning_rate": 2.0499999999999997e-07, "loss": 0.001, "reward": 3.9211699962615967, "reward_std": 0.017814213410019875, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9250318109989166, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982215464115143, "step": 637 }, { "completion_length": 209.59375, "epoch": 2.0384, "grad_norm": 2.4782729148864746, "kl": 0.083984375, "learning_rate": 2.0374999999999998e-07, "loss": 0.0008, "reward": 3.894644021987915, "reward_std": 0.020965205505490303, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9609961807727814, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9336478412151337, "step": 638 }, { "completion_length": 233.5, "epoch": 2.0416, "grad_norm": 1.102921485900879, "kl": 0.089599609375, "learning_rate": 2.025e-07, "loss": 0.0009, "reward": 3.9374464750289917, "reward_std": 0.015141086652874947, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9409857094287872, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993016719818115, "step": 639 }, { "completion_length": 227.84375, "epoch": 2.0448, "grad_norm": 1.3384666442871094, "kl": 0.0908203125, "learning_rate": 2.0125e-07, "loss": 0.0009, "reward": 3.9045239686965942, "reward_std": 0.12723269453272223, "rewards/answer_entity_reward": 0.96875, "rewards/answer_wer_reward": 0.9367768168449402, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998997151851654, "step": 640 }, { "completion_length": 175.625, "epoch": 2.048, "grad_norm": 0.6850874423980713, "kl": 0.124267578125, "learning_rate": 2e-07, "loss": 0.0012, "reward": 3.929681897163391, "reward_std": 0.008345533395186067, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9302853643894196, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993966221809387, "step": 641 }, { "completion_length": 213.59375, "epoch": 2.0512, "grad_norm": 2.9222097396850586, "kl": 0.101318359375, "learning_rate": 1.9875e-07, "loss": 0.001, "reward": 3.8092339038848877, "reward_std": 0.11687304638326168, "rewards/answer_entity_reward": 0.9529532790184021, "rewards/answer_wer_reward": 0.8997257351875305, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9565548896789551, "step": 642 }, { "completion_length": 205.5, "epoch": 2.0544, "grad_norm": 1.1586568355560303, "kl": 0.092041015625, "learning_rate": 1.975e-07, "loss": 0.0009, "reward": 3.9247117042541504, "reward_std": 0.011728376615792513, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.935352236032486, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9893594086170197, "step": 643 }, { "completion_length": 203.09375, "epoch": 2.0576, "grad_norm": 1.5699268579483032, "kl": 0.09326171875, "learning_rate": 1.9625e-07, "loss": 0.0009, "reward": 3.9444518089294434, "reward_std": 0.020181890577077866, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9571816027164459, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9907424449920654, "step": 644 }, { "completion_length": 203.4375, "epoch": 2.0608, "grad_norm": 1.7927268743515015, "kl": 0.15478515625, "learning_rate": 1.9499999999999999e-07, "loss": 0.0015, "reward": 3.9478741884231567, "reward_std": 0.01690173940733075, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9509572982788086, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997577667236328, "step": 645 }, { "completion_length": 249.40625, "epoch": 2.064, "grad_norm": 1.3610011339187622, "kl": 0.09033203125, "learning_rate": 1.9375e-07, "loss": 0.0009, "reward": 3.817861795425415, "reward_std": 0.1958598094061017, "rewards/answer_entity_reward": 0.990950733423233, "rewards/answer_wer_reward": 0.8910082578659058, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9671527743339539, "step": 646 }, { "completion_length": 211.84375, "epoch": 2.0672, "grad_norm": 1.7078856229782104, "kl": 0.104248046875, "learning_rate": 1.9249999999999998e-07, "loss": 0.001, "reward": 3.9115630388259888, "reward_std": 0.024524363689124584, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9344038963317871, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9800000190734863, "step": 647 }, { "completion_length": 250.3125, "epoch": 2.0704, "grad_norm": 1.6208539009094238, "kl": 0.10205078125, "learning_rate": 1.9125e-07, "loss": 0.001, "reward": 3.8414340019226074, "reward_std": 0.15159638598561287, "rewards/answer_entity_reward": 0.9867424070835114, "rewards/answer_wer_reward": 0.9219352900981903, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.964006245136261, "step": 648 }, { "completion_length": 191.21875, "epoch": 2.0736, "grad_norm": 2.747303009033203, "kl": 0.123046875, "learning_rate": 1.8999999999999998e-07, "loss": 0.0012, "reward": 3.929761052131653, "reward_std": 0.029091503005474806, "rewards/answer_entity_reward": 0.9930555522441864, "rewards/answer_wer_reward": 0.9552291929721832, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9814762473106384, "step": 649 }, { "completion_length": 242.21875, "epoch": 2.0768, "grad_norm": 1.213749647140503, "kl": 0.08203125, "learning_rate": 1.8875e-07, "loss": 0.0008, "reward": 3.9264228343963623, "reward_std": 0.022060640156269073, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9312105178833008, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9972956776618958, "step": 650 }, { "completion_length": 220.65625, "epoch": 2.08, "grad_norm": 6.092029571533203, "kl": 0.100830078125, "learning_rate": 1.875e-07, "loss": 0.001, "reward": 3.9253735542297363, "reward_std": 0.07221902348101139, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9613818228244781, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9668327569961548, "step": 651 }, { "completion_length": 182.5625, "epoch": 2.0832, "grad_norm": 1.7553961277008057, "kl": 0.11279296875, "learning_rate": 1.8625e-07, "loss": 0.0011, "reward": 3.8646005392074585, "reward_std": 0.14707163721323013, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.931645005941391, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9676778018474579, "step": 652 }, { "completion_length": 232.3125, "epoch": 2.0864, "grad_norm": 1.1559618711471558, "kl": 0.091796875, "learning_rate": 1.85e-07, "loss": 0.0009, "reward": 3.957954168319702, "reward_std": 0.013995198532938957, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.963512659072876, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986082315444946, "step": 653 }, { "completion_length": 223.15625, "epoch": 2.0896, "grad_norm": 2.3205788135528564, "kl": 0.10302734375, "learning_rate": 1.8375e-07, "loss": 0.001, "reward": 3.9284400939941406, "reward_std": 0.02101885131560266, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9305233955383301, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 654 }, { "completion_length": 174.03125, "epoch": 2.0928, "grad_norm": 3.2282309532165527, "kl": 0.089599609375, "learning_rate": 1.825e-07, "loss": 0.0009, "reward": 3.913803219795227, "reward_std": 0.06426881160587072, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9792838096618652, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.944936066865921, "step": 655 }, { "completion_length": 209.40625, "epoch": 2.096, "grad_norm": 3.0301027297973633, "kl": 0.15478515625, "learning_rate": 1.8124999999999999e-07, "loss": 0.0015, "reward": 3.7523492574691772, "reward_std": 0.15331693179905415, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9194472134113312, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8641521334648132, "step": 656 }, { "completion_length": 183.46875, "epoch": 2.0992, "grad_norm": 3.859424591064453, "kl": 0.10498046875, "learning_rate": 1.8e-07, "loss": 0.001, "reward": 3.9188934564590454, "reward_std": 0.016068585216999054, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9425098896026611, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9784668385982513, "step": 657 }, { "completion_length": 249.28125, "epoch": 2.1024, "grad_norm": 1.1957335472106934, "kl": 0.0732421875, "learning_rate": 1.7874999999999998e-07, "loss": 0.0007, "reward": 3.920902967453003, "reward_std": 0.009091381449252367, "rewards/answer_entity_reward": 0.982051283121109, "rewards/answer_wer_reward": 0.9388516247272491, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 658 }, { "completion_length": 215.1875, "epoch": 2.1056, "grad_norm": 0.9195266962051392, "kl": 0.08349609375, "learning_rate": 1.775e-07, "loss": 0.0008, "reward": 3.9518920183181763, "reward_std": 0.00956010865047574, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9591186344623566, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9927734434604645, "step": 659 }, { "completion_length": 247.21875, "epoch": 2.1088, "grad_norm": 1.4949768781661987, "kl": 0.109130859375, "learning_rate": 1.7624999999999998e-07, "loss": 0.0011, "reward": 3.9060639142990112, "reward_std": 0.029787511564791203, "rewards/answer_entity_reward": 0.9838598966598511, "rewards/answer_wer_reward": 0.9244924187660217, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977116584777832, "step": 660 }, { "completion_length": 213.15625, "epoch": 2.112, "grad_norm": 2.8325705528259277, "kl": 0.08740234375, "learning_rate": 1.75e-07, "loss": 0.0009, "reward": 3.951107144355774, "reward_std": 0.019360109698027372, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9607862234115601, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9931618571281433, "step": 661 }, { "completion_length": 210.1875, "epoch": 2.1152, "grad_norm": 4.155531883239746, "kl": 0.12890625, "learning_rate": 1.7374999999999998e-07, "loss": 0.0013, "reward": 3.8224732875823975, "reward_std": 0.200032701715827, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.909185916185379, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9480096995830536, "step": 662 }, { "completion_length": 220.59375, "epoch": 2.1184, "grad_norm": 1.299959421157837, "kl": 0.091796875, "learning_rate": 1.725e-07, "loss": 0.0009, "reward": 3.959660768508911, "reward_std": 0.009787917137145996, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9610175788402557, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986431002616882, "step": 663 }, { "completion_length": 203.875, "epoch": 2.1216, "grad_norm": 1.2713404893875122, "kl": 0.0849609375, "learning_rate": 1.7125e-07, "loss": 0.0008, "reward": 3.913174271583557, "reward_std": 0.03005001787096262, "rewards/answer_entity_reward": 0.9858973920345306, "rewards/answer_wer_reward": 0.9283359348773956, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989408254623413, "step": 664 }, { "completion_length": 235.6875, "epoch": 2.1248, "grad_norm": 13.369653701782227, "kl": 0.167236328125, "learning_rate": 1.7000000000000001e-07, "loss": 0.0017, "reward": 3.862402558326721, "reward_std": 0.1546822851523757, "rewards/answer_entity_reward": 0.9921875, "rewards/answer_wer_reward": 0.9333168268203735, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9681483209133148, "step": 665 }, { "completion_length": 154.8125, "epoch": 2.128, "grad_norm": 35.12384033203125, "kl": 0.115966796875, "learning_rate": 1.6875e-07, "loss": 0.0012, "reward": 3.9367175102233887, "reward_std": 0.02194784674793482, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9534772336483002, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9832403361797333, "step": 666 }, { "completion_length": 195.28125, "epoch": 2.1312, "grad_norm": 1.2815937995910645, "kl": 0.107666015625, "learning_rate": 1.675e-07, "loss": 0.0011, "reward": 3.937751293182373, "reward_std": 0.014415924437344074, "rewards/answer_entity_reward": 0.9930555522441864, "rewards/answer_wer_reward": 0.9462690353393555, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984267055988312, "step": 667 }, { "completion_length": 225.90625, "epoch": 2.1344, "grad_norm": 0.840438723564148, "kl": 0.12841796875, "learning_rate": 1.6625e-07, "loss": 0.0013, "reward": 3.9389572143554688, "reward_std": 0.01061929203569889, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.939858615398407, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990985691547394, "step": 668 }, { "completion_length": 186.6875, "epoch": 2.1376, "grad_norm": 1.6506493091583252, "kl": 0.081787109375, "learning_rate": 1.65e-07, "loss": 0.0008, "reward": 3.956157922744751, "reward_std": 0.008805734105408192, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9561578929424286, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 669 }, { "completion_length": 256.84375, "epoch": 2.1408, "grad_norm": 1.2955864667892456, "kl": 0.134765625, "learning_rate": 1.6375e-07, "loss": 0.0013, "reward": 3.924846053123474, "reward_std": 0.016075235791504383, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9267153441905975, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998130738735199, "step": 670 }, { "completion_length": 205.8125, "epoch": 2.144, "grad_norm": 2.9848484992980957, "kl": 0.09375, "learning_rate": 1.625e-07, "loss": 0.0009, "reward": 3.920342206954956, "reward_std": 0.017433147877454758, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9359186589717865, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9844235181808472, "step": 671 }, { "completion_length": 205.25, "epoch": 2.1471999999999998, "grad_norm": 3.0758063793182373, "kl": 0.0888671875, "learning_rate": 1.6125e-07, "loss": 0.0009, "reward": 3.94283390045166, "reward_std": 0.03408639598637819, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9549511075019836, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9878826439380646, "step": 672 }, { "completion_length": 185.3125, "epoch": 2.1504, "grad_norm": 4.493408203125, "kl": 0.1298828125, "learning_rate": 1.6e-07, "loss": 0.0013, "reward": 3.7623226642608643, "reward_std": 0.053579739294946194, "rewards/answer_entity_reward": 0.9799679517745972, "rewards/answer_wer_reward": 0.930513322353363, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8518414497375488, "step": 673 }, { "completion_length": 238.5, "epoch": 2.1536, "grad_norm": 1.0133973360061646, "kl": 0.074462890625, "learning_rate": 1.5875e-07, "loss": 0.0008, "reward": 3.949939250946045, "reward_std": 0.01046135206706822, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9593237638473511, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997968465089798, "step": 674 }, { "completion_length": 204.75, "epoch": 2.1568, "grad_norm": 2.416959762573242, "kl": 0.24658203125, "learning_rate": 1.575e-07, "loss": 0.0025, "reward": 3.8200684785842896, "reward_std": 0.014629668090492487, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9454044103622437, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8746639788150787, "step": 675 }, { "completion_length": 201.0625, "epoch": 2.16, "grad_norm": 1.1082431077957153, "kl": 0.103515625, "learning_rate": 1.5624999999999999e-07, "loss": 0.001, "reward": 3.9603604078292847, "reward_std": 0.013338471297174692, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9646645486354828, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985367655754089, "step": 676 }, { "completion_length": 200.5, "epoch": 2.1632, "grad_norm": 1.243102788925171, "kl": 0.088623046875, "learning_rate": 1.55e-07, "loss": 0.0009, "reward": 3.9476585388183594, "reward_std": 0.014779110439121723, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9507860839366913, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997133016586304, "step": 677 }, { "completion_length": 235.0, "epoch": 2.1664, "grad_norm": 1.9828643798828125, "kl": 0.076171875, "learning_rate": 1.5374999999999998e-07, "loss": 0.0008, "reward": 3.840447187423706, "reward_std": 0.12814121507108212, "rewards/answer_entity_reward": 0.9507211446762085, "rewards/answer_wer_reward": 0.8900850713253021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996408224105835, "step": 678 }, { "completion_length": 209.0625, "epoch": 2.1696, "grad_norm": 1.306552529335022, "kl": 0.09130859375, "learning_rate": 1.525e-07, "loss": 0.0009, "reward": 3.9448314905166626, "reward_std": 0.016167795285582542, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9459536671638489, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988778233528137, "step": 679 }, { "completion_length": 206.84375, "epoch": 2.1728, "grad_norm": 1.7964757680892944, "kl": 0.1064453125, "learning_rate": 1.5124999999999998e-07, "loss": 0.0011, "reward": 3.9424251317977905, "reward_std": 0.0136543451808393, "rewards/answer_entity_reward": 0.9895104765892029, "rewards/answer_wer_reward": 0.9541302621364594, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987844526767731, "step": 680 }, { "completion_length": 244.21875, "epoch": 2.176, "grad_norm": 1.3341420888900757, "kl": 0.08642578125, "learning_rate": 1.5e-07, "loss": 0.0009, "reward": 3.9383710622787476, "reward_std": 0.01660554250702262, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9446630477905273, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9981722235679626, "step": 681 }, { "completion_length": 248.84375, "epoch": 2.1792, "grad_norm": 0.9630815386772156, "kl": 0.0888671875, "learning_rate": 1.4874999999999998e-07, "loss": 0.0009, "reward": 3.949557065963745, "reward_std": 0.01444097189232707, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9514444172382355, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9981126189231873, "step": 682 }, { "completion_length": 222.5625, "epoch": 2.1824, "grad_norm": 1.4436620473861694, "kl": 0.091796875, "learning_rate": 1.475e-07, "loss": 0.0009, "reward": 3.9340105056762695, "reward_std": 0.011837240774184465, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9374523460865021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993990361690521, "step": 683 }, { "completion_length": 220.3125, "epoch": 2.1856, "grad_norm": 1.7951076030731201, "kl": 0.13623046875, "learning_rate": 1.4624999999999998e-07, "loss": 0.0014, "reward": 3.9159233570098877, "reward_std": 0.022063229698687792, "rewards/answer_entity_reward": 0.9825946092605591, "rewards/answer_wer_reward": 0.935338944196701, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979897737503052, "step": 684 }, { "completion_length": 222.71875, "epoch": 2.1888, "grad_norm": 2.693173885345459, "kl": 0.090576171875, "learning_rate": 1.45e-07, "loss": 0.0009, "reward": 3.8880432844161987, "reward_std": 0.03081614337861538, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9077447652816772, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9827025234699249, "step": 685 }, { "completion_length": 263.5, "epoch": 2.192, "grad_norm": 5.544942855834961, "kl": 0.122802734375, "learning_rate": 1.4374999999999997e-07, "loss": 0.0012, "reward": 3.906672716140747, "reward_std": 0.017923741601407528, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9078539311885834, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988189041614532, "step": 686 }, { "completion_length": 219.0625, "epoch": 2.1952, "grad_norm": 1.3066198825836182, "kl": 0.13720703125, "learning_rate": 1.4249999999999999e-07, "loss": 0.0014, "reward": 3.8579492568969727, "reward_std": 0.11750033870339394, "rewards/answer_entity_reward": 0.9445319771766663, "rewards/answer_wer_reward": 0.9204041659832001, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9930130541324615, "step": 687 }, { "completion_length": 203.09375, "epoch": 2.1984, "grad_norm": 2.9115042686462402, "kl": 0.130859375, "learning_rate": 1.4124999999999997e-07, "loss": 0.0013, "reward": 3.929637312889099, "reward_std": 0.07596011366695166, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9678620994091034, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9652473330497742, "step": 688 }, { "completion_length": 205.375, "epoch": 2.2016, "grad_norm": 2.322467803955078, "kl": 0.087158203125, "learning_rate": 1.4e-07, "loss": 0.0009, "reward": 3.8996429443359375, "reward_std": 0.06278708390891552, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.9460262954235077, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.959929883480072, "step": 689 }, { "completion_length": 225.0625, "epoch": 2.2048, "grad_norm": 2.730459213256836, "kl": 0.0791015625, "learning_rate": 1.3875e-07, "loss": 0.0008, "reward": 3.916098475456238, "reward_std": 0.02135017653927207, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9546346664428711, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9677136838436127, "step": 690 }, { "completion_length": 154.09375, "epoch": 2.208, "grad_norm": 2.1384575366973877, "kl": 0.08740234375, "learning_rate": 1.375e-07, "loss": 0.0009, "reward": 3.8095338344573975, "reward_std": 0.02021293295547366, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9482340812683105, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8612997531890869, "step": 691 }, { "completion_length": 160.6875, "epoch": 2.2112, "grad_norm": 1.9878817796707153, "kl": 0.1083984375, "learning_rate": 1.3625e-07, "loss": 0.0011, "reward": 3.8489197492599487, "reward_std": 0.06898931134492159, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.929458349943161, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9194613993167877, "step": 692 }, { "completion_length": 209.0625, "epoch": 2.2144, "grad_norm": 1.895799994468689, "kl": 0.11669921875, "learning_rate": 1.35e-07, "loss": 0.0012, "reward": 3.897484064102173, "reward_std": 0.02146145049482584, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.926066517829895, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9714176058769226, "step": 693 }, { "completion_length": 248.3125, "epoch": 2.2176, "grad_norm": 2.0095603466033936, "kl": 0.1015625, "learning_rate": 1.3375e-07, "loss": 0.001, "reward": 3.9220433235168457, "reward_std": 0.014254164882004261, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9220432937145233, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 694 }, { "completion_length": 223.0, "epoch": 2.2208, "grad_norm": 1.6252143383026123, "kl": 0.12158203125, "learning_rate": 1.325e-07, "loss": 0.0012, "reward": 3.9079580307006836, "reward_std": 0.02597262989729643, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9434219896793365, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9645361006259918, "step": 695 }, { "completion_length": 187.5, "epoch": 2.224, "grad_norm": 1.387984275817871, "kl": 0.0947265625, "learning_rate": 1.3125e-07, "loss": 0.0009, "reward": 3.9440150260925293, "reward_std": 0.015697208931669593, "rewards/answer_entity_reward": 0.9816919267177582, "rewards/answer_wer_reward": 0.9631733596324921, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991496503353119, "step": 696 }, { "completion_length": 203.34375, "epoch": 2.2272, "grad_norm": 2.2257113456726074, "kl": 0.110595703125, "learning_rate": 1.3e-07, "loss": 0.0011, "reward": 3.885230541229248, "reward_std": 0.023641248233616352, "rewards/answer_entity_reward": 0.9805992841720581, "rewards/answer_wer_reward": 0.9495046138763428, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9551265835762024, "step": 697 }, { "completion_length": 201.8125, "epoch": 2.2304, "grad_norm": 1.595376968383789, "kl": 0.076171875, "learning_rate": 1.2874999999999998e-07, "loss": 0.0008, "reward": 3.9703818559646606, "reward_std": 0.01011386327445507, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9731970131397247, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999588817358017, "step": 698 }, { "completion_length": 230.6875, "epoch": 2.2336, "grad_norm": 1.6279692649841309, "kl": 0.12744140625, "learning_rate": 1.275e-07, "loss": 0.0013, "reward": 3.9279537200927734, "reward_std": 0.017515965271741152, "rewards/answer_entity_reward": 0.9880050718784332, "rewards/answer_wer_reward": 0.9430651664733887, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968834519386292, "step": 699 }, { "completion_length": 182.125, "epoch": 2.2368, "grad_norm": 2.8828890323638916, "kl": 0.18505859375, "learning_rate": 1.2624999999999998e-07, "loss": 0.0019, "reward": 3.8859431743621826, "reward_std": 0.142324005253613, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.9579981565475464, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9655081927776337, "step": 700 }, { "completion_length": 209.03125, "epoch": 2.24, "grad_norm": 2.9951937198638916, "kl": 0.12109375, "learning_rate": 1.25e-07, "loss": 0.0012, "reward": 3.7733466625213623, "reward_std": 0.025467259343713522, "rewards/answer_entity_reward": 0.9886675775051117, "rewards/answer_wer_reward": 0.9475079476833344, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8371710479259491, "step": 701 }, { "completion_length": 211.40625, "epoch": 2.2432, "grad_norm": 2.5615384578704834, "kl": 0.14208984375, "learning_rate": 1.2375e-07, "loss": 0.0014, "reward": 3.9001591205596924, "reward_std": 0.03031878173351288, "rewards/answer_entity_reward": 0.9910256266593933, "rewards/answer_wer_reward": 0.957984060049057, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9511493742465973, "step": 702 }, { "completion_length": 240.6875, "epoch": 2.2464, "grad_norm": 1.6149277687072754, "kl": 0.10888671875, "learning_rate": 1.225e-07, "loss": 0.0011, "reward": 3.917873740196228, "reward_std": 0.01580545213073492, "rewards/answer_entity_reward": 0.9787845611572266, "rewards/answer_wer_reward": 0.9405834674835205, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985057711601257, "step": 703 }, { "completion_length": 190.0625, "epoch": 2.2496, "grad_norm": 1.620892882347107, "kl": 0.087646484375, "learning_rate": 1.2125e-07, "loss": 0.0009, "reward": 3.954784393310547, "reward_std": 0.03893708251416683, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.962031751871109, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990026652812958, "step": 704 }, { "completion_length": 170.0, "epoch": 2.2528, "grad_norm": 1.5188636779785156, "kl": 0.111572265625, "learning_rate": 1.2e-07, "loss": 0.0011, "reward": 3.9031065702438354, "reward_std": 0.011392949614673853, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9356338381767273, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9813616275787354, "step": 705 }, { "completion_length": 211.46875, "epoch": 2.2560000000000002, "grad_norm": 2.7718734741210938, "kl": 0.102294921875, "learning_rate": 1.1874999999999999e-07, "loss": 0.001, "reward": 3.936674475669861, "reward_std": 0.021578084211796522, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9491873383522034, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9874871671199799, "step": 706 }, { "completion_length": 255.0, "epoch": 2.2592, "grad_norm": 1.6890819072723389, "kl": 0.099853515625, "learning_rate": 1.1749999999999999e-07, "loss": 0.001, "reward": 3.9247710704803467, "reward_std": 0.013670595828443766, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9279445707798004, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968266189098358, "step": 707 }, { "completion_length": 189.3125, "epoch": 2.2624, "grad_norm": 2.3591725826263428, "kl": 0.111328125, "learning_rate": 1.1625e-07, "loss": 0.0011, "reward": 3.929440498352051, "reward_std": 0.018895008601248264, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9316463768482208, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99779412150383, "step": 708 }, { "completion_length": 202.125, "epoch": 2.2656, "grad_norm": 5.1716766357421875, "kl": 0.142333984375, "learning_rate": 1.15e-07, "loss": 0.0014, "reward": 3.9494107961654663, "reward_std": 0.023188273422420025, "rewards/answer_entity_reward": 0.9902146458625793, "rewards/answer_wer_reward": 0.9693313241004944, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.989864856004715, "step": 709 }, { "completion_length": 241.90625, "epoch": 2.2688, "grad_norm": 2.9082345962524414, "kl": 0.15087890625, "learning_rate": 1.1375e-07, "loss": 0.0015, "reward": 3.877661347389221, "reward_std": 0.08314304798841476, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9211136996746063, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9669643044471741, "step": 710 }, { "completion_length": 222.40625, "epoch": 2.2720000000000002, "grad_norm": 2.9711413383483887, "kl": 0.123779296875, "learning_rate": 1.125e-07, "loss": 0.0012, "reward": 3.9322550296783447, "reward_std": 0.06140775140374899, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9581426084041595, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9775847494602203, "step": 711 }, { "completion_length": 242.6875, "epoch": 2.2752, "grad_norm": 6.453571796417236, "kl": 0.116943359375, "learning_rate": 1.1125e-07, "loss": 0.0012, "reward": 3.874239444732666, "reward_std": 0.017658520489931107, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.8984209299087524, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9782223105430603, "step": 712 }, { "completion_length": 206.0625, "epoch": 2.2784, "grad_norm": 2.0138731002807617, "kl": 0.10205078125, "learning_rate": 1.0999999999999999e-07, "loss": 0.001, "reward": 3.9465200901031494, "reward_std": 0.01707920106127858, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.950833261013031, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985276162624359, "step": 713 }, { "completion_length": 205.90625, "epoch": 2.2816, "grad_norm": 1.6215705871582031, "kl": 0.22216796875, "learning_rate": 1.0874999999999999e-07, "loss": 0.0022, "reward": 3.921483874320984, "reward_std": 0.017741497606039047, "rewards/answer_entity_reward": 0.9818618893623352, "rewards/answer_wer_reward": 0.9403572380542755, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992647171020508, "step": 714 }, { "completion_length": 165.53125, "epoch": 2.2848, "grad_norm": 2.939443349838257, "kl": 0.10302734375, "learning_rate": 1.0749999999999999e-07, "loss": 0.001, "reward": 3.8573367595672607, "reward_std": 0.05941922590136528, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9561320841312408, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9030430614948273, "step": 715 }, { "completion_length": 209.34375, "epoch": 2.288, "grad_norm": 3.167865753173828, "kl": 0.098876953125, "learning_rate": 1.0624999999999999e-07, "loss": 0.001, "reward": 3.9200966358184814, "reward_std": 0.011050965171307325, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9683842360973358, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9517123103141785, "step": 716 }, { "completion_length": 211.78125, "epoch": 2.2912, "grad_norm": 2.83433198928833, "kl": 0.157470703125, "learning_rate": 1.0499999999999999e-07, "loss": 0.0016, "reward": 3.888568639755249, "reward_std": 0.0255763940513134, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9216626286506653, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9669062495231628, "step": 717 }, { "completion_length": 233.21875, "epoch": 2.2944, "grad_norm": 1.1522959470748901, "kl": 0.123046875, "learning_rate": 1.0374999999999999e-07, "loss": 0.0012, "reward": 3.9315165281295776, "reward_std": 0.015323773492127657, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9349887073040009, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 718 }, { "completion_length": 219.6875, "epoch": 2.2976, "grad_norm": 2.8032352924346924, "kl": 0.097900390625, "learning_rate": 1.0249999999999998e-07, "loss": 0.001, "reward": 3.941191077232361, "reward_std": 0.014842316508293152, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9449678063392639, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962232708930969, "step": 719 }, { "completion_length": 247.75, "epoch": 2.3008, "grad_norm": 2.120060682296753, "kl": 0.10791015625, "learning_rate": 1.0125e-07, "loss": 0.0011, "reward": 3.7576488256454468, "reward_std": 0.034239969216287136, "rewards/answer_entity_reward": 0.9883012771606445, "rewards/answer_wer_reward": 0.9174286723136902, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8519188165664673, "step": 720 }, { "completion_length": 148.46875, "epoch": 2.304, "grad_norm": 3.453160047531128, "kl": 0.1357421875, "learning_rate": 1e-07, "loss": 0.0014, "reward": 3.9483038187026978, "reward_std": 0.010362145490944386, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9624904096126556, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9858134686946869, "step": 721 }, { "completion_length": 242.625, "epoch": 2.3072, "grad_norm": 1.0787523984909058, "kl": 0.08740234375, "learning_rate": 9.875e-08, "loss": 0.0009, "reward": 3.8604942560195923, "reward_std": 0.14663540851324797, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9304596483707428, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9675346612930298, "step": 722 }, { "completion_length": 184.15625, "epoch": 2.3104, "grad_norm": 2.8213894367218018, "kl": 0.078857421875, "learning_rate": 9.749999999999999e-08, "loss": 0.0008, "reward": 3.9743396043777466, "reward_std": 0.007034428184852004, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.975724995136261, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986145198345184, "step": 723 }, { "completion_length": 259.78125, "epoch": 2.3136, "grad_norm": 1.6101382970809937, "kl": 0.090087890625, "learning_rate": 9.624999999999999e-08, "loss": 0.0009, "reward": 3.9425781965255737, "reward_std": 0.012072732672095299, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9443398118019104, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982384443283081, "step": 724 }, { "completion_length": 244.0625, "epoch": 2.3168, "grad_norm": 6.2361578941345215, "kl": 0.1103515625, "learning_rate": 9.499999999999999e-08, "loss": 0.0011, "reward": 3.930173873901367, "reward_std": 0.027357542887330055, "rewards/answer_entity_reward": 0.9856617748737335, "rewards/answer_wer_reward": 0.9461617767810822, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983505010604858, "step": 725 }, { "completion_length": 201.59375, "epoch": 2.32, "grad_norm": 1.4726715087890625, "kl": 0.09375, "learning_rate": 9.375e-08, "loss": 0.0009, "reward": 3.932676076889038, "reward_std": 0.018328175880014896, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9490721523761749, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9892857074737549, "step": 726 }, { "completion_length": 224.46875, "epoch": 2.3232, "grad_norm": 1.8913533687591553, "kl": 0.10693359375, "learning_rate": 9.25e-08, "loss": 0.0011, "reward": 3.9074403047561646, "reward_std": 0.03500279039144516, "rewards/answer_entity_reward": 0.9926948249340057, "rewards/answer_wer_reward": 0.9156512916088104, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990941882133484, "step": 727 }, { "completion_length": 193.96875, "epoch": 2.3264, "grad_norm": 3.589576244354248, "kl": 0.095458984375, "learning_rate": 9.125e-08, "loss": 0.001, "reward": 3.9219532012939453, "reward_std": 0.018014353699982166, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9440249502658844, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9807692468166351, "step": 728 }, { "completion_length": 178.4375, "epoch": 2.3296, "grad_norm": 1.4839043617248535, "kl": 0.125244140625, "learning_rate": 9e-08, "loss": 0.0013, "reward": 3.8304929733276367, "reward_std": 0.009818047750741243, "rewards/answer_entity_reward": 0.9844697117805481, "rewards/answer_wer_reward": 0.9768873453140259, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.869135856628418, "step": 729 }, { "completion_length": 199.28125, "epoch": 2.3327999999999998, "grad_norm": 1.497478723526001, "kl": 0.08642578125, "learning_rate": 8.875e-08, "loss": 0.0009, "reward": 3.9690757989883423, "reward_std": 0.009865536354482174, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9690757989883423, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 730 }, { "completion_length": 220.40625, "epoch": 2.336, "grad_norm": 5.609241485595703, "kl": 0.0966796875, "learning_rate": 8.75e-08, "loss": 0.001, "reward": 3.935381293296814, "reward_std": 0.037938917987048626, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9465770721435547, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9888042211532593, "step": 731 }, { "completion_length": 215.75, "epoch": 2.3392, "grad_norm": 3.496508836746216, "kl": 0.13134765625, "learning_rate": 8.625e-08, "loss": 0.0013, "reward": 3.8224092721939087, "reward_std": 0.02808304876089096, "rewards/answer_entity_reward": 0.9923513829708099, "rewards/answer_wer_reward": 0.9497494399547577, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8803083300590515, "step": 732 }, { "completion_length": 230.78125, "epoch": 2.3424, "grad_norm": 27.852195739746094, "kl": 0.087890625, "learning_rate": 8.500000000000001e-08, "loss": 0.0009, "reward": 3.8288865089416504, "reward_std": 0.01470271497964859, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9576314091682434, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8736589848995209, "step": 733 }, { "completion_length": 241.9375, "epoch": 2.3456, "grad_norm": 3.033336639404297, "kl": 0.1015625, "learning_rate": 8.375e-08, "loss": 0.001, "reward": 3.8371338844299316, "reward_std": 0.060717299580574036, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8998311161994934, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9373026490211487, "step": 734 }, { "completion_length": 235.53125, "epoch": 2.3487999999999998, "grad_norm": 1.6953455209732056, "kl": 0.094970703125, "learning_rate": 8.25e-08, "loss": 0.001, "reward": 3.9114056825637817, "reward_std": 0.03203952219337225, "rewards/answer_entity_reward": 0.9862325191497803, "rewards/answer_wer_reward": 0.9290111660957336, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9961620271205902, "step": 735 }, { "completion_length": 170.59375, "epoch": 2.352, "grad_norm": 3.9929087162017822, "kl": 0.096923828125, "learning_rate": 8.125e-08, "loss": 0.001, "reward": 3.7566089630126953, "reward_std": 0.029996749013662338, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8621053397655487, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8945035934448242, "step": 736 }, { "completion_length": 233.125, "epoch": 2.3552, "grad_norm": 4.515742301940918, "kl": 0.129638671875, "learning_rate": 8e-08, "loss": 0.0013, "reward": 3.9159114360809326, "reward_std": 0.03965392196550965, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9617535173892975, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9576301276683807, "step": 737 }, { "completion_length": 202.5, "epoch": 2.3584, "grad_norm": 3.593953847885132, "kl": 0.107177734375, "learning_rate": 7.875e-08, "loss": 0.0011, "reward": 3.9383411407470703, "reward_std": 0.030629536136984825, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9511449038982391, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.994140625, "step": 738 }, { "completion_length": 204.5, "epoch": 2.3616, "grad_norm": 1.8713083267211914, "kl": 0.099365234375, "learning_rate": 7.75e-08, "loss": 0.001, "reward": 3.9490264654159546, "reward_std": 0.017966313287615776, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.95371875166893, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997718930244446, "step": 739 }, { "completion_length": 240.84375, "epoch": 2.3648, "grad_norm": 1.2076594829559326, "kl": 0.087646484375, "learning_rate": 7.625e-08, "loss": 0.0009, "reward": 3.9529651403427124, "reward_std": 0.00970834819599986, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.95296511054039, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 740 }, { "completion_length": 245.0, "epoch": 2.368, "grad_norm": 0.9895936846733093, "kl": 0.10205078125, "learning_rate": 7.5e-08, "loss": 0.001, "reward": 3.9112091064453125, "reward_std": 0.01916833221912384, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9262779057025909, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9953478574752808, "step": 741 }, { "completion_length": 240.09375, "epoch": 2.3712, "grad_norm": 2.48711895942688, "kl": 0.076171875, "learning_rate": 7.375e-08, "loss": 0.0008, "reward": 3.942033529281616, "reward_std": 0.015272341668605804, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9465188384056091, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983557760715485, "step": 742 }, { "completion_length": 201.78125, "epoch": 2.3744, "grad_norm": 2.5322351455688477, "kl": 0.103271484375, "learning_rate": 7.25e-08, "loss": 0.001, "reward": 3.903318166732788, "reward_std": 0.024559098295867443, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9573764503002167, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9459417462348938, "step": 743 }, { "completion_length": 176.8125, "epoch": 2.3776, "grad_norm": 10.369518280029297, "kl": 0.10986328125, "learning_rate": 7.124999999999999e-08, "loss": 0.0011, "reward": 3.9451266527175903, "reward_std": 0.008970791008323431, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.953162282705307, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9919642806053162, "step": 744 }, { "completion_length": 230.625, "epoch": 2.3808, "grad_norm": 1.5272488594055176, "kl": 0.09130859375, "learning_rate": 7e-08, "loss": 0.0009, "reward": 3.9410911798477173, "reward_std": 0.017650599591434002, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9447437524795532, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991883039474487, "step": 745 }, { "completion_length": 193.3125, "epoch": 2.384, "grad_norm": 2.9624199867248535, "kl": 0.165771484375, "learning_rate": 6.875e-08, "loss": 0.0017, "reward": 3.8940484523773193, "reward_std": 0.060107991099357605, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9106853604316711, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9878502786159515, "step": 746 }, { "completion_length": 216.34375, "epoch": 2.3872, "grad_norm": 1.623085379600525, "kl": 0.08544921875, "learning_rate": 6.75e-08, "loss": 0.0009, "reward": 3.9699491262435913, "reward_std": 0.016816058196127415, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9752996861934662, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9946492910385132, "step": 747 }, { "completion_length": 202.125, "epoch": 2.3904, "grad_norm": 1.5331361293792725, "kl": 0.12841796875, "learning_rate": 6.625e-08, "loss": 0.0013, "reward": 3.922391891479492, "reward_std": 0.008891359670087695, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9298486709594727, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9925432503223419, "step": 748 }, { "completion_length": 242.6875, "epoch": 2.3936, "grad_norm": 1.3326294422149658, "kl": 0.095947265625, "learning_rate": 6.5e-08, "loss": 0.001, "reward": 3.95425808429718, "reward_std": 0.009861439000815153, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9546802639961243, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995777010917664, "step": 749 }, { "completion_length": 175.90625, "epoch": 2.3968, "grad_norm": 0.9046992063522339, "kl": 0.112060546875, "learning_rate": 6.375e-08, "loss": 0.0011, "reward": 3.977583885192871, "reward_std": 0.006871582940220833, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9775838255882263, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 750 }, { "completion_length": 215.59375, "epoch": 2.4, "grad_norm": 3.0961620807647705, "kl": 0.089111328125, "learning_rate": 6.25e-08, "loss": 0.0009, "reward": 3.9490163326263428, "reward_std": 0.012763194739818573, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9750434756278992, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9768137633800507, "step": 751 }, { "completion_length": 239.84375, "epoch": 2.4032, "grad_norm": 0.9473263621330261, "kl": 0.102783203125, "learning_rate": 6.125e-08, "loss": 0.001, "reward": 3.953715443611145, "reward_std": 0.018719897605478764, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9566626846790314, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999456524848938, "step": 752 }, { "completion_length": 177.09375, "epoch": 2.4064, "grad_norm": 0.7227364182472229, "kl": 0.115234375, "learning_rate": 6e-08, "loss": 0.0012, "reward": 3.9009724855422974, "reward_std": 0.1057232718449086, "rewards/answer_entity_reward": 0.96875, "rewards/answer_wer_reward": 0.9504120945930481, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.981810450553894, "step": 753 }, { "completion_length": 215.96875, "epoch": 2.4096, "grad_norm": 3.616448163986206, "kl": 0.103759765625, "learning_rate": 5.8749999999999993e-08, "loss": 0.001, "reward": 3.936669707298279, "reward_std": 0.01544360350817442, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9422976672649384, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978442788124084, "step": 754 }, { "completion_length": 222.03125, "epoch": 2.4128, "grad_norm": 5.449378967285156, "kl": 0.08203125, "learning_rate": 5.75e-08, "loss": 0.0008, "reward": 3.9294843673706055, "reward_std": 0.05806633085012436, "rewards/answer_entity_reward": 0.9763257801532745, "rewards/answer_wer_reward": 0.9556067883968353, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975519478321075, "step": 755 }, { "completion_length": 200.3125, "epoch": 2.416, "grad_norm": 2.46901798248291, "kl": 0.119873046875, "learning_rate": 5.625e-08, "loss": 0.0012, "reward": 3.888831377029419, "reward_std": 0.015442279167473316, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9264732301235199, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9651989638805389, "step": 756 }, { "completion_length": 237.21875, "epoch": 2.4192, "grad_norm": 1.3007749319076538, "kl": 0.095947265625, "learning_rate": 5.4999999999999996e-08, "loss": 0.001, "reward": 3.933607816696167, "reward_std": 0.023877738043665886, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9380720853805542, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 757 }, { "completion_length": 151.78125, "epoch": 2.4224, "grad_norm": 0.7467179894447327, "kl": 0.099853515625, "learning_rate": 5.3749999999999995e-08, "loss": 0.001, "reward": 3.9564812183380127, "reward_std": 0.004365669563412666, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9648145437240601, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 758 }, { "completion_length": 232.125, "epoch": 2.4256, "grad_norm": 1.5784400701522827, "kl": 0.1041259765625, "learning_rate": 5.2499999999999994e-08, "loss": 0.001, "reward": 3.9412447214126587, "reward_std": 0.02170270448550582, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9489176869392395, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9967913925647736, "step": 759 }, { "completion_length": 215.4375, "epoch": 2.4288, "grad_norm": 3.9008543491363525, "kl": 0.1298828125, "learning_rate": 5.124999999999999e-08, "loss": 0.0013, "reward": 3.8311843872070312, "reward_std": 0.05369440279901028, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9393357634544373, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8918486833572388, "step": 760 }, { "completion_length": 219.65625, "epoch": 2.432, "grad_norm": 4.4970526695251465, "kl": 0.09765625, "learning_rate": 5e-08, "loss": 0.001, "reward": 3.9511146545410156, "reward_std": 0.01855921559035778, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9531445503234863, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99980828166008, "step": 761 }, { "completion_length": 210.21875, "epoch": 2.4352, "grad_norm": 0.9267875552177429, "kl": 0.096923828125, "learning_rate": 4.8749999999999996e-08, "loss": 0.001, "reward": 3.930490016937256, "reward_std": 0.013515972066670656, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9386539459228516, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987804591655731, "step": 762 }, { "completion_length": 196.625, "epoch": 2.4384, "grad_norm": 2.2344725131988525, "kl": 0.1025390625, "learning_rate": 4.7499999999999995e-08, "loss": 0.001, "reward": 3.9080734252929688, "reward_std": 0.035708663053810596, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9325708150863647, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9779064655303955, "step": 763 }, { "completion_length": 225.09375, "epoch": 2.4416, "grad_norm": 1.588053822517395, "kl": 0.095947265625, "learning_rate": 4.625e-08, "loss": 0.001, "reward": 3.9343831539154053, "reward_std": 0.016630763188004494, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9485695362091064, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9892857074737549, "step": 764 }, { "completion_length": 247.09375, "epoch": 2.4448, "grad_norm": 1.1707122325897217, "kl": 0.09228515625, "learning_rate": 4.5e-08, "loss": 0.0009, "reward": 3.8900914192199707, "reward_std": 0.06134997680783272, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.908464640378952, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9837101101875305, "step": 765 }, { "completion_length": 241.65625, "epoch": 2.448, "grad_norm": 2.8273398876190186, "kl": 0.110595703125, "learning_rate": 4.375e-08, "loss": 0.0011, "reward": 3.890642285346985, "reward_std": 0.021557598374783993, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.8973233997821808, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9949637055397034, "step": 766 }, { "completion_length": 216.96875, "epoch": 2.4512, "grad_norm": 1.1206011772155762, "kl": 0.095947265625, "learning_rate": 4.2500000000000003e-08, "loss": 0.001, "reward": 3.9437450170516968, "reward_std": 0.008607666241005063, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9612680077552795, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9824769496917725, "step": 767 }, { "completion_length": 230.3125, "epoch": 2.4544, "grad_norm": 15.688488960266113, "kl": 0.085693359375, "learning_rate": 4.125e-08, "loss": 0.0009, "reward": 3.9394757747650146, "reward_std": 0.030962621793150902, "rewards/answer_entity_reward": 0.9806547462940216, "rewards/answer_wer_reward": 0.9617869853973389, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970340430736542, "step": 768 }, { "completion_length": 248.46875, "epoch": 2.4576000000000002, "grad_norm": 1.5618577003479004, "kl": 0.16162109375, "learning_rate": 4e-08, "loss": 0.0016, "reward": 3.9416744709014893, "reward_std": 0.016101540066301823, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9428056180477142, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988687336444855, "step": 769 }, { "completion_length": 234.0625, "epoch": 2.4608, "grad_norm": 3.257962226867676, "kl": 0.16259765625, "learning_rate": 3.875e-08, "loss": 0.0016, "reward": 3.9221439361572266, "reward_std": 0.02909655123949051, "rewards/answer_entity_reward": 0.9871794581413269, "rewards/answer_wer_reward": 0.9352968335151672, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996675550937653, "step": 770 }, { "completion_length": 245.78125, "epoch": 2.464, "grad_norm": 2.2879505157470703, "kl": 0.083251953125, "learning_rate": 3.75e-08, "loss": 0.0008, "reward": 3.9263609647750854, "reward_std": 0.022196561098098755, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9454439282417297, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9864316880702972, "step": 771 }, { "completion_length": 158.78125, "epoch": 2.4672, "grad_norm": 2.214250087738037, "kl": 0.1328125, "learning_rate": 3.625e-08, "loss": 0.0013, "reward": 3.883350372314453, "reward_std": 0.04219530359841883, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9803332090377808, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9030172228813171, "step": 772 }, { "completion_length": 229.25, "epoch": 2.4704, "grad_norm": 1.8548256158828735, "kl": 0.10205078125, "learning_rate": 3.5e-08, "loss": 0.001, "reward": 3.9539661407470703, "reward_std": 0.011240935884416103, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9542403221130371, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999725878238678, "step": 773 }, { "completion_length": 227.9375, "epoch": 2.4736000000000002, "grad_norm": 2.2110090255737305, "kl": 0.0927734375, "learning_rate": 3.375e-08, "loss": 0.0009, "reward": 3.9344996213912964, "reward_std": 0.011312551097944379, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9557085335254669, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9787910580635071, "step": 774 }, { "completion_length": 250.9375, "epoch": 2.4768, "grad_norm": 25.519304275512695, "kl": 0.1328125, "learning_rate": 3.25e-08, "loss": 0.0013, "reward": 3.915758967399597, "reward_std": 0.015426212921738625, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9174197912216187, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983391761779785, "step": 775 }, { "completion_length": 223.65625, "epoch": 2.48, "grad_norm": 3.6137807369232178, "kl": 0.115966796875, "learning_rate": 3.125e-08, "loss": 0.0012, "reward": 3.939508318901062, "reward_std": 0.00902418838813901, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9407406747341156, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987677037715912, "step": 776 }, { "completion_length": 248.0, "epoch": 2.4832, "grad_norm": 1.4470294713974, "kl": 0.16015625, "learning_rate": 3e-08, "loss": 0.0016, "reward": 3.8835391998291016, "reward_std": 0.029840022325515747, "rewards/answer_entity_reward": 0.9926948249340057, "rewards/answer_wer_reward": 0.9006942212581635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9901500642299652, "step": 777 }, { "completion_length": 174.3125, "epoch": 2.4864, "grad_norm": 2.8671512603759766, "kl": 0.12353515625, "learning_rate": 2.875e-08, "loss": 0.0012, "reward": 3.9390430450439453, "reward_std": 0.029761829413473606, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9475694894790649, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938772618770599, "step": 778 }, { "completion_length": 217.0, "epoch": 2.4896, "grad_norm": 1.7183799743652344, "kl": 0.095458984375, "learning_rate": 2.7499999999999998e-08, "loss": 0.001, "reward": 3.922086715698242, "reward_std": 0.01339792925864458, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9282321929931641, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938544631004333, "step": 779 }, { "completion_length": 205.09375, "epoch": 2.4928, "grad_norm": 2.424999475479126, "kl": 0.102294921875, "learning_rate": 2.6249999999999997e-08, "loss": 0.001, "reward": 3.944626212120056, "reward_std": 0.038148084189742804, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9640980660915375, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9805281758308411, "step": 780 }, { "completion_length": 220.6875, "epoch": 2.496, "grad_norm": 1.739138126373291, "kl": 0.09765625, "learning_rate": 2.5e-08, "loss": 0.001, "reward": 3.943056583404541, "reward_std": 0.025130684953182936, "rewards/answer_entity_reward": 0.9871794581413269, "rewards/answer_wer_reward": 0.9561585485935211, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997184872627258, "step": 781 }, { "completion_length": 199.375, "epoch": 2.4992, "grad_norm": 1.3409775495529175, "kl": 0.083984375, "learning_rate": 2.3749999999999998e-08, "loss": 0.0008, "reward": 3.9247756004333496, "reward_std": 0.021664155647158623, "rewards/answer_entity_reward": 0.9902146756649017, "rewards/answer_wer_reward": 0.9345609843730927, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 782 }, { "completion_length": 221.28125, "epoch": 2.5023999999999997, "grad_norm": 1.9740352630615234, "kl": 0.099853515625, "learning_rate": 2.25e-08, "loss": 0.001, "reward": 3.955259919166565, "reward_std": 0.010415108175948262, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9565965533256531, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986633956432343, "step": 783 }, { "completion_length": 235.40625, "epoch": 2.5056000000000003, "grad_norm": 7.616406440734863, "kl": 0.144287109375, "learning_rate": 2.1250000000000002e-08, "loss": 0.0014, "reward": 3.9511306285858154, "reward_std": 0.011523132212460041, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9639480412006378, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9871825873851776, "step": 784 }, { "completion_length": 205.84375, "epoch": 2.5088, "grad_norm": 3.1992883682250977, "kl": 0.107421875, "learning_rate": 2e-08, "loss": 0.0011, "reward": 3.92184841632843, "reward_std": 0.016722742468118668, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9461718797683716, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9756765961647034, "step": 785 }, { "completion_length": 225.6875, "epoch": 2.512, "grad_norm": 1.2884989976882935, "kl": 0.139404296875, "learning_rate": 1.875e-08, "loss": 0.0014, "reward": 3.946265697479248, "reward_std": 0.017564056208357215, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9507300853729248, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 786 }, { "completion_length": 176.96875, "epoch": 2.5152, "grad_norm": 3.3580868244171143, "kl": 0.1982421875, "learning_rate": 1.75e-08, "loss": 0.002, "reward": 3.8963418006896973, "reward_std": 0.04480761382728815, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9633896946907043, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9329521358013153, "step": 787 }, { "completion_length": 260.28125, "epoch": 2.5183999999999997, "grad_norm": 1.0715585947036743, "kl": 0.105712890625, "learning_rate": 1.625e-08, "loss": 0.0011, "reward": 3.904552698135376, "reward_std": 0.034514338709414005, "rewards/answer_entity_reward": 0.9831239283084869, "rewards/answer_wer_reward": 0.9217879772186279, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996408224105835, "step": 788 }, { "completion_length": 251.625, "epoch": 2.5216, "grad_norm": 3.5006961822509766, "kl": 0.080322265625, "learning_rate": 1.5e-08, "loss": 0.0008, "reward": 3.9146039485931396, "reward_std": 0.028964843600988388, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9146038293838501, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 789 }, { "completion_length": 201.0625, "epoch": 2.5248, "grad_norm": 5.0292534828186035, "kl": 0.1396484375, "learning_rate": 1.3749999999999999e-08, "loss": 0.0014, "reward": 3.897321939468384, "reward_std": 0.01521459873765707, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9714652001857758, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9258567690849304, "step": 790 }, { "completion_length": 185.03125, "epoch": 2.528, "grad_norm": 2.234839916229248, "kl": 0.1005859375, "learning_rate": 1.25e-08, "loss": 0.001, "reward": 3.930277109146118, "reward_std": 0.026013732887804508, "rewards/answer_entity_reward": 0.9908459782600403, "rewards/answer_wer_reward": 0.9424907863140106, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969403147697449, "step": 791 }, { "completion_length": 185.84375, "epoch": 2.5312, "grad_norm": 0.5959092974662781, "kl": 0.0947265625, "learning_rate": 1.125e-08, "loss": 0.0009, "reward": 3.954566478729248, "reward_std": 0.011145764729008079, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9578571021556854, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995503425598145, "step": 792 }, { "completion_length": 197.84375, "epoch": 2.5343999999999998, "grad_norm": 2.0784664154052734, "kl": 0.114501953125, "learning_rate": 1e-08, "loss": 0.0011, "reward": 3.885765790939331, "reward_std": 0.012588209472596645, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9541250765323639, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.931640625, "step": 793 }, { "completion_length": 190.03125, "epoch": 2.5376, "grad_norm": 1.7104955911636353, "kl": 0.224609375, "learning_rate": 8.75e-09, "loss": 0.0022, "reward": 3.824442148208618, "reward_std": 0.039704530499875546, "rewards/answer_entity_reward": 0.9877451062202454, "rewards/answer_wer_reward": 0.919477641582489, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9172193706035614, "step": 794 }, { "completion_length": 221.4375, "epoch": 2.5408, "grad_norm": 2.524031162261963, "kl": 0.09521484375, "learning_rate": 7.5e-09, "loss": 0.001, "reward": 3.9210238456726074, "reward_std": 0.03186593018472195, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9480306208133698, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9729932844638824, "step": 795 }, { "completion_length": 149.9375, "epoch": 2.544, "grad_norm": 2.592532157897949, "kl": 0.116943359375, "learning_rate": 6.25e-09, "loss": 0.0012, "reward": 3.835923910140991, "reward_std": 0.016047589480876923, "rewards/answer_entity_reward": 0.9942555129528046, "rewards/answer_wer_reward": 0.8419776558876038, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999690592288971, "step": 796 }, { "completion_length": 196.3125, "epoch": 2.5472, "grad_norm": 1.1898647546768188, "kl": 0.0810546875, "learning_rate": 5e-09, "loss": 0.0008, "reward": 3.9655500650405884, "reward_std": 0.012615942629054189, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9718000292778015, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 797 }, { "completion_length": 220.3125, "epoch": 2.5504, "grad_norm": 1.6702154874801636, "kl": 0.093505859375, "learning_rate": 3.75e-09, "loss": 0.0009, "reward": 3.945963501930237, "reward_std": 0.007131826248951256, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9533165395259857, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 798 }, { "completion_length": 223.65625, "epoch": 2.5536, "grad_norm": 46.13692855834961, "kl": 0.11083984375, "learning_rate": 2.5e-09, "loss": 0.0011, "reward": 3.8597129583358765, "reward_std": 0.09108205512166023, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9162732660770416, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9770934879779816, "step": 799 }, { "completion_length": 233.84375, "epoch": 2.5568, "grad_norm": 1.1842632293701172, "kl": 0.108642578125, "learning_rate": 1.25e-09, "loss": 0.0011, "reward": 3.9367305040359497, "reward_std": 0.01876719295978546, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9404171705245972, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983966648578644, "step": 800 } ], "logging_steps": 1, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }