| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.5568, |
| "eval_steps": 500, |
| "global_step": 800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 220.40625, |
| "epoch": 0.0032, |
| "grad_norm": 11.881386756896973, |
| "kl": 0.0, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 2.0222461223602295, |
| "reward_std": 1.2291262745857239, |
| "rewards/answer_entity_reward": 0.5891842544078827, |
| "rewards/answer_wer_reward": 0.36776189506053925, |
| "rewards/format_reward": 0.46875, |
| "rewards/think_ocr_reward": 0.596549928188324, |
| "step": 1 |
| }, |
| { |
| "completion_length": 183.75, |
| "epoch": 0.0064, |
| "grad_norm": 14.301155090332031, |
| "kl": 0.000579833984375, |
| "learning_rate": 9.9875e-07, |
| "loss": 0.0, |
| "reward": 2.1407116651535034, |
| "reward_std": 0.9154457449913025, |
| "rewards/answer_entity_reward": 0.7417342960834503, |
| "rewards/answer_wer_reward": 0.4293617159128189, |
| "rewards/format_reward": 0.59375, |
| "rewards/think_ocr_reward": 0.3758656233549118, |
| "step": 2 |
| }, |
| { |
| "completion_length": 185.09375, |
| "epoch": 0.0096, |
| "grad_norm": 7.90402889251709, |
| "kl": 0.0025768280029296875, |
| "learning_rate": 9.975e-07, |
| "loss": 0.0, |
| "reward": 2.4301702976226807, |
| "reward_std": 1.0761558413505554, |
| "rewards/answer_entity_reward": 0.7529265582561493, |
| "rewards/answer_wer_reward": 0.45110173523426056, |
| "rewards/format_reward": 0.6875, |
| "rewards/think_ocr_reward": 0.5386419892311096, |
| "step": 3 |
| }, |
| { |
| "completion_length": 201.46875, |
| "epoch": 0.0128, |
| "grad_norm": 2.4371554851531982, |
| "kl": 0.0039825439453125, |
| "learning_rate": 9.9625e-07, |
| "loss": 0.0, |
| "reward": 2.4960588216781616, |
| "reward_std": 1.0011246800422668, |
| "rewards/answer_entity_reward": 0.6945474743843079, |
| "rewards/answer_wer_reward": 0.626116082072258, |
| "rewards/format_reward": 0.65625, |
| "rewards/think_ocr_reward": 0.519145280122757, |
| "step": 4 |
| }, |
| { |
| "completion_length": 223.1875, |
| "epoch": 0.016, |
| "grad_norm": 3.092437982559204, |
| "kl": 0.001644134521484375, |
| "learning_rate": 9.95e-07, |
| "loss": 0.0, |
| "reward": 2.6151310205459595, |
| "reward_std": 1.0057614743709564, |
| "rewards/answer_entity_reward": 0.6729370057582855, |
| "rewards/answer_wer_reward": 0.43601465225219727, |
| "rewards/format_reward": 0.75, |
| "rewards/think_ocr_reward": 0.7561794817447662, |
| "step": 5 |
| }, |
| { |
| "completion_length": 211.09375, |
| "epoch": 0.0192, |
| "grad_norm": 3.8149898052215576, |
| "kl": 0.00344085693359375, |
| "learning_rate": 9.9375e-07, |
| "loss": 0.0, |
| "reward": 2.601198673248291, |
| "reward_std": 0.8605955541133881, |
| "rewards/answer_entity_reward": 0.6944940388202667, |
| "rewards/answer_wer_reward": 0.5194687843322754, |
| "rewards/format_reward": 0.71875, |
| "rewards/think_ocr_reward": 0.6684857904911041, |
| "step": 6 |
| }, |
| { |
| "completion_length": 210.8125, |
| "epoch": 0.0224, |
| "grad_norm": 2.000467300415039, |
| "kl": 0.0030364990234375, |
| "learning_rate": 9.925e-07, |
| "loss": 0.0, |
| "reward": 3.1113568544387817, |
| "reward_std": 0.928675651550293, |
| "rewards/answer_entity_reward": 0.8195368647575378, |
| "rewards/answer_wer_reward": 0.7422276139259338, |
| "rewards/format_reward": 0.75, |
| "rewards/think_ocr_reward": 0.7995923757553101, |
| "step": 7 |
| }, |
| { |
| "completion_length": 240.375, |
| "epoch": 0.0256, |
| "grad_norm": 2.2319533824920654, |
| "kl": 0.0052947998046875, |
| "learning_rate": 9.912499999999998e-07, |
| "loss": 0.0001, |
| "reward": 3.217132568359375, |
| "reward_std": 0.4984496384859085, |
| "rewards/answer_entity_reward": 0.7789974808692932, |
| "rewards/answer_wer_reward": 0.6678729355335236, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8015121817588806, |
| "step": 8 |
| }, |
| { |
| "completion_length": 217.1875, |
| "epoch": 0.0288, |
| "grad_norm": 2.6002566814422607, |
| "kl": 0.06464385986328125, |
| "learning_rate": 9.9e-07, |
| "loss": 0.0006, |
| "reward": 3.217494249343872, |
| "reward_std": 0.5446330606937408, |
| "rewards/answer_entity_reward": 0.8213226199150085, |
| "rewards/answer_wer_reward": 0.7331169545650482, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.6943045258522034, |
| "step": 9 |
| }, |
| { |
| "completion_length": 196.40625, |
| "epoch": 0.032, |
| "grad_norm": 2.9925193786621094, |
| "kl": 0.008941650390625, |
| "learning_rate": 9.8875e-07, |
| "loss": 0.0001, |
| "reward": 3.2711292505264282, |
| "reward_std": 0.5466351807117462, |
| "rewards/answer_entity_reward": 0.7905315160751343, |
| "rewards/answer_wer_reward": 0.7206964790821075, |
| "rewards/format_reward": 0.90625, |
| "rewards/think_ocr_reward": 0.853651225566864, |
| "step": 10 |
| }, |
| { |
| "completion_length": 146.53125, |
| "epoch": 0.0352, |
| "grad_norm": 3.6174111366271973, |
| "kl": 0.0103912353515625, |
| "learning_rate": 9.875e-07, |
| "loss": 0.0001, |
| "reward": 3.083841323852539, |
| "reward_std": 0.6508071422576904, |
| "rewards/answer_entity_reward": 0.7979910671710968, |
| "rewards/answer_wer_reward": 0.6100275814533234, |
| "rewards/format_reward": 0.90625, |
| "rewards/think_ocr_reward": 0.7695727646350861, |
| "step": 11 |
| }, |
| { |
| "completion_length": 218.15625, |
| "epoch": 0.0384, |
| "grad_norm": 3.2925424575805664, |
| "kl": 0.00616455078125, |
| "learning_rate": 9.862499999999999e-07, |
| "loss": 0.0001, |
| "reward": 3.2391178607940674, |
| "reward_std": 0.6323770582675934, |
| "rewards/answer_entity_reward": 0.781956285238266, |
| "rewards/answer_wer_reward": 0.6958223879337311, |
| "rewards/format_reward": 0.90625, |
| "rewards/think_ocr_reward": 0.8550890386104584, |
| "step": 12 |
| }, |
| { |
| "completion_length": 250.53125, |
| "epoch": 0.0416, |
| "grad_norm": 2.291048288345337, |
| "kl": 0.0086669921875, |
| "learning_rate": 9.849999999999999e-07, |
| "loss": 0.0001, |
| "reward": 3.238759756088257, |
| "reward_std": 0.4200912415981293, |
| "rewards/answer_entity_reward": 0.8185493648052216, |
| "rewards/answer_wer_reward": 0.699150562286377, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.7835597395896912, |
| "step": 13 |
| }, |
| { |
| "completion_length": 196.6875, |
| "epoch": 0.0448, |
| "grad_norm": 2.470576524734497, |
| "kl": 0.0181884765625, |
| "learning_rate": 9.8375e-07, |
| "loss": 0.0002, |
| "reward": 3.460441470146179, |
| "reward_std": 0.34273722767829895, |
| "rewards/answer_entity_reward": 0.9129322171211243, |
| "rewards/answer_wer_reward": 0.7192246317863464, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8595346808433533, |
| "step": 14 |
| }, |
| { |
| "completion_length": 181.78125, |
| "epoch": 0.048, |
| "grad_norm": 13.122944831848145, |
| "kl": 0.0174560546875, |
| "learning_rate": 9.825e-07, |
| "loss": 0.0002, |
| "reward": 3.526148796081543, |
| "reward_std": 0.2207299917936325, |
| "rewards/answer_entity_reward": 0.8908324241638184, |
| "rewards/answer_wer_reward": 0.8109035789966583, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8556627035140991, |
| "step": 15 |
| }, |
| { |
| "completion_length": 181.9375, |
| "epoch": 0.0512, |
| "grad_norm": 3.1282718181610107, |
| "kl": 0.0081329345703125, |
| "learning_rate": 9.8125e-07, |
| "loss": 0.0001, |
| "reward": 3.4612035751342773, |
| "reward_std": 0.2798766866326332, |
| "rewards/answer_entity_reward": 0.8926167786121368, |
| "rewards/answer_wer_reward": 0.6810254156589508, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9188113510608673, |
| "step": 16 |
| }, |
| { |
| "completion_length": 243.125, |
| "epoch": 0.0544, |
| "grad_norm": 1.907029390335083, |
| "kl": 0.00677490234375, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0001, |
| "reward": 3.375656485557556, |
| "reward_std": 0.37908758223056793, |
| "rewards/answer_entity_reward": 0.8232844769954681, |
| "rewards/answer_wer_reward": 0.6466233134269714, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9369988143444061, |
| "step": 17 |
| }, |
| { |
| "completion_length": 236.34375, |
| "epoch": 0.0576, |
| "grad_norm": 2.551098108291626, |
| "kl": 0.0098876953125, |
| "learning_rate": 9.7875e-07, |
| "loss": 0.0001, |
| "reward": 3.637453317642212, |
| "reward_std": 0.1572738140821457, |
| "rewards/answer_entity_reward": 0.8815866112709045, |
| "rewards/answer_wer_reward": 0.8101728856563568, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9456938207149506, |
| "step": 18 |
| }, |
| { |
| "completion_length": 242.28125, |
| "epoch": 0.0608, |
| "grad_norm": 3.0685667991638184, |
| "kl": 0.010223388671875, |
| "learning_rate": 9.775e-07, |
| "loss": 0.0001, |
| "reward": 3.3409019708633423, |
| "reward_std": 0.3057943657040596, |
| "rewards/answer_entity_reward": 0.7610115706920624, |
| "rewards/answer_wer_reward": 0.6856433153152466, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8942470550537109, |
| "step": 19 |
| }, |
| { |
| "completion_length": 193.46875, |
| "epoch": 0.064, |
| "grad_norm": 2.6569221019744873, |
| "kl": 0.0095977783203125, |
| "learning_rate": 9.7625e-07, |
| "loss": 0.0001, |
| "reward": 3.5098860263824463, |
| "reward_std": 0.27671176940202713, |
| "rewards/answer_entity_reward": 0.8399666249752045, |
| "rewards/answer_wer_reward": 0.7382143139839172, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9317050278186798, |
| "step": 20 |
| }, |
| { |
| "completion_length": 199.28125, |
| "epoch": 0.0672, |
| "grad_norm": 3.02462100982666, |
| "kl": 0.0101318359375, |
| "learning_rate": 9.75e-07, |
| "loss": 0.0001, |
| "reward": 3.552868962287903, |
| "reward_std": 0.24761613458395004, |
| "rewards/answer_entity_reward": 0.9026052951812744, |
| "rewards/answer_wer_reward": 0.7746964991092682, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8755670785903931, |
| "step": 21 |
| }, |
| { |
| "completion_length": 239.75, |
| "epoch": 0.0704, |
| "grad_norm": 5.65736722946167, |
| "kl": 0.010223388671875, |
| "learning_rate": 9.7375e-07, |
| "loss": 0.0001, |
| "reward": 3.3219141960144043, |
| "reward_std": 0.32601839303970337, |
| "rewards/answer_entity_reward": 0.8810833096504211, |
| "rewards/answer_wer_reward": 0.6434947550296783, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.7973361015319824, |
| "step": 22 |
| }, |
| { |
| "completion_length": 216.21875, |
| "epoch": 0.0736, |
| "grad_norm": 6.68402099609375, |
| "kl": 0.009765625, |
| "learning_rate": 9.725e-07, |
| "loss": 0.0001, |
| "reward": 3.67569899559021, |
| "reward_std": 0.19380945712327957, |
| "rewards/answer_entity_reward": 0.9180394113063812, |
| "rewards/answer_wer_reward": 0.8205302953720093, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9371293187141418, |
| "step": 23 |
| }, |
| { |
| "completion_length": 200.65625, |
| "epoch": 0.0768, |
| "grad_norm": 3.398916006088257, |
| "kl": 0.0118408203125, |
| "learning_rate": 9.712499999999998e-07, |
| "loss": 0.0001, |
| "reward": 3.575831174850464, |
| "reward_std": 0.22907962650060654, |
| "rewards/answer_entity_reward": 0.9015873074531555, |
| "rewards/answer_wer_reward": 0.8195928931236267, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8546508848667145, |
| "step": 24 |
| }, |
| { |
| "completion_length": 144.9375, |
| "epoch": 0.08, |
| "grad_norm": 3.852799415588379, |
| "kl": 0.025146484375, |
| "learning_rate": 9.7e-07, |
| "loss": 0.0003, |
| "reward": 3.596950054168701, |
| "reward_std": 0.29281121492385864, |
| "rewards/answer_entity_reward": 0.9606508314609528, |
| "rewards/answer_wer_reward": 0.7530401945114136, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9457589387893677, |
| "step": 25 |
| }, |
| { |
| "completion_length": 201.375, |
| "epoch": 0.0832, |
| "grad_norm": 3.684136390686035, |
| "kl": 0.03955078125, |
| "learning_rate": 9.6875e-07, |
| "loss": 0.0004, |
| "reward": 3.6101993322372437, |
| "reward_std": 0.22506854683160782, |
| "rewards/answer_entity_reward": 0.8913510143756866, |
| "rewards/answer_wer_reward": 0.855983167886734, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8628652393817902, |
| "step": 26 |
| }, |
| { |
| "completion_length": 235.25, |
| "epoch": 0.0864, |
| "grad_norm": 2.9537627696990967, |
| "kl": 0.0134124755859375, |
| "learning_rate": 9.675e-07, |
| "loss": 0.0001, |
| "reward": 3.579669713973999, |
| "reward_std": 0.17270359210669994, |
| "rewards/answer_entity_reward": 0.8651459515094757, |
| "rewards/answer_wer_reward": 0.7930598855018616, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9214637279510498, |
| "step": 27 |
| }, |
| { |
| "completion_length": 199.96875, |
| "epoch": 0.0896, |
| "grad_norm": 2.0981569290161133, |
| "kl": 0.02239990234375, |
| "learning_rate": 9.6625e-07, |
| "loss": 0.0002, |
| "reward": 3.589198589324951, |
| "reward_std": 0.2977752536535263, |
| "rewards/answer_entity_reward": 0.8878033757209778, |
| "rewards/answer_wer_reward": 0.8114102184772491, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.889985203742981, |
| "step": 28 |
| }, |
| { |
| "completion_length": 229.6875, |
| "epoch": 0.0928, |
| "grad_norm": 2.406397819519043, |
| "kl": 0.0191650390625, |
| "learning_rate": 9.649999999999999e-07, |
| "loss": 0.0002, |
| "reward": 3.4348872900009155, |
| "reward_std": 0.37296992540359497, |
| "rewards/answer_entity_reward": 0.7681002914905548, |
| "rewards/answer_wer_reward": 0.724025309085846, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9740117192268372, |
| "step": 29 |
| }, |
| { |
| "completion_length": 199.59375, |
| "epoch": 0.096, |
| "grad_norm": 4.711977481842041, |
| "kl": 0.01727294921875, |
| "learning_rate": 9.637499999999999e-07, |
| "loss": 0.0002, |
| "reward": 3.7957680225372314, |
| "reward_std": 0.10022839158773422, |
| "rewards/answer_entity_reward": 0.9259244203567505, |
| "rewards/answer_wer_reward": 0.8810202181339264, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9888232052326202, |
| "step": 30 |
| }, |
| { |
| "completion_length": 227.71875, |
| "epoch": 0.0992, |
| "grad_norm": 8.605613708496094, |
| "kl": 0.016021728515625, |
| "learning_rate": 9.624999999999999e-07, |
| "loss": 0.0002, |
| "reward": 3.6433751583099365, |
| "reward_std": 0.19832589477300644, |
| "rewards/answer_entity_reward": 0.8819950520992279, |
| "rewards/answer_wer_reward": 0.832177460193634, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9292027056217194, |
| "step": 31 |
| }, |
| { |
| "completion_length": 215.65625, |
| "epoch": 0.1024, |
| "grad_norm": 3.5583388805389404, |
| "kl": 0.0224609375, |
| "learning_rate": 9.6125e-07, |
| "loss": 0.0002, |
| "reward": 3.516916036605835, |
| "reward_std": 0.29861560463905334, |
| "rewards/answer_entity_reward": 0.8093456923961639, |
| "rewards/answer_wer_reward": 0.7672389149665833, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9403314590454102, |
| "step": 32 |
| }, |
| { |
| "completion_length": 255.8125, |
| "epoch": 0.1056, |
| "grad_norm": 3.647063970565796, |
| "kl": 0.009185791015625, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0001, |
| "reward": 3.5868738889694214, |
| "reward_std": 0.2677561491727829, |
| "rewards/answer_entity_reward": 0.818858414888382, |
| "rewards/answer_wer_reward": 0.7967112958431244, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9713042676448822, |
| "step": 33 |
| }, |
| { |
| "completion_length": 223.25, |
| "epoch": 0.1088, |
| "grad_norm": 4.442183017730713, |
| "kl": 0.02569580078125, |
| "learning_rate": 9.5875e-07, |
| "loss": 0.0003, |
| "reward": 3.6685177087783813, |
| "reward_std": 0.16033701971173286, |
| "rewards/answer_entity_reward": 0.8931982815265656, |
| "rewards/answer_wer_reward": 0.8027337491512299, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.972585529088974, |
| "step": 34 |
| }, |
| { |
| "completion_length": 225.09375, |
| "epoch": 0.112, |
| "grad_norm": 1.850151538848877, |
| "kl": 0.0135498046875, |
| "learning_rate": 9.575e-07, |
| "loss": 0.0001, |
| "reward": 3.622478485107422, |
| "reward_std": 0.15638228505849838, |
| "rewards/answer_entity_reward": 0.8341188132762909, |
| "rewards/answer_wer_reward": 0.8296426832675934, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9587167799472809, |
| "step": 35 |
| }, |
| { |
| "completion_length": 182.75, |
| "epoch": 0.1152, |
| "grad_norm": 3.844250202178955, |
| "kl": 0.100982666015625, |
| "learning_rate": 9.5625e-07, |
| "loss": 0.001, |
| "reward": 3.575288772583008, |
| "reward_std": 0.3447410613298416, |
| "rewards/answer_entity_reward": 0.8734935224056244, |
| "rewards/answer_wer_reward": 0.8351728320121765, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8978723287582397, |
| "step": 36 |
| }, |
| { |
| "completion_length": 170.71875, |
| "epoch": 0.1184, |
| "grad_norm": 3.608771800994873, |
| "kl": 0.0318603515625, |
| "learning_rate": 9.55e-07, |
| "loss": 0.0003, |
| "reward": 3.757541060447693, |
| "reward_std": 0.16554252058267593, |
| "rewards/answer_entity_reward": 0.9673819839954376, |
| "rewards/answer_wer_reward": 0.8668203055858612, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9233386218547821, |
| "step": 37 |
| }, |
| { |
| "completion_length": 252.0, |
| "epoch": 0.1216, |
| "grad_norm": 2.063748836517334, |
| "kl": 0.01507568359375, |
| "learning_rate": 9.5375e-07, |
| "loss": 0.0001, |
| "reward": 3.716595768928528, |
| "reward_std": 0.10926416516304016, |
| "rewards/answer_entity_reward": 0.8833416402339935, |
| "rewards/answer_wer_reward": 0.8585604727268219, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9746935665607452, |
| "step": 38 |
| }, |
| { |
| "completion_length": 231.21875, |
| "epoch": 0.1248, |
| "grad_norm": 2.751699447631836, |
| "kl": 0.0213623046875, |
| "learning_rate": 9.525e-07, |
| "loss": 0.0002, |
| "reward": 3.539994239807129, |
| "reward_std": 0.1212783083319664, |
| "rewards/answer_entity_reward": 0.7954491972923279, |
| "rewards/answer_wer_reward": 0.7638055980205536, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.980739563703537, |
| "step": 39 |
| }, |
| { |
| "completion_length": 216.09375, |
| "epoch": 0.128, |
| "grad_norm": 2.074568033218384, |
| "kl": 0.0379638671875, |
| "learning_rate": 9.5125e-07, |
| "loss": 0.0004, |
| "reward": 3.6039533615112305, |
| "reward_std": 0.26473698019981384, |
| "rewards/answer_entity_reward": 0.8746186196804047, |
| "rewards/answer_wer_reward": 0.8307992517948151, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9297854900360107, |
| "step": 40 |
| }, |
| { |
| "completion_length": 203.5, |
| "epoch": 0.1312, |
| "grad_norm": 3.2622625827789307, |
| "kl": 0.05419921875, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.4951852560043335, |
| "reward_std": 0.18541007116436958, |
| "rewards/answer_entity_reward": 0.8705199360847473, |
| "rewards/answer_wer_reward": 0.8321611285209656, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.7925041615962982, |
| "step": 41 |
| }, |
| { |
| "completion_length": 195.84375, |
| "epoch": 0.1344, |
| "grad_norm": 2.3474910259246826, |
| "kl": 0.0272216796875, |
| "learning_rate": 9.487499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.556153178215027, |
| "reward_std": 0.22145777754485607, |
| "rewards/answer_entity_reward": 0.9313356876373291, |
| "rewards/answer_wer_reward": 0.7051927447319031, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9508746266365051, |
| "step": 42 |
| }, |
| { |
| "completion_length": 213.25, |
| "epoch": 0.1376, |
| "grad_norm": 2.805851697921753, |
| "kl": 0.039794921875, |
| "learning_rate": 9.474999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.4438276290893555, |
| "reward_std": 0.306783527135849, |
| "rewards/answer_entity_reward": 0.9020311534404755, |
| "rewards/answer_wer_reward": 0.7658404111862183, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.838456004858017, |
| "step": 43 |
| }, |
| { |
| "completion_length": 237.6875, |
| "epoch": 0.1408, |
| "grad_norm": 1.9424443244934082, |
| "kl": 0.04632568359375, |
| "learning_rate": 9.462499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.6309977769851685, |
| "reward_std": 0.2500930577516556, |
| "rewards/answer_entity_reward": 0.8781489729881287, |
| "rewards/answer_wer_reward": 0.8788634538650513, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9364852905273438, |
| "step": 44 |
| }, |
| { |
| "completion_length": 239.375, |
| "epoch": 0.144, |
| "grad_norm": 46.16355895996094, |
| "kl": 0.0579833984375, |
| "learning_rate": 9.45e-07, |
| "loss": 0.0006, |
| "reward": 3.5368932485580444, |
| "reward_std": 0.43694401532411575, |
| "rewards/answer_entity_reward": 0.919220894575119, |
| "rewards/answer_wer_reward": 0.8205748200416565, |
| "rewards/format_reward": 0.84375, |
| "rewards/think_ocr_reward": 0.9533475041389465, |
| "step": 45 |
| }, |
| { |
| "completion_length": 173.84375, |
| "epoch": 0.1472, |
| "grad_norm": 3.7639763355255127, |
| "kl": 0.0450439453125, |
| "learning_rate": 9.4375e-07, |
| "loss": 0.0004, |
| "reward": 3.7322875261306763, |
| "reward_std": 0.1945570409297943, |
| "rewards/answer_entity_reward": 0.9228407144546509, |
| "rewards/answer_wer_reward": 0.8905497789382935, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9813971817493439, |
| "step": 46 |
| }, |
| { |
| "completion_length": 147.09375, |
| "epoch": 0.1504, |
| "grad_norm": 4.257631301879883, |
| "kl": 0.0538330078125, |
| "learning_rate": 9.425e-07, |
| "loss": 0.0005, |
| "reward": 3.478027820587158, |
| "reward_std": 0.2542489320039749, |
| "rewards/answer_entity_reward": 0.8890827894210815, |
| "rewards/answer_wer_reward": 0.7596322894096375, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8605626821517944, |
| "step": 47 |
| }, |
| { |
| "completion_length": 223.9375, |
| "epoch": 0.1536, |
| "grad_norm": 1.5165725946426392, |
| "kl": 0.0335693359375, |
| "learning_rate": 9.4125e-07, |
| "loss": 0.0003, |
| "reward": 3.695801019668579, |
| "reward_std": 0.21276018023490906, |
| "rewards/answer_entity_reward": 0.9133437275886536, |
| "rewards/answer_wer_reward": 0.8821894526481628, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9315177500247955, |
| "step": 48 |
| }, |
| { |
| "completion_length": 196.15625, |
| "epoch": 0.1568, |
| "grad_norm": 2.7737293243408203, |
| "kl": 0.04931640625, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.7317415475845337, |
| "reward_std": 0.11913972720503807, |
| "rewards/answer_entity_reward": 0.9534916281700134, |
| "rewards/answer_wer_reward": 0.8561010956764221, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9533988535404205, |
| "step": 49 |
| }, |
| { |
| "completion_length": 192.21875, |
| "epoch": 0.16, |
| "grad_norm": 3.4223740100860596, |
| "kl": 0.04052734375, |
| "learning_rate": 9.387499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.65939998626709, |
| "reward_std": 0.1464347057044506, |
| "rewards/answer_entity_reward": 0.9478480219841003, |
| "rewards/answer_wer_reward": 0.8836140036582947, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8279379308223724, |
| "step": 50 |
| }, |
| { |
| "completion_length": 170.75, |
| "epoch": 0.1632, |
| "grad_norm": 3.389747381210327, |
| "kl": 0.0406494140625, |
| "learning_rate": 9.374999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.6742804050445557, |
| "reward_std": 0.21486516296863556, |
| "rewards/answer_entity_reward": 0.9492871761322021, |
| "rewards/answer_wer_reward": 0.8503031730651855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8746900260448456, |
| "step": 51 |
| }, |
| { |
| "completion_length": 249.78125, |
| "epoch": 0.1664, |
| "grad_norm": 1.3398560285568237, |
| "kl": 0.0609130859375, |
| "learning_rate": 9.3625e-07, |
| "loss": 0.0006, |
| "reward": 3.7340474128723145, |
| "reward_std": 0.16536326706409454, |
| "rewards/answer_entity_reward": 0.9178049564361572, |
| "rewards/answer_wer_reward": 0.8599284589290619, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9563138782978058, |
| "step": 52 |
| }, |
| { |
| "completion_length": 243.53125, |
| "epoch": 0.1696, |
| "grad_norm": 2.292407512664795, |
| "kl": 0.035400390625, |
| "learning_rate": 9.35e-07, |
| "loss": 0.0004, |
| "reward": 3.6057465076446533, |
| "reward_std": 0.1650264859199524, |
| "rewards/answer_entity_reward": 0.942800760269165, |
| "rewards/answer_wer_reward": 0.743953675031662, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9189921319484711, |
| "step": 53 |
| }, |
| { |
| "completion_length": 224.4375, |
| "epoch": 0.1728, |
| "grad_norm": 25.665359497070312, |
| "kl": 0.03118896484375, |
| "learning_rate": 9.3375e-07, |
| "loss": 0.0003, |
| "reward": 3.6430656909942627, |
| "reward_std": 0.14360623061656952, |
| "rewards/answer_entity_reward": 0.907882422208786, |
| "rewards/answer_wer_reward": 0.7998041808605194, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9353790581226349, |
| "step": 54 |
| }, |
| { |
| "completion_length": 173.0625, |
| "epoch": 0.176, |
| "grad_norm": 4.687534809112549, |
| "kl": 0.03643798828125, |
| "learning_rate": 9.325e-07, |
| "loss": 0.0004, |
| "reward": 3.776802897453308, |
| "reward_std": 0.10255010426044464, |
| "rewards/answer_entity_reward": 0.9577985405921936, |
| "rewards/answer_wer_reward": 0.8955680429935455, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9234363734722137, |
| "step": 55 |
| }, |
| { |
| "completion_length": 241.84375, |
| "epoch": 0.1792, |
| "grad_norm": 2.1417253017425537, |
| "kl": 0.02978515625, |
| "learning_rate": 9.3125e-07, |
| "loss": 0.0003, |
| "reward": 3.7508766651153564, |
| "reward_std": 0.12244473025202751, |
| "rewards/answer_entity_reward": 0.9196350574493408, |
| "rewards/answer_wer_reward": 0.8353821933269501, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.995859295129776, |
| "step": 56 |
| }, |
| { |
| "completion_length": 214.15625, |
| "epoch": 0.1824, |
| "grad_norm": 2.977281332015991, |
| "kl": 0.03302001953125, |
| "learning_rate": 9.3e-07, |
| "loss": 0.0003, |
| "reward": 3.77036452293396, |
| "reward_std": 0.18844036478549242, |
| "rewards/answer_entity_reward": 0.9284944236278534, |
| "rewards/answer_wer_reward": 0.8541653454303741, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9877048432826996, |
| "step": 57 |
| }, |
| { |
| "completion_length": 245.9375, |
| "epoch": 0.1856, |
| "grad_norm": 1.5624388456344604, |
| "kl": 0.0296630859375, |
| "learning_rate": 9.287499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.7977479696273804, |
| "reward_std": 0.08727182075381279, |
| "rewards/answer_entity_reward": 0.9509085714817047, |
| "rewards/answer_wer_reward": 0.8592260181903839, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9876134395599365, |
| "step": 58 |
| }, |
| { |
| "completion_length": 232.65625, |
| "epoch": 0.1888, |
| "grad_norm": 55.87119674682617, |
| "kl": 0.047607421875, |
| "learning_rate": 9.274999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.6933377981185913, |
| "reward_std": 0.24168139696121216, |
| "rewards/answer_entity_reward": 0.9402236640453339, |
| "rewards/answer_wer_reward": 0.8164783418178558, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9366357624530792, |
| "step": 59 |
| }, |
| { |
| "completion_length": 221.65625, |
| "epoch": 0.192, |
| "grad_norm": 1.8363709449768066, |
| "kl": 0.04156494140625, |
| "learning_rate": 9.2625e-07, |
| "loss": 0.0004, |
| "reward": 3.8290294408798218, |
| "reward_std": 0.08228548988699913, |
| "rewards/answer_entity_reward": 0.9317659735679626, |
| "rewards/answer_wer_reward": 0.9017607867717743, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.995502769947052, |
| "step": 60 |
| }, |
| { |
| "completion_length": 207.5625, |
| "epoch": 0.1952, |
| "grad_norm": 5.360762119293213, |
| "kl": 0.03662109375, |
| "learning_rate": 9.25e-07, |
| "loss": 0.0004, |
| "reward": 3.4508965015411377, |
| "reward_std": 0.24354729056358337, |
| "rewards/answer_entity_reward": 0.8888726234436035, |
| "rewards/answer_wer_reward": 0.6527576148509979, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9092662334442139, |
| "step": 61 |
| }, |
| { |
| "completion_length": 175.3125, |
| "epoch": 0.1984, |
| "grad_norm": 6.900688171386719, |
| "kl": 0.0562744140625, |
| "learning_rate": 9.237499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.5809485912323, |
| "reward_std": 0.27670779824256897, |
| "rewards/answer_entity_reward": 0.875405490398407, |
| "rewards/answer_wer_reward": 0.846805214881897, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8587377667427063, |
| "step": 62 |
| }, |
| { |
| "completion_length": 167.21875, |
| "epoch": 0.2016, |
| "grad_norm": 3.296032667160034, |
| "kl": 0.03668212890625, |
| "learning_rate": 9.225e-07, |
| "loss": 0.0004, |
| "reward": 3.775553345680237, |
| "reward_std": 0.1621587909758091, |
| "rewards/answer_entity_reward": 0.9595959782600403, |
| "rewards/answer_wer_reward": 0.900894969701767, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9150623679161072, |
| "step": 63 |
| }, |
| { |
| "completion_length": 216.0625, |
| "epoch": 0.2048, |
| "grad_norm": 3.287728786468506, |
| "kl": 0.05419921875, |
| "learning_rate": 9.2125e-07, |
| "loss": 0.0005, |
| "reward": 3.580909013748169, |
| "reward_std": 0.37151331454515457, |
| "rewards/answer_entity_reward": 0.9558238685131073, |
| "rewards/answer_wer_reward": 0.816798210144043, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8395369946956635, |
| "step": 64 |
| }, |
| { |
| "completion_length": 242.1875, |
| "epoch": 0.208, |
| "grad_norm": 4.7966766357421875, |
| "kl": 0.0389404296875, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0004, |
| "reward": 3.5479079484939575, |
| "reward_std": 0.34015993028879166, |
| "rewards/answer_entity_reward": 0.9070779979228973, |
| "rewards/answer_wer_reward": 0.7606107890605927, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9114691615104675, |
| "step": 65 |
| }, |
| { |
| "completion_length": 182.9375, |
| "epoch": 0.2112, |
| "grad_norm": 4.85190486907959, |
| "kl": 0.0411376953125, |
| "learning_rate": 9.187499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.759209156036377, |
| "reward_std": 0.030521959997713566, |
| "rewards/answer_entity_reward": 0.9572916924953461, |
| "rewards/answer_wer_reward": 0.9216786324977875, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.880238950252533, |
| "step": 66 |
| }, |
| { |
| "completion_length": 197.46875, |
| "epoch": 0.2144, |
| "grad_norm": 2.888380765914917, |
| "kl": 0.03271484375, |
| "learning_rate": 9.174999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.86090886592865, |
| "reward_std": 0.08941158838570118, |
| "rewards/answer_entity_reward": 0.974116176366806, |
| "rewards/answer_wer_reward": 0.9031813442707062, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9836114048957825, |
| "step": 67 |
| }, |
| { |
| "completion_length": 212.03125, |
| "epoch": 0.2176, |
| "grad_norm": 0.9500738382339478, |
| "kl": 0.03057861328125, |
| "learning_rate": 9.1625e-07, |
| "loss": 0.0003, |
| "reward": 3.865835189819336, |
| "reward_std": 0.04183580353856087, |
| "rewards/answer_entity_reward": 0.9732177555561066, |
| "rewards/answer_wer_reward": 0.8951224386692047, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974949955940247, |
| "step": 68 |
| }, |
| { |
| "completion_length": 168.09375, |
| "epoch": 0.2208, |
| "grad_norm": 4.705175876617432, |
| "kl": 0.03704833984375, |
| "learning_rate": 9.15e-07, |
| "loss": 0.0004, |
| "reward": 3.6963913440704346, |
| "reward_std": 0.16030436754226685, |
| "rewards/answer_entity_reward": 0.932018518447876, |
| "rewards/answer_wer_reward": 0.8480667769908905, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9163061678409576, |
| "step": 69 |
| }, |
| { |
| "completion_length": 193.21875, |
| "epoch": 0.224, |
| "grad_norm": 2.125580310821533, |
| "kl": 0.03515625, |
| "learning_rate": 9.137499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8550466299057007, |
| "reward_std": 0.06468157470226288, |
| "rewards/answer_entity_reward": 0.9734202921390533, |
| "rewards/answer_wer_reward": 0.8898061215877533, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9918202459812164, |
| "step": 70 |
| }, |
| { |
| "completion_length": 235.78125, |
| "epoch": 0.2272, |
| "grad_norm": 6.89145040512085, |
| "kl": 0.042236328125, |
| "learning_rate": 9.124999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.725824475288391, |
| "reward_std": 0.05315144546329975, |
| "rewards/answer_entity_reward": 0.9593958258628845, |
| "rewards/answer_wer_reward": 0.8827618062496185, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8836667835712433, |
| "step": 71 |
| }, |
| { |
| "completion_length": 210.1875, |
| "epoch": 0.2304, |
| "grad_norm": 3.6971681118011475, |
| "kl": 0.0343017578125, |
| "learning_rate": 9.1125e-07, |
| "loss": 0.0003, |
| "reward": 3.719637870788574, |
| "reward_std": 0.10697400569915771, |
| "rewards/answer_entity_reward": 0.9880050718784332, |
| "rewards/answer_wer_reward": 0.7961998879909515, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9354328513145447, |
| "step": 72 |
| }, |
| { |
| "completion_length": 216.5625, |
| "epoch": 0.2336, |
| "grad_norm": 17.082843780517578, |
| "kl": 0.0537109375, |
| "learning_rate": 9.1e-07, |
| "loss": 0.0005, |
| "reward": 3.6063274145126343, |
| "reward_std": 0.2845265045762062, |
| "rewards/answer_entity_reward": 0.9374077320098877, |
| "rewards/answer_wer_reward": 0.7878484427928925, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.912321150302887, |
| "step": 73 |
| }, |
| { |
| "completion_length": 234.90625, |
| "epoch": 0.2368, |
| "grad_norm": 1.9695632457733154, |
| "kl": 0.031982421875, |
| "learning_rate": 9.087499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.762009024620056, |
| "reward_std": 0.06560477986931801, |
| "rewards/answer_entity_reward": 0.9398341476917267, |
| "rewards/answer_wer_reward": 0.8473882973194122, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9747865796089172, |
| "step": 74 |
| }, |
| { |
| "completion_length": 233.65625, |
| "epoch": 0.24, |
| "grad_norm": 1.8333961963653564, |
| "kl": 0.0479736328125, |
| "learning_rate": 9.074999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.6872040033340454, |
| "reward_std": 0.12730678915977478, |
| "rewards/answer_entity_reward": 0.9421398341655731, |
| "rewards/answer_wer_reward": 0.8499290347099304, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.895135223865509, |
| "step": 75 |
| }, |
| { |
| "completion_length": 138.8125, |
| "epoch": 0.2432, |
| "grad_norm": 2.518507719039917, |
| "kl": 0.0504150390625, |
| "learning_rate": 9.0625e-07, |
| "loss": 0.0005, |
| "reward": 3.751777410507202, |
| "reward_std": 0.18188580125570297, |
| "rewards/answer_entity_reward": 0.9270697832107544, |
| "rewards/answer_wer_reward": 0.9237564206123352, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9009511768817902, |
| "step": 76 |
| }, |
| { |
| "completion_length": 261.0, |
| "epoch": 0.2464, |
| "grad_norm": 4.395165920257568, |
| "kl": 0.03662109375, |
| "learning_rate": 9.05e-07, |
| "loss": 0.0004, |
| "reward": 3.602410674095154, |
| "reward_std": 0.12657387554645538, |
| "rewards/answer_entity_reward": 0.8546798527240753, |
| "rewards/answer_wer_reward": 0.794090747833252, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9536401331424713, |
| "step": 77 |
| }, |
| { |
| "completion_length": 221.90625, |
| "epoch": 0.2496, |
| "grad_norm": 1.2728471755981445, |
| "kl": 0.0294189453125, |
| "learning_rate": 9.0375e-07, |
| "loss": 0.0003, |
| "reward": 3.788708806037903, |
| "reward_std": 0.09669506549835205, |
| "rewards/answer_entity_reward": 0.9447909295558929, |
| "rewards/answer_wer_reward": 0.8481404483318329, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9957774579524994, |
| "step": 78 |
| }, |
| { |
| "completion_length": 254.5625, |
| "epoch": 0.2528, |
| "grad_norm": 9.725419998168945, |
| "kl": 0.07373046875, |
| "learning_rate": 9.024999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.668743133544922, |
| "reward_std": 0.1221558079123497, |
| "rewards/answer_entity_reward": 0.9435009658336639, |
| "rewards/answer_wer_reward": 0.8254426419734955, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8997994065284729, |
| "step": 79 |
| }, |
| { |
| "completion_length": 196.375, |
| "epoch": 0.256, |
| "grad_norm": 2.1853079795837402, |
| "kl": 0.0361328125, |
| "learning_rate": 9.0125e-07, |
| "loss": 0.0004, |
| "reward": 3.6546449661254883, |
| "reward_std": 0.1971728727221489, |
| "rewards/answer_entity_reward": 0.9472028017044067, |
| "rewards/answer_wer_reward": 0.8720800876617432, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8666120767593384, |
| "step": 80 |
| }, |
| { |
| "completion_length": 248.25, |
| "epoch": 0.2592, |
| "grad_norm": 3.1572227478027344, |
| "kl": 0.03375244140625, |
| "learning_rate": 9e-07, |
| "loss": 0.0003, |
| "reward": 3.7423981428146362, |
| "reward_std": 0.10061750188469887, |
| "rewards/answer_entity_reward": 0.9398939311504364, |
| "rewards/answer_wer_reward": 0.8211633265018463, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9813408553600311, |
| "step": 81 |
| }, |
| { |
| "completion_length": 238.40625, |
| "epoch": 0.2624, |
| "grad_norm": 1.5329415798187256, |
| "kl": 0.03125, |
| "learning_rate": 8.9875e-07, |
| "loss": 0.0003, |
| "reward": 3.874926447868347, |
| "reward_std": 0.03685523197054863, |
| "rewards/answer_entity_reward": 0.9718094170093536, |
| "rewards/answer_wer_reward": 0.9062533378601074, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968636631965637, |
| "step": 82 |
| }, |
| { |
| "completion_length": 222.5, |
| "epoch": 0.2656, |
| "grad_norm": 2.012899875640869, |
| "kl": 0.0419921875, |
| "learning_rate": 8.974999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8047072887420654, |
| "reward_std": 0.046287354081869125, |
| "rewards/answer_entity_reward": 0.9534181356430054, |
| "rewards/answer_wer_reward": 0.8727244138717651, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9785646796226501, |
| "step": 83 |
| }, |
| { |
| "completion_length": 225.21875, |
| "epoch": 0.2688, |
| "grad_norm": 1.5400514602661133, |
| "kl": 0.0380859375, |
| "learning_rate": 8.9625e-07, |
| "loss": 0.0004, |
| "reward": 3.718083620071411, |
| "reward_std": 0.1703677996993065, |
| "rewards/answer_entity_reward": 0.9013731181621552, |
| "rewards/answer_wer_reward": 0.8260438740253448, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9906666278839111, |
| "step": 84 |
| }, |
| { |
| "completion_length": 236.125, |
| "epoch": 0.272, |
| "grad_norm": 1.6224849224090576, |
| "kl": 0.0550537109375, |
| "learning_rate": 8.95e-07, |
| "loss": 0.0005, |
| "reward": 3.8032166957855225, |
| "reward_std": 0.0796846654266119, |
| "rewards/answer_entity_reward": 0.9553452134132385, |
| "rewards/answer_wer_reward": 0.8544089794158936, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9934625327587128, |
| "step": 85 |
| }, |
| { |
| "completion_length": 214.34375, |
| "epoch": 0.2752, |
| "grad_norm": 3.1244239807128906, |
| "kl": 0.032470703125, |
| "learning_rate": 8.9375e-07, |
| "loss": 0.0003, |
| "reward": 3.803860068321228, |
| "reward_std": 0.06684968620538712, |
| "rewards/answer_entity_reward": 0.9671759307384491, |
| "rewards/answer_wer_reward": 0.9067878127098083, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9298965036869049, |
| "step": 86 |
| }, |
| { |
| "completion_length": 216.9375, |
| "epoch": 0.2784, |
| "grad_norm": 1.8527048826217651, |
| "kl": 0.02996826171875, |
| "learning_rate": 8.924999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.813448429107666, |
| "reward_std": 0.05041965842247009, |
| "rewards/answer_entity_reward": 0.9224496483802795, |
| "rewards/answer_wer_reward": 0.8932149708271027, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977837204933167, |
| "step": 87 |
| }, |
| { |
| "completion_length": 211.8125, |
| "epoch": 0.2816, |
| "grad_norm": 2.733228921890259, |
| "kl": 0.05126953125, |
| "learning_rate": 8.912499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8481240272521973, |
| "reward_std": 0.0621240958571434, |
| "rewards/answer_entity_reward": 0.9627074301242828, |
| "rewards/answer_wer_reward": 0.9041622579097748, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9812542796134949, |
| "step": 88 |
| }, |
| { |
| "completion_length": 202.5, |
| "epoch": 0.2848, |
| "grad_norm": 4.8413496017456055, |
| "kl": 0.0433349609375, |
| "learning_rate": 8.9e-07, |
| "loss": 0.0004, |
| "reward": 3.668493866920471, |
| "reward_std": 0.08999799937009811, |
| "rewards/answer_entity_reward": 0.96169114112854, |
| "rewards/answer_wer_reward": 0.7791127562522888, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.92768993973732, |
| "step": 89 |
| }, |
| { |
| "completion_length": 214.6875, |
| "epoch": 0.288, |
| "grad_norm": 4.111961841583252, |
| "kl": 0.04638671875, |
| "learning_rate": 8.8875e-07, |
| "loss": 0.0005, |
| "reward": 3.7720965147018433, |
| "reward_std": 0.18014637380838394, |
| "rewards/answer_entity_reward": 0.9866696000099182, |
| "rewards/answer_wer_reward": 0.8934727013111115, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9232043027877808, |
| "step": 90 |
| }, |
| { |
| "completion_length": 241.5625, |
| "epoch": 0.2912, |
| "grad_norm": 1.4061272144317627, |
| "kl": 0.0460205078125, |
| "learning_rate": 8.874999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.828965663909912, |
| "reward_std": 0.04340291768312454, |
| "rewards/answer_entity_reward": 0.9683369398117065, |
| "rewards/answer_wer_reward": 0.8706588447093964, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9899699091911316, |
| "step": 91 |
| }, |
| { |
| "completion_length": 247.9375, |
| "epoch": 0.2944, |
| "grad_norm": 1.6669530868530273, |
| "kl": 0.0611572265625, |
| "learning_rate": 8.8625e-07, |
| "loss": 0.0006, |
| "reward": 3.7649370431900024, |
| "reward_std": 0.1087912805378437, |
| "rewards/answer_entity_reward": 0.9332223832607269, |
| "rewards/answer_wer_reward": 0.8357318043708801, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9959828853607178, |
| "step": 92 |
| }, |
| { |
| "completion_length": 162.625, |
| "epoch": 0.2976, |
| "grad_norm": 5.615991115570068, |
| "kl": 0.058837890625, |
| "learning_rate": 8.85e-07, |
| "loss": 0.0006, |
| "reward": 3.8870660066604614, |
| "reward_std": 0.09454158693552017, |
| "rewards/answer_entity_reward": 0.9939196705818176, |
| "rewards/answer_wer_reward": 0.9443124830722809, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9488338530063629, |
| "step": 93 |
| }, |
| { |
| "completion_length": 256.625, |
| "epoch": 0.3008, |
| "grad_norm": 2.879868984222412, |
| "kl": 0.2406005859375, |
| "learning_rate": 8.8375e-07, |
| "loss": 0.0024, |
| "reward": 3.6465322971343994, |
| "reward_std": 0.23435086756944656, |
| "rewards/answer_entity_reward": 0.9395784735679626, |
| "rewards/answer_wer_reward": 0.7715516090393066, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9666522741317749, |
| "step": 94 |
| }, |
| { |
| "completion_length": 254.15625, |
| "epoch": 0.304, |
| "grad_norm": 6.1645121574401855, |
| "kl": 0.262939453125, |
| "learning_rate": 8.824999999999999e-07, |
| "loss": 0.0026, |
| "reward": 3.728961229324341, |
| "reward_std": 0.11308889091014862, |
| "rewards/answer_entity_reward": 0.9466511011123657, |
| "rewards/answer_wer_reward": 0.8248744010925293, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9574357271194458, |
| "step": 95 |
| }, |
| { |
| "completion_length": 202.3125, |
| "epoch": 0.3072, |
| "grad_norm": 2.2792811393737793, |
| "kl": 0.0501708984375, |
| "learning_rate": 8.812499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.856202244758606, |
| "reward_std": 0.05682223103940487, |
| "rewards/answer_entity_reward": 0.9909722208976746, |
| "rewards/answer_wer_reward": 0.9035914540290833, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9616385698318481, |
| "step": 96 |
| }, |
| { |
| "completion_length": 222.53125, |
| "epoch": 0.3104, |
| "grad_norm": 2.4435033798217773, |
| "kl": 0.051513671875, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8195481300354004, |
| "reward_std": 0.08100517094135284, |
| "rewards/answer_entity_reward": 0.979785680770874, |
| "rewards/answer_wer_reward": 0.8738153576850891, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.965947151184082, |
| "step": 97 |
| }, |
| { |
| "completion_length": 202.375, |
| "epoch": 0.3136, |
| "grad_norm": 1.7632919549942017, |
| "kl": 0.0357666015625, |
| "learning_rate": 8.7875e-07, |
| "loss": 0.0004, |
| "reward": 3.7597837448120117, |
| "reward_std": 0.061054665595293045, |
| "rewards/answer_entity_reward": 0.9468090534210205, |
| "rewards/answer_wer_reward": 0.8723107874393463, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9406639635562897, |
| "step": 98 |
| }, |
| { |
| "completion_length": 207.53125, |
| "epoch": 0.3168, |
| "grad_norm": 7.402034282684326, |
| "kl": 0.04736328125, |
| "learning_rate": 8.774999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.7576065063476562, |
| "reward_std": 0.04146904498338699, |
| "rewards/answer_entity_reward": 0.9389799237251282, |
| "rewards/answer_wer_reward": 0.8201505243778229, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984759986400604, |
| "step": 99 |
| }, |
| { |
| "completion_length": 210.15625, |
| "epoch": 0.32, |
| "grad_norm": 1.5828880071640015, |
| "kl": 0.0450439453125, |
| "learning_rate": 8.7625e-07, |
| "loss": 0.0004, |
| "reward": 3.835609197616577, |
| "reward_std": 0.12980258837342262, |
| "rewards/answer_entity_reward": 0.9350627064704895, |
| "rewards/answer_wer_reward": 0.9053294062614441, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9952171444892883, |
| "step": 100 |
| }, |
| { |
| "completion_length": 214.34375, |
| "epoch": 0.3232, |
| "grad_norm": 5.768563270568848, |
| "kl": 0.055908203125, |
| "learning_rate": 8.75e-07, |
| "loss": 0.0006, |
| "reward": 3.611391305923462, |
| "reward_std": 0.2522353269159794, |
| "rewards/answer_entity_reward": 0.9709455966949463, |
| "rewards/answer_wer_reward": 0.7870493829250336, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8846463263034821, |
| "step": 101 |
| }, |
| { |
| "completion_length": 223.0, |
| "epoch": 0.3264, |
| "grad_norm": 7.32905387878418, |
| "kl": 0.0755615234375, |
| "learning_rate": 8.7375e-07, |
| "loss": 0.0008, |
| "reward": 3.7200475931167603, |
| "reward_std": 0.18947013467550278, |
| "rewards/answer_entity_reward": 0.9668727219104767, |
| "rewards/answer_wer_reward": 0.8209056556224823, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9635193049907684, |
| "step": 102 |
| }, |
| { |
| "completion_length": 229.96875, |
| "epoch": 0.3296, |
| "grad_norm": 0.9038276672363281, |
| "kl": 0.0411376953125, |
| "learning_rate": 8.725e-07, |
| "loss": 0.0004, |
| "reward": 3.862263560295105, |
| "reward_std": 0.03179450985044241, |
| "rewards/answer_entity_reward": 0.9754428863525391, |
| "rewards/answer_wer_reward": 0.8931463062763214, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9936743974685669, |
| "step": 103 |
| }, |
| { |
| "completion_length": 265.3125, |
| "epoch": 0.3328, |
| "grad_norm": 1.3424818515777588, |
| "kl": 0.0357666015625, |
| "learning_rate": 8.712499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.7375279664993286, |
| "reward_std": 0.07805093377828598, |
| "rewards/answer_entity_reward": 0.9291824698448181, |
| "rewards/answer_wer_reward": 0.8219992816448212, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9863462746143341, |
| "step": 104 |
| }, |
| { |
| "completion_length": 252.78125, |
| "epoch": 0.336, |
| "grad_norm": 1.2035622596740723, |
| "kl": 0.033203125, |
| "learning_rate": 8.699999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.8339978456497192, |
| "reward_std": 0.05473129637539387, |
| "rewards/answer_entity_reward": 0.9795939922332764, |
| "rewards/answer_wer_reward": 0.8663320243358612, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9880718290805817, |
| "step": 105 |
| }, |
| { |
| "completion_length": 162.96875, |
| "epoch": 0.3392, |
| "grad_norm": 7.1932783126831055, |
| "kl": 0.06005859375, |
| "learning_rate": 8.687499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8799617290496826, |
| "reward_std": 0.0983762014657259, |
| "rewards/answer_entity_reward": 0.9810606241226196, |
| "rewards/answer_wer_reward": 0.9314018487930298, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9674993753433228, |
| "step": 106 |
| }, |
| { |
| "completion_length": 225.625, |
| "epoch": 0.3424, |
| "grad_norm": 5.7709455490112305, |
| "kl": 0.05291748046875, |
| "learning_rate": 8.675000000000001e-07, |
| "loss": 0.0005, |
| "reward": 3.7411450147628784, |
| "reward_std": 0.2322532683610916, |
| "rewards/answer_entity_reward": 0.9225597083568573, |
| "rewards/answer_wer_reward": 0.8556761145591736, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9941591918468475, |
| "step": 107 |
| }, |
| { |
| "completion_length": 194.78125, |
| "epoch": 0.3456, |
| "grad_norm": 5.741571426391602, |
| "kl": 0.0556640625, |
| "learning_rate": 8.6625e-07, |
| "loss": 0.0006, |
| "reward": 3.867335319519043, |
| "reward_std": 0.03972470294684172, |
| "rewards/answer_entity_reward": 0.9573142230510712, |
| "rewards/answer_wer_reward": 0.9156463444232941, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9943746328353882, |
| "step": 108 |
| }, |
| { |
| "completion_length": 228.5, |
| "epoch": 0.3488, |
| "grad_norm": 6.1572089195251465, |
| "kl": 0.05859375, |
| "learning_rate": 8.65e-07, |
| "loss": 0.0006, |
| "reward": 3.673606753349304, |
| "reward_std": 0.08745867013931274, |
| "rewards/answer_entity_reward": 0.9391757845878601, |
| "rewards/answer_wer_reward": 0.8806695938110352, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8537613451480865, |
| "step": 109 |
| }, |
| { |
| "completion_length": 216.5, |
| "epoch": 0.352, |
| "grad_norm": 2.032820701599121, |
| "kl": 0.0518798828125, |
| "learning_rate": 8.6375e-07, |
| "loss": 0.0005, |
| "reward": 3.6381773948669434, |
| "reward_std": 0.11543435975909233, |
| "rewards/answer_entity_reward": 0.9743416607379913, |
| "rewards/answer_wer_reward": 0.7632936537265778, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9005421102046967, |
| "step": 110 |
| }, |
| { |
| "completion_length": 220.96875, |
| "epoch": 0.3552, |
| "grad_norm": 4.737320423126221, |
| "kl": 0.0751953125, |
| "learning_rate": 8.625e-07, |
| "loss": 0.0008, |
| "reward": 3.823172926902771, |
| "reward_std": 0.04683285113424063, |
| "rewards/answer_entity_reward": 0.9827152192592621, |
| "rewards/answer_wer_reward": 0.8539322018623352, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9865254759788513, |
| "step": 111 |
| }, |
| { |
| "completion_length": 248.75, |
| "epoch": 0.3584, |
| "grad_norm": 3.7395012378692627, |
| "kl": 0.0484619140625, |
| "learning_rate": 8.612499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.835617423057556, |
| "reward_std": 0.039440929889678955, |
| "rewards/answer_entity_reward": 0.9718195497989655, |
| "rewards/answer_wer_reward": 0.8654404282569885, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983574748039246, |
| "step": 112 |
| }, |
| { |
| "completion_length": 218.75, |
| "epoch": 0.3616, |
| "grad_norm": 3.5470447540283203, |
| "kl": 0.10302734375, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.766317844390869, |
| "reward_std": 0.0799998790025711, |
| "rewards/answer_entity_reward": 0.9724812507629395, |
| "rewards/answer_wer_reward": 0.8530462384223938, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9407903254032135, |
| "step": 113 |
| }, |
| { |
| "completion_length": 231.4375, |
| "epoch": 0.3648, |
| "grad_norm": 4.614479064941406, |
| "kl": 0.060546875, |
| "learning_rate": 8.587499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.828564405441284, |
| "reward_std": 0.030111415311694145, |
| "rewards/answer_entity_reward": 0.9710638523101807, |
| "rewards/answer_wer_reward": 0.866461992263794, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.991038590669632, |
| "step": 114 |
| }, |
| { |
| "completion_length": 237.875, |
| "epoch": 0.368, |
| "grad_norm": 1.1590646505355835, |
| "kl": 0.046142578125, |
| "learning_rate": 8.575e-07, |
| "loss": 0.0005, |
| "reward": 3.870112419128418, |
| "reward_std": 0.051995884627103806, |
| "rewards/answer_entity_reward": 0.9711016416549683, |
| "rewards/answer_wer_reward": 0.9016274213790894, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973834156990051, |
| "step": 115 |
| }, |
| { |
| "completion_length": 234.59375, |
| "epoch": 0.3712, |
| "grad_norm": 1.4525243043899536, |
| "kl": 0.113525390625, |
| "learning_rate": 8.5625e-07, |
| "loss": 0.0011, |
| "reward": 3.755509376525879, |
| "reward_std": 0.10925759375095367, |
| "rewards/answer_entity_reward": 0.9556345045566559, |
| "rewards/answer_wer_reward": 0.8394978046417236, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9603771269321442, |
| "step": 116 |
| }, |
| { |
| "completion_length": 220.4375, |
| "epoch": 0.3744, |
| "grad_norm": 1.6397019624710083, |
| "kl": 0.15234375, |
| "learning_rate": 8.55e-07, |
| "loss": 0.0015, |
| "reward": 3.829906702041626, |
| "reward_std": 0.03734264615923166, |
| "rewards/answer_entity_reward": 0.9839539229869843, |
| "rewards/answer_wer_reward": 0.8527026474475861, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9932501018047333, |
| "step": 117 |
| }, |
| { |
| "completion_length": 209.53125, |
| "epoch": 0.3776, |
| "grad_norm": 3.598604440689087, |
| "kl": 0.07861328125, |
| "learning_rate": 8.5375e-07, |
| "loss": 0.0008, |
| "reward": 3.7239131927490234, |
| "reward_std": 0.07304626516997814, |
| "rewards/answer_entity_reward": 0.9540751278400421, |
| "rewards/answer_wer_reward": 0.8128292262554169, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9570088684558868, |
| "step": 118 |
| }, |
| { |
| "completion_length": 207.71875, |
| "epoch": 0.3808, |
| "grad_norm": 1.9592057466506958, |
| "kl": 0.0435791015625, |
| "learning_rate": 8.525e-07, |
| "loss": 0.0004, |
| "reward": 3.8095905780792236, |
| "reward_std": 0.15753451362252235, |
| "rewards/answer_entity_reward": 0.9857954680919647, |
| "rewards/answer_wer_reward": 0.9040109515190125, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9510340690612793, |
| "step": 119 |
| }, |
| { |
| "completion_length": 245.90625, |
| "epoch": 0.384, |
| "grad_norm": 1.7574220895767212, |
| "kl": 0.0609130859375, |
| "learning_rate": 8.512499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.854837656021118, |
| "reward_std": 0.0384799987077713, |
| "rewards/answer_entity_reward": 0.9729723632335663, |
| "rewards/answer_wer_reward": 0.8832501769065857, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998615026473999, |
| "step": 120 |
| }, |
| { |
| "completion_length": 187.15625, |
| "epoch": 0.3872, |
| "grad_norm": 8.7343168258667, |
| "kl": 0.0494384765625, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.7950538396835327, |
| "reward_std": 0.09329042956233025, |
| "rewards/answer_entity_reward": 0.9599206745624542, |
| "rewards/answer_wer_reward": 0.9000534117221832, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.93507981300354, |
| "step": 121 |
| }, |
| { |
| "completion_length": 203.84375, |
| "epoch": 0.3904, |
| "grad_norm": 3.7000162601470947, |
| "kl": 0.062744140625, |
| "learning_rate": 8.487499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8687225580215454, |
| "reward_std": 0.03621992561966181, |
| "rewards/answer_entity_reward": 0.9873106181621552, |
| "rewards/answer_wer_reward": 0.8855177164077759, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9958942830562592, |
| "step": 122 |
| }, |
| { |
| "completion_length": 205.84375, |
| "epoch": 0.3936, |
| "grad_norm": 9.27507209777832, |
| "kl": 0.0570068359375, |
| "learning_rate": 8.475e-07, |
| "loss": 0.0006, |
| "reward": 3.7104525566101074, |
| "reward_std": 0.05549425818026066, |
| "rewards/answer_entity_reward": 0.955735981464386, |
| "rewards/answer_wer_reward": 0.8933148980140686, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.86140176653862, |
| "step": 123 |
| }, |
| { |
| "completion_length": 246.46875, |
| "epoch": 0.3968, |
| "grad_norm": 2.3181021213531494, |
| "kl": 0.0404052734375, |
| "learning_rate": 8.462499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.821496605873108, |
| "reward_std": 0.09581143222749233, |
| "rewards/answer_entity_reward": 0.9655607342720032, |
| "rewards/answer_wer_reward": 0.8666167855262756, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9893191456794739, |
| "step": 124 |
| }, |
| { |
| "completion_length": 206.3125, |
| "epoch": 0.4, |
| "grad_norm": 1.5352882146835327, |
| "kl": 0.055419921875, |
| "learning_rate": 8.45e-07, |
| "loss": 0.0006, |
| "reward": 3.831603527069092, |
| "reward_std": 0.08168897591531277, |
| "rewards/answer_entity_reward": 0.9702457189559937, |
| "rewards/answer_wer_reward": 0.9070821702480316, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9542755484580994, |
| "step": 125 |
| }, |
| { |
| "completion_length": 240.28125, |
| "epoch": 0.4032, |
| "grad_norm": 1.380315899848938, |
| "kl": 0.05908203125, |
| "learning_rate": 8.4375e-07, |
| "loss": 0.0006, |
| "reward": 3.7971588373184204, |
| "reward_std": 0.10537005960941315, |
| "rewards/answer_entity_reward": 0.9396995604038239, |
| "rewards/answer_wer_reward": 0.8588653802871704, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985939264297485, |
| "step": 126 |
| }, |
| { |
| "completion_length": 206.84375, |
| "epoch": 0.4064, |
| "grad_norm": 1.5937124490737915, |
| "kl": 0.056884765625, |
| "learning_rate": 8.425e-07, |
| "loss": 0.0006, |
| "reward": 3.8375606536865234, |
| "reward_std": 0.047878991812467575, |
| "rewards/answer_entity_reward": 0.9553684592247009, |
| "rewards/answer_wer_reward": 0.8867217302322388, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9954704642295837, |
| "step": 127 |
| }, |
| { |
| "completion_length": 214.1875, |
| "epoch": 0.4096, |
| "grad_norm": 1.3648440837860107, |
| "kl": 0.0687255859375, |
| "learning_rate": 8.4125e-07, |
| "loss": 0.0007, |
| "reward": 3.8555803298950195, |
| "reward_std": 0.05176056548953056, |
| "rewards/answer_entity_reward": 0.9823863804340363, |
| "rewards/answer_wer_reward": 0.8972643911838531, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9759295582771301, |
| "step": 128 |
| }, |
| { |
| "completion_length": 215.90625, |
| "epoch": 0.4128, |
| "grad_norm": 1.4308183193206787, |
| "kl": 0.0390625, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8976725339889526, |
| "reward_std": 0.016966319642961025, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9021182060241699, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997209906578064, |
| "step": 129 |
| }, |
| { |
| "completion_length": 189.1875, |
| "epoch": 0.416, |
| "grad_norm": 7.785026550292969, |
| "kl": 0.0506591796875, |
| "learning_rate": 8.387499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.7563494443893433, |
| "reward_std": 0.12806903570890427, |
| "rewards/answer_entity_reward": 0.9905131459236145, |
| "rewards/answer_wer_reward": 0.8918424248695374, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8739938735961914, |
| "step": 130 |
| }, |
| { |
| "completion_length": 211.21875, |
| "epoch": 0.4192, |
| "grad_norm": 6.029291152954102, |
| "kl": 0.0860595703125, |
| "learning_rate": 8.375e-07, |
| "loss": 0.0009, |
| "reward": 3.7876737117767334, |
| "reward_std": 0.07924951426684856, |
| "rewards/answer_entity_reward": 0.9788058996200562, |
| "rewards/answer_wer_reward": 0.903822124004364, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9050455689430237, |
| "step": 131 |
| }, |
| { |
| "completion_length": 197.9375, |
| "epoch": 0.4224, |
| "grad_norm": 1.5226598978042603, |
| "kl": 0.0865478515625, |
| "learning_rate": 8.3625e-07, |
| "loss": 0.0009, |
| "reward": 3.8618096113204956, |
| "reward_std": 0.024674754589796066, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.9131532609462738, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9549693167209625, |
| "step": 132 |
| }, |
| { |
| "completion_length": 178.28125, |
| "epoch": 0.4256, |
| "grad_norm": 4.81843376159668, |
| "kl": 0.1806640625, |
| "learning_rate": 8.349999999999999e-07, |
| "loss": 0.0018, |
| "reward": 3.8692500591278076, |
| "reward_std": 0.0898860078305006, |
| "rewards/answer_entity_reward": 0.9539262652397156, |
| "rewards/answer_wer_reward": 0.9164533317089081, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988704919815063, |
| "step": 133 |
| }, |
| { |
| "completion_length": 214.3125, |
| "epoch": 0.4288, |
| "grad_norm": 3.702409267425537, |
| "kl": 0.100341796875, |
| "learning_rate": 8.3375e-07, |
| "loss": 0.001, |
| "reward": 3.7666897773742676, |
| "reward_std": 0.05854834243655205, |
| "rewards/answer_entity_reward": 0.9739753007888794, |
| "rewards/answer_wer_reward": 0.8456098437309265, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9471046626567841, |
| "step": 134 |
| }, |
| { |
| "completion_length": 230.3125, |
| "epoch": 0.432, |
| "grad_norm": 4.869428634643555, |
| "kl": 0.109619140625, |
| "learning_rate": 8.325e-07, |
| "loss": 0.0011, |
| "reward": 3.837371587753296, |
| "reward_std": 0.07383839413523674, |
| "rewards/answer_entity_reward": 0.9623282849788666, |
| "rewards/answer_wer_reward": 0.8914425075054169, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9836008548736572, |
| "step": 135 |
| }, |
| { |
| "completion_length": 222.84375, |
| "epoch": 0.4352, |
| "grad_norm": 1.1195542812347412, |
| "kl": 0.0875244140625, |
| "learning_rate": 8.3125e-07, |
| "loss": 0.0009, |
| "reward": 3.800593137741089, |
| "reward_std": 0.05516563355922699, |
| "rewards/answer_entity_reward": 0.977182537317276, |
| "rewards/answer_wer_reward": 0.8409056067466736, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9825051128864288, |
| "step": 136 |
| }, |
| { |
| "completion_length": 156.28125, |
| "epoch": 0.4384, |
| "grad_norm": 2.307365655899048, |
| "kl": 0.0631103515625, |
| "learning_rate": 8.299999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.803721785545349, |
| "reward_std": 0.1857592761516571, |
| "rewards/answer_entity_reward": 0.9582379460334778, |
| "rewards/answer_wer_reward": 0.9269835352897644, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9185003936290741, |
| "step": 137 |
| }, |
| { |
| "completion_length": 230.3125, |
| "epoch": 0.4416, |
| "grad_norm": 1.0649584531784058, |
| "kl": 0.0577392578125, |
| "learning_rate": 8.287499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8693535327911377, |
| "reward_std": 0.10830429336056113, |
| "rewards/answer_entity_reward": 0.9776785671710968, |
| "rewards/answer_wer_reward": 0.8930677771568298, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986072480678558, |
| "step": 138 |
| }, |
| { |
| "completion_length": 220.5, |
| "epoch": 0.4448, |
| "grad_norm": 3.627920150756836, |
| "kl": 0.0648193359375, |
| "learning_rate": 8.275e-07, |
| "loss": 0.0006, |
| "reward": 3.779549479484558, |
| "reward_std": 0.04976406879723072, |
| "rewards/answer_entity_reward": 0.9892225861549377, |
| "rewards/answer_wer_reward": 0.8991544246673584, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.891172468662262, |
| "step": 139 |
| }, |
| { |
| "completion_length": 214.375, |
| "epoch": 0.448, |
| "grad_norm": 1.0832712650299072, |
| "kl": 0.0511474609375, |
| "learning_rate": 8.2625e-07, |
| "loss": 0.0005, |
| "reward": 3.866790771484375, |
| "reward_std": 0.03637353144586086, |
| "rewards/answer_entity_reward": 0.9854166805744171, |
| "rewards/answer_wer_reward": 0.8826328217983246, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987412095069885, |
| "step": 140 |
| }, |
| { |
| "completion_length": 215.0, |
| "epoch": 0.4512, |
| "grad_norm": 4.865916728973389, |
| "kl": 0.080810546875, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.782729744911194, |
| "reward_std": 0.05014876648783684, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9396264553070068, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8483479917049408, |
| "step": 141 |
| }, |
| { |
| "completion_length": 235.1875, |
| "epoch": 0.4544, |
| "grad_norm": 3.832350730895996, |
| "kl": 0.0489501953125, |
| "learning_rate": 8.2375e-07, |
| "loss": 0.0005, |
| "reward": 3.8454935550689697, |
| "reward_std": 0.02625620225444436, |
| "rewards/answer_entity_reward": 0.9856643378734589, |
| "rewards/answer_wer_reward": 0.9073578715324402, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9524714052677155, |
| "step": 142 |
| }, |
| { |
| "completion_length": 193.5625, |
| "epoch": 0.4576, |
| "grad_norm": 1.5562162399291992, |
| "kl": 0.0836181640625, |
| "learning_rate": 8.225e-07, |
| "loss": 0.0008, |
| "reward": 3.8764915466308594, |
| "reward_std": 0.02105938969179988, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9281685054302216, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.952489823102951, |
| "step": 143 |
| }, |
| { |
| "completion_length": 192.5, |
| "epoch": 0.4608, |
| "grad_norm": 4.00892448425293, |
| "kl": 0.065185546875, |
| "learning_rate": 8.2125e-07, |
| "loss": 0.0007, |
| "reward": 3.9131712913513184, |
| "reward_std": 0.025579220615327358, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9231057167053223, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9945298135280609, |
| "step": 144 |
| }, |
| { |
| "completion_length": 222.5625, |
| "epoch": 0.464, |
| "grad_norm": 6.250589370727539, |
| "kl": 0.0546875, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8917945623397827, |
| "reward_std": 0.04113447107374668, |
| "rewards/answer_entity_reward": 0.9717775583267212, |
| "rewards/answer_wer_reward": 0.926241010427475, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9937759339809418, |
| "step": 145 |
| }, |
| { |
| "completion_length": 183.90625, |
| "epoch": 0.4672, |
| "grad_norm": 2.7752954959869385, |
| "kl": 0.0670166015625, |
| "learning_rate": 8.187499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.860864043235779, |
| "reward_std": 0.06173134222626686, |
| "rewards/answer_entity_reward": 0.9583333432674408, |
| "rewards/answer_wer_reward": 0.9120890200138092, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9904417395591736, |
| "step": 146 |
| }, |
| { |
| "completion_length": 220.78125, |
| "epoch": 0.4704, |
| "grad_norm": 3.0674679279327393, |
| "kl": 0.09912109375, |
| "learning_rate": 8.175e-07, |
| "loss": 0.001, |
| "reward": 3.84165620803833, |
| "reward_std": 0.03327286522835493, |
| "rewards/answer_entity_reward": 0.9452651739120483, |
| "rewards/answer_wer_reward": 0.8996314704418182, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996759682893753, |
| "step": 147 |
| }, |
| { |
| "completion_length": 162.8125, |
| "epoch": 0.4736, |
| "grad_norm": 4.559942245483398, |
| "kl": 0.116455078125, |
| "learning_rate": 8.1625e-07, |
| "loss": 0.0012, |
| "reward": 3.833083748817444, |
| "reward_std": 0.06737112812697887, |
| "rewards/answer_entity_reward": 0.9923878014087677, |
| "rewards/answer_wer_reward": 0.902847170829773, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9378487467765808, |
| "step": 148 |
| }, |
| { |
| "completion_length": 221.53125, |
| "epoch": 0.4768, |
| "grad_norm": 1.3157752752304077, |
| "kl": 0.052978515625, |
| "learning_rate": 8.149999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8545873165130615, |
| "reward_std": 0.019355260767042637, |
| "rewards/answer_entity_reward": 0.9938696324825287, |
| "rewards/answer_wer_reward": 0.8627510368824005, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979668259620667, |
| "step": 149 |
| }, |
| { |
| "completion_length": 233.1875, |
| "epoch": 0.48, |
| "grad_norm": 4.352514743804932, |
| "kl": 0.053955078125, |
| "learning_rate": 8.137499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8025535345077515, |
| "reward_std": 0.0806161779910326, |
| "rewards/answer_entity_reward": 0.9906516969203949, |
| "rewards/answer_wer_reward": 0.8636212348937988, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9482806921005249, |
| "step": 150 |
| }, |
| { |
| "completion_length": 210.03125, |
| "epoch": 0.4832, |
| "grad_norm": 1.3691778182983398, |
| "kl": 0.05615234375, |
| "learning_rate": 8.125e-07, |
| "loss": 0.0006, |
| "reward": 3.860105037689209, |
| "reward_std": 0.034908443689346313, |
| "rewards/answer_entity_reward": 0.9873737394809723, |
| "rewards/answer_wer_reward": 0.9285348653793335, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9441964328289032, |
| "step": 151 |
| }, |
| { |
| "completion_length": 244.625, |
| "epoch": 0.4864, |
| "grad_norm": 1.9329304695129395, |
| "kl": 0.058837890625, |
| "learning_rate": 8.1125e-07, |
| "loss": 0.0006, |
| "reward": 3.849783182144165, |
| "reward_std": 0.029241922311484814, |
| "rewards/answer_entity_reward": 0.9856617450714111, |
| "rewards/answer_wer_reward": 0.8688266575336456, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9952948093414307, |
| "step": 152 |
| }, |
| { |
| "completion_length": 174.375, |
| "epoch": 0.4896, |
| "grad_norm": 5.655167579650879, |
| "kl": 0.067138671875, |
| "learning_rate": 8.1e-07, |
| "loss": 0.0007, |
| "reward": 3.85835599899292, |
| "reward_std": 0.1141166789457202, |
| "rewards/answer_entity_reward": 0.9663461446762085, |
| "rewards/answer_wer_reward": 0.9284006357192993, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9636091589927673, |
| "step": 153 |
| }, |
| { |
| "completion_length": 185.875, |
| "epoch": 0.4928, |
| "grad_norm": 4.543191432952881, |
| "kl": 0.084716796875, |
| "learning_rate": 8.087499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.851526975631714, |
| "reward_std": 0.0990656241774559, |
| "rewards/answer_entity_reward": 0.9646950364112854, |
| "rewards/answer_wer_reward": 0.9213105142116547, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.965521514415741, |
| "step": 154 |
| }, |
| { |
| "completion_length": 209.0625, |
| "epoch": 0.496, |
| "grad_norm": 2.554072380065918, |
| "kl": 0.0572509765625, |
| "learning_rate": 8.075e-07, |
| "loss": 0.0006, |
| "reward": 3.790269613265991, |
| "reward_std": 0.048579949885606766, |
| "rewards/answer_entity_reward": 0.9870130121707916, |
| "rewards/answer_wer_reward": 0.8052773177623749, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979791641235352, |
| "step": 155 |
| }, |
| { |
| "completion_length": 211.96875, |
| "epoch": 0.4992, |
| "grad_norm": 2.762598991394043, |
| "kl": 0.0498046875, |
| "learning_rate": 8.0625e-07, |
| "loss": 0.0005, |
| "reward": 3.9201120138168335, |
| "reward_std": 0.014579844661056995, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9228614568710327, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9972506165504456, |
| "step": 156 |
| }, |
| { |
| "completion_length": 206.59375, |
| "epoch": 0.5024, |
| "grad_norm": 1.9372365474700928, |
| "kl": 0.0621337890625, |
| "learning_rate": 8.05e-07, |
| "loss": 0.0006, |
| "reward": 3.5673259496688843, |
| "reward_std": 0.028257974423468113, |
| "rewards/answer_entity_reward": 0.9902146458625793, |
| "rewards/answer_wer_reward": 0.758561909198761, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.818549245595932, |
| "step": 157 |
| }, |
| { |
| "completion_length": 213.15625, |
| "epoch": 0.5056, |
| "grad_norm": 2.594701051712036, |
| "kl": 0.08203125, |
| "learning_rate": 8.037499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.8647842407226562, |
| "reward_std": 0.029484061524271965, |
| "rewards/answer_entity_reward": 0.9847756624221802, |
| "rewards/answer_wer_reward": 0.8839923739433289, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9960162043571472, |
| "step": 158 |
| }, |
| { |
| "completion_length": 196.34375, |
| "epoch": 0.5088, |
| "grad_norm": 3.0164191722869873, |
| "kl": 0.0526123046875, |
| "learning_rate": 8.024999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8759838342666626, |
| "reward_std": 0.04202751815319061, |
| "rewards/answer_entity_reward": 0.9789772629737854, |
| "rewards/answer_wer_reward": 0.9108568131923676, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9861496686935425, |
| "step": 159 |
| }, |
| { |
| "completion_length": 198.0, |
| "epoch": 0.512, |
| "grad_norm": 5.223659515380859, |
| "kl": 0.07177734375, |
| "learning_rate": 8.0125e-07, |
| "loss": 0.0007, |
| "reward": 3.8265939950942993, |
| "reward_std": 0.04291579592972994, |
| "rewards/answer_entity_reward": 0.9890734255313873, |
| "rewards/answer_wer_reward": 0.8892558217048645, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9482647478580475, |
| "step": 160 |
| }, |
| { |
| "completion_length": 184.0, |
| "epoch": 0.5152, |
| "grad_norm": 2.4279987812042236, |
| "kl": 0.0914306640625, |
| "learning_rate": 8e-07, |
| "loss": 0.0009, |
| "reward": 3.8738738298416138, |
| "reward_std": 0.049739884212613106, |
| "rewards/answer_entity_reward": 0.9671474397182465, |
| "rewards/answer_wer_reward": 0.9240702688694, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9826560020446777, |
| "step": 161 |
| }, |
| { |
| "completion_length": 250.96875, |
| "epoch": 0.5184, |
| "grad_norm": 1.4533754587173462, |
| "kl": 0.047607421875, |
| "learning_rate": 7.9875e-07, |
| "loss": 0.0005, |
| "reward": 3.9009437561035156, |
| "reward_std": 0.03131024446338415, |
| "rewards/answer_entity_reward": 0.9899475276470184, |
| "rewards/answer_wer_reward": 0.9109963178634644, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 162 |
| }, |
| { |
| "completion_length": 206.5, |
| "epoch": 0.5216, |
| "grad_norm": 10.05416202545166, |
| "kl": 0.1258544921875, |
| "learning_rate": 7.975e-07, |
| "loss": 0.0013, |
| "reward": 3.6952139139175415, |
| "reward_std": 0.08068067952990532, |
| "rewards/answer_entity_reward": 0.9906517267227173, |
| "rewards/answer_wer_reward": 0.9191368222236633, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.785425454378128, |
| "step": 163 |
| }, |
| { |
| "completion_length": 207.71875, |
| "epoch": 0.5248, |
| "grad_norm": 5.6498823165893555, |
| "kl": 0.0572509765625, |
| "learning_rate": 7.9625e-07, |
| "loss": 0.0006, |
| "reward": 3.862972855567932, |
| "reward_std": 0.05051150266081095, |
| "rewards/answer_entity_reward": 0.9871794879436493, |
| "rewards/answer_wer_reward": 0.8966725766658783, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9791209697723389, |
| "step": 164 |
| }, |
| { |
| "completion_length": 231.8125, |
| "epoch": 0.528, |
| "grad_norm": 2.2680246829986572, |
| "kl": 0.0731201171875, |
| "learning_rate": 7.95e-07, |
| "loss": 0.0007, |
| "reward": 3.845450758934021, |
| "reward_std": 0.04592973738908768, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.8566094040870667, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984567761421204, |
| "step": 165 |
| }, |
| { |
| "completion_length": 218.0, |
| "epoch": 0.5312, |
| "grad_norm": 1.194057583808899, |
| "kl": 0.046630859375, |
| "learning_rate": 7.937499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.900430679321289, |
| "reward_std": 0.01787347625941038, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.907353401184082, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9930772483348846, |
| "step": 166 |
| }, |
| { |
| "completion_length": 212.25, |
| "epoch": 0.5344, |
| "grad_norm": 1.999778389930725, |
| "kl": 0.07568359375, |
| "learning_rate": 7.924999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.885169267654419, |
| "reward_std": 0.01909848116338253, |
| "rewards/answer_entity_reward": 0.9869123697280884, |
| "rewards/answer_wer_reward": 0.8992542028427124, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990026652812958, |
| "step": 167 |
| }, |
| { |
| "completion_length": 222.65625, |
| "epoch": 0.5376, |
| "grad_norm": 1.8001956939697266, |
| "kl": 0.03814697265625, |
| "learning_rate": 7.912499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8382192850112915, |
| "reward_std": 0.12780769122764468, |
| "rewards/answer_entity_reward": 0.9684826135635376, |
| "rewards/answer_wer_reward": 0.8702490329742432, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994876980781555, |
| "step": 168 |
| }, |
| { |
| "completion_length": 181.28125, |
| "epoch": 0.5408, |
| "grad_norm": 1.3718982934951782, |
| "kl": 0.072509765625, |
| "learning_rate": 7.9e-07, |
| "loss": 0.0007, |
| "reward": 3.743025064468384, |
| "reward_std": 0.02209018263965845, |
| "rewards/answer_entity_reward": 0.9875437021255493, |
| "rewards/answer_wer_reward": 0.8102038502693176, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9452773928642273, |
| "step": 169 |
| }, |
| { |
| "completion_length": 231.3125, |
| "epoch": 0.544, |
| "grad_norm": 3.8252196311950684, |
| "kl": 0.087890625, |
| "learning_rate": 7.8875e-07, |
| "loss": 0.0009, |
| "reward": 3.855069398880005, |
| "reward_std": 0.12723926454782486, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8895151615142822, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9655542373657227, |
| "step": 170 |
| }, |
| { |
| "completion_length": 246.6875, |
| "epoch": 0.5472, |
| "grad_norm": 1.4238818883895874, |
| "kl": 0.089599609375, |
| "learning_rate": 7.875e-07, |
| "loss": 0.0009, |
| "reward": 3.8392333984375, |
| "reward_std": 0.055684901773929596, |
| "rewards/answer_entity_reward": 0.9753443002700806, |
| "rewards/answer_wer_reward": 0.866324782371521, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975642561912537, |
| "step": 171 |
| }, |
| { |
| "completion_length": 239.09375, |
| "epoch": 0.5504, |
| "grad_norm": 2.5418362617492676, |
| "kl": 0.07421875, |
| "learning_rate": 7.8625e-07, |
| "loss": 0.0007, |
| "reward": 3.7556768655776978, |
| "reward_std": 0.026184914633631706, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.8859277367591858, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8721528947353363, |
| "step": 172 |
| }, |
| { |
| "completion_length": 197.53125, |
| "epoch": 0.5536, |
| "grad_norm": 2.2901041507720947, |
| "kl": 0.0523681640625, |
| "learning_rate": 7.85e-07, |
| "loss": 0.0005, |
| "reward": 3.7119585275650024, |
| "reward_std": 0.14428242854773998, |
| "rewards/answer_entity_reward": 0.8789682686328888, |
| "rewards/answer_wer_reward": 0.8524789810180664, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9805113673210144, |
| "step": 173 |
| }, |
| { |
| "completion_length": 271.65625, |
| "epoch": 0.5568, |
| "grad_norm": 1.5335708856582642, |
| "kl": 0.048095703125, |
| "learning_rate": 7.837499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8789494037628174, |
| "reward_std": 0.03688232973217964, |
| "rewards/answer_entity_reward": 0.9724817276000977, |
| "rewards/answer_wer_reward": 0.9107584953308105, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9957091808319092, |
| "step": 174 |
| }, |
| { |
| "completion_length": 197.40625, |
| "epoch": 0.56, |
| "grad_norm": 2.6528756618499756, |
| "kl": 0.074462890625, |
| "learning_rate": 7.824999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.857820510864258, |
| "reward_std": 0.03826703131198883, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.8975639641284943, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9665697515010834, |
| "step": 175 |
| }, |
| { |
| "completion_length": 200.15625, |
| "epoch": 0.5632, |
| "grad_norm": 5.963916301727295, |
| "kl": 0.054443359375, |
| "learning_rate": 7.812499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.864750027656555, |
| "reward_std": 0.028456556610763073, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9234411716461182, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9447809159755707, |
| "step": 176 |
| }, |
| { |
| "completion_length": 220.25, |
| "epoch": 0.5664, |
| "grad_norm": 1.086248517036438, |
| "kl": 0.07421875, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.85122811794281, |
| "reward_std": 0.02548269461840391, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.9126598238945007, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9444444477558136, |
| "step": 177 |
| }, |
| { |
| "completion_length": 235.6875, |
| "epoch": 0.5696, |
| "grad_norm": 3.8478362560272217, |
| "kl": 0.080810546875, |
| "learning_rate": 7.787500000000001e-07, |
| "loss": 0.0008, |
| "reward": 3.8555444478988647, |
| "reward_std": 0.03297184593975544, |
| "rewards/answer_entity_reward": 0.991346150636673, |
| "rewards/answer_wer_reward": 0.8777507543563843, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9864475131034851, |
| "step": 178 |
| }, |
| { |
| "completion_length": 210.6875, |
| "epoch": 0.5728, |
| "grad_norm": 2.200871706008911, |
| "kl": 0.096923828125, |
| "learning_rate": 7.775e-07, |
| "loss": 0.001, |
| "reward": 3.8970987796783447, |
| "reward_std": 0.029029657132923603, |
| "rewards/answer_entity_reward": 0.9676088094711304, |
| "rewards/answer_wer_reward": 0.9392231702804565, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9902668297290802, |
| "step": 179 |
| }, |
| { |
| "completion_length": 202.375, |
| "epoch": 0.576, |
| "grad_norm": 3.42965030670166, |
| "kl": 0.080078125, |
| "learning_rate": 7.7625e-07, |
| "loss": 0.0008, |
| "reward": 3.7469061613082886, |
| "reward_std": 0.08900729566812515, |
| "rewards/answer_entity_reward": 0.9832702279090881, |
| "rewards/answer_wer_reward": 0.8798384070396423, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8837975263595581, |
| "step": 180 |
| }, |
| { |
| "completion_length": 215.875, |
| "epoch": 0.5792, |
| "grad_norm": 2.5457639694213867, |
| "kl": 0.0595703125, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0006, |
| "reward": 3.8780597448349, |
| "reward_std": 0.04192608781158924, |
| "rewards/answer_entity_reward": 0.9845328330993652, |
| "rewards/answer_wer_reward": 0.89576256275177, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977642893791199, |
| "step": 181 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 0.5824, |
| "grad_norm": 1.3624567985534668, |
| "kl": 0.07177734375, |
| "learning_rate": 7.7375e-07, |
| "loss": 0.0007, |
| "reward": 3.8805158138275146, |
| "reward_std": 0.016396815422922373, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9136685729026794, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9692510068416595, |
| "step": 182 |
| }, |
| { |
| "completion_length": 215.90625, |
| "epoch": 0.5856, |
| "grad_norm": 1.270873785018921, |
| "kl": 0.0543212890625, |
| "learning_rate": 7.724999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8749226331710815, |
| "reward_std": 0.020629468373954296, |
| "rewards/answer_entity_reward": 0.985921710729599, |
| "rewards/answer_wer_reward": 0.8920559883117676, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969449043273926, |
| "step": 183 |
| }, |
| { |
| "completion_length": 230.34375, |
| "epoch": 0.5888, |
| "grad_norm": 5.295412063598633, |
| "kl": 0.0489501953125, |
| "learning_rate": 7.712499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8914437294006348, |
| "reward_std": 0.053787765093147755, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9137877225875854, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9811282753944397, |
| "step": 184 |
| }, |
| { |
| "completion_length": 238.03125, |
| "epoch": 0.592, |
| "grad_norm": 3.6382017135620117, |
| "kl": 0.05126953125, |
| "learning_rate": 7.699999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.80574893951416, |
| "reward_std": 0.031003179028630257, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.8504349291324615, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9594806730747223, |
| "step": 185 |
| }, |
| { |
| "completion_length": 133.46875, |
| "epoch": 0.5952, |
| "grad_norm": 5.556273937225342, |
| "kl": 0.06884765625, |
| "learning_rate": 7.6875e-07, |
| "loss": 0.0007, |
| "reward": 3.875786066055298, |
| "reward_std": 0.014059089124202728, |
| "rewards/answer_entity_reward": 0.9772727489471436, |
| "rewards/answer_wer_reward": 0.9379938840866089, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9605194628238678, |
| "step": 186 |
| }, |
| { |
| "completion_length": 233.96875, |
| "epoch": 0.5984, |
| "grad_norm": 1.1566299200057983, |
| "kl": 0.0654296875, |
| "learning_rate": 7.675e-07, |
| "loss": 0.0007, |
| "reward": 3.8272093534469604, |
| "reward_std": 0.056231189519166946, |
| "rewards/answer_entity_reward": 0.9821289777755737, |
| "rewards/answer_wer_reward": 0.87700355052948, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9680766761302948, |
| "step": 187 |
| }, |
| { |
| "completion_length": 223.21875, |
| "epoch": 0.6016, |
| "grad_norm": 1.125300407409668, |
| "kl": 0.0433349609375, |
| "learning_rate": 7.6625e-07, |
| "loss": 0.0004, |
| "reward": 3.9091583490371704, |
| "reward_std": 0.019687645137310028, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.917988508939743, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9946419894695282, |
| "step": 188 |
| }, |
| { |
| "completion_length": 213.03125, |
| "epoch": 0.6048, |
| "grad_norm": 1.806405782699585, |
| "kl": 0.05859375, |
| "learning_rate": 7.65e-07, |
| "loss": 0.0006, |
| "reward": 3.9139894247055054, |
| "reward_std": 0.01741368416696787, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.916355162858963, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9976341724395752, |
| "step": 189 |
| }, |
| { |
| "completion_length": 246.6875, |
| "epoch": 0.608, |
| "grad_norm": 2.158470630645752, |
| "kl": 0.05224609375, |
| "learning_rate": 7.6375e-07, |
| "loss": 0.0005, |
| "reward": 3.9092923402786255, |
| "reward_std": 0.019907254725694656, |
| "rewards/answer_entity_reward": 0.9944444596767426, |
| "rewards/answer_wer_reward": 0.9189584851264954, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9958893954753876, |
| "step": 190 |
| }, |
| { |
| "completion_length": 197.71875, |
| "epoch": 0.6112, |
| "grad_norm": 0.8463873863220215, |
| "kl": 0.0526123046875, |
| "learning_rate": 7.624999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.7934869527816772, |
| "reward_std": 0.010684152133762836, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.8017330169677734, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989654421806335, |
| "step": 191 |
| }, |
| { |
| "completion_length": 253.03125, |
| "epoch": 0.6144, |
| "grad_norm": 0.95602947473526, |
| "kl": 0.0577392578125, |
| "learning_rate": 7.612499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8714359998703003, |
| "reward_std": 0.03730391897261143, |
| "rewards/answer_entity_reward": 0.9679293036460876, |
| "rewards/answer_wer_reward": 0.9067506790161133, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9967561364173889, |
| "step": 192 |
| }, |
| { |
| "completion_length": 260.875, |
| "epoch": 0.6176, |
| "grad_norm": 1.752991795539856, |
| "kl": 0.1259765625, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0013, |
| "reward": 3.847132444381714, |
| "reward_std": 0.03724599629640579, |
| "rewards/answer_entity_reward": 0.9814560413360596, |
| "rewards/answer_wer_reward": 0.877534031867981, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9881424605846405, |
| "step": 193 |
| }, |
| { |
| "completion_length": 224.8125, |
| "epoch": 0.6208, |
| "grad_norm": 5.3836283683776855, |
| "kl": 0.0616455078125, |
| "learning_rate": 7.5875e-07, |
| "loss": 0.0006, |
| "reward": 3.838170886039734, |
| "reward_std": 0.043032409623265266, |
| "rewards/answer_entity_reward": 0.9778589308261871, |
| "rewards/answer_wer_reward": 0.8835411667823792, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9767708480358124, |
| "step": 194 |
| }, |
| { |
| "completion_length": 234.0, |
| "epoch": 0.624, |
| "grad_norm": 1.4531170129776, |
| "kl": 0.082763671875, |
| "learning_rate": 7.575e-07, |
| "loss": 0.0008, |
| "reward": 3.8195607662200928, |
| "reward_std": 0.06634793058037758, |
| "rewards/answer_entity_reward": 0.9759862422943115, |
| "rewards/answer_wer_reward": 0.8854676187038422, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9581069052219391, |
| "step": 195 |
| }, |
| { |
| "completion_length": 228.875, |
| "epoch": 0.6272, |
| "grad_norm": 1.215409278869629, |
| "kl": 0.0653076171875, |
| "learning_rate": 7.5625e-07, |
| "loss": 0.0006, |
| "reward": 3.869178295135498, |
| "reward_std": 0.018243765458464622, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9173910617828369, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9603100121021271, |
| "step": 196 |
| }, |
| { |
| "completion_length": 233.40625, |
| "epoch": 0.6304, |
| "grad_norm": 1.5224462747573853, |
| "kl": 0.0479736328125, |
| "learning_rate": 7.55e-07, |
| "loss": 0.0005, |
| "reward": 3.880965232849121, |
| "reward_std": 0.030376747716218233, |
| "rewards/answer_entity_reward": 0.9812500178813934, |
| "rewards/answer_wer_reward": 0.903846025466919, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9958691298961639, |
| "step": 197 |
| }, |
| { |
| "completion_length": 159.75, |
| "epoch": 0.6336, |
| "grad_norm": 2.0013957023620605, |
| "kl": 0.072021484375, |
| "learning_rate": 7.5375e-07, |
| "loss": 0.0007, |
| "reward": 3.8514485359191895, |
| "reward_std": 0.021021784283220768, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9317480027675629, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9282233119010925, |
| "step": 198 |
| }, |
| { |
| "completion_length": 200.125, |
| "epoch": 0.6368, |
| "grad_norm": 7.399294853210449, |
| "kl": 0.0662841796875, |
| "learning_rate": 7.524999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9170095920562744, |
| "reward_std": 0.03030287381261587, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.955333948135376, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9616756439208984, |
| "step": 199 |
| }, |
| { |
| "completion_length": 228.75, |
| "epoch": 0.64, |
| "grad_norm": 1.6671867370605469, |
| "kl": 0.13623046875, |
| "learning_rate": 7.512499999999999e-07, |
| "loss": 0.0014, |
| "reward": 3.848036050796509, |
| "reward_std": 0.14389772480353713, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9240660667419434, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9665836989879608, |
| "step": 200 |
| }, |
| { |
| "completion_length": 209.90625, |
| "epoch": 0.6432, |
| "grad_norm": 1.2796622514724731, |
| "kl": 0.05029296875, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0005, |
| "reward": 3.856316566467285, |
| "reward_std": 0.025415225885808468, |
| "rewards/answer_entity_reward": 0.9718458652496338, |
| "rewards/answer_wer_reward": 0.8857261538505554, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987446367740631, |
| "step": 201 |
| }, |
| { |
| "completion_length": 203.1875, |
| "epoch": 0.6464, |
| "grad_norm": 6.9469380378723145, |
| "kl": 0.05810546875, |
| "learning_rate": 7.4875e-07, |
| "loss": 0.0006, |
| "reward": 3.7580385208129883, |
| "reward_std": 0.0333370678126812, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.8357867002487183, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9279336631298065, |
| "step": 202 |
| }, |
| { |
| "completion_length": 211.5, |
| "epoch": 0.6496, |
| "grad_norm": 2.437093496322632, |
| "kl": 0.0400390625, |
| "learning_rate": 7.475e-07, |
| "loss": 0.0004, |
| "reward": 3.888434052467346, |
| "reward_std": 0.04942548694089055, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.901074230670929, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977766275405884, |
| "step": 203 |
| }, |
| { |
| "completion_length": 220.3125, |
| "epoch": 0.6528, |
| "grad_norm": 9.914649963378906, |
| "kl": 0.054443359375, |
| "learning_rate": 7.4625e-07, |
| "loss": 0.0005, |
| "reward": 3.9074004888534546, |
| "reward_std": 0.022341615986078978, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.924115002155304, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9861262738704681, |
| "step": 204 |
| }, |
| { |
| "completion_length": 190.28125, |
| "epoch": 0.656, |
| "grad_norm": 10.771315574645996, |
| "kl": 0.0731201171875, |
| "learning_rate": 7.45e-07, |
| "loss": 0.0007, |
| "reward": 3.8562848567962646, |
| "reward_std": 0.05522243678569794, |
| "rewards/answer_entity_reward": 0.9873949587345123, |
| "rewards/answer_wer_reward": 0.9283336997032166, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9405562579631805, |
| "step": 205 |
| }, |
| { |
| "completion_length": 254.375, |
| "epoch": 0.6592, |
| "grad_norm": 1.2101417779922485, |
| "kl": 0.054443359375, |
| "learning_rate": 7.4375e-07, |
| "loss": 0.0005, |
| "reward": 3.9058661460876465, |
| "reward_std": 0.015844878274947405, |
| "rewards/answer_entity_reward": 0.9788995683193207, |
| "rewards/answer_wer_reward": 0.9304846525192261, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9964818954467773, |
| "step": 206 |
| }, |
| { |
| "completion_length": 202.96875, |
| "epoch": 0.6624, |
| "grad_norm": 3.355869770050049, |
| "kl": 0.0572509765625, |
| "learning_rate": 7.425e-07, |
| "loss": 0.0006, |
| "reward": 3.8065719604492188, |
| "reward_std": 0.19051394425332546, |
| "rewards/answer_entity_reward": 0.9650735259056091, |
| "rewards/answer_wer_reward": 0.8801510035991669, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9925974309444427, |
| "step": 207 |
| }, |
| { |
| "completion_length": 226.1875, |
| "epoch": 0.6656, |
| "grad_norm": 1.7292360067367554, |
| "kl": 0.104248046875, |
| "learning_rate": 7.412499999999999e-07, |
| "loss": 0.001, |
| "reward": 3.8113776445388794, |
| "reward_std": 0.02462965715676546, |
| "rewards/answer_entity_reward": 0.9770916700363159, |
| "rewards/answer_wer_reward": 0.864607959985733, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9696778357028961, |
| "step": 208 |
| }, |
| { |
| "completion_length": 198.75, |
| "epoch": 0.6688, |
| "grad_norm": 4.215091705322266, |
| "kl": 0.06640625, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0007, |
| "reward": 3.8144696950912476, |
| "reward_std": 0.025187399238348007, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9298737645149231, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8845959007740021, |
| "step": 209 |
| }, |
| { |
| "completion_length": 200.1875, |
| "epoch": 0.672, |
| "grad_norm": 1.537361979484558, |
| "kl": 0.049560546875, |
| "learning_rate": 7.3875e-07, |
| "loss": 0.0005, |
| "reward": 3.9332594871520996, |
| "reward_std": 0.011271146591752768, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.951434314250946, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9818252325057983, |
| "step": 210 |
| }, |
| { |
| "completion_length": 190.78125, |
| "epoch": 0.6752, |
| "grad_norm": 2.9701907634735107, |
| "kl": 0.0654296875, |
| "learning_rate": 7.375e-07, |
| "loss": 0.0007, |
| "reward": 3.8168801069259644, |
| "reward_std": 0.024646650068461895, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9553571939468384, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8700454831123352, |
| "step": 211 |
| }, |
| { |
| "completion_length": 157.59375, |
| "epoch": 0.6784, |
| "grad_norm": 3.1656010150909424, |
| "kl": 0.0611572265625, |
| "learning_rate": 7.362499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8838521242141724, |
| "reward_std": 0.0407260712236166, |
| "rewards/answer_entity_reward": 0.9767543971538544, |
| "rewards/answer_wer_reward": 0.944227010011673, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9628707766532898, |
| "step": 212 |
| }, |
| { |
| "completion_length": 238.1875, |
| "epoch": 0.6816, |
| "grad_norm": 2.614816665649414, |
| "kl": 0.0947265625, |
| "learning_rate": 7.35e-07, |
| "loss": 0.0009, |
| "reward": 3.8542829751968384, |
| "reward_std": 0.03231436479836702, |
| "rewards/answer_entity_reward": 0.974577009677887, |
| "rewards/answer_wer_reward": 0.8831658661365509, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9965401291847229, |
| "step": 213 |
| }, |
| { |
| "completion_length": 255.0, |
| "epoch": 0.6848, |
| "grad_norm": 1.8072490692138672, |
| "kl": 0.048828125, |
| "learning_rate": 7.3375e-07, |
| "loss": 0.0005, |
| "reward": 3.9139556884765625, |
| "reward_std": 0.013969901017844677, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9155895113945007, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983660876750946, |
| "step": 214 |
| }, |
| { |
| "completion_length": 163.96875, |
| "epoch": 0.688, |
| "grad_norm": 3.6364543437957764, |
| "kl": 0.082763671875, |
| "learning_rate": 7.325e-07, |
| "loss": 0.0008, |
| "reward": 3.8950713872909546, |
| "reward_std": 0.030674483627080917, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9427915513515472, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.959224134683609, |
| "step": 215 |
| }, |
| { |
| "completion_length": 211.90625, |
| "epoch": 0.6912, |
| "grad_norm": 1.4036628007888794, |
| "kl": 0.0504150390625, |
| "learning_rate": 7.312499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.90190052986145, |
| "reward_std": 0.028614184819161892, |
| "rewards/answer_entity_reward": 0.9636363685131073, |
| "rewards/answer_wer_reward": 0.9445142149925232, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9937500059604645, |
| "step": 216 |
| }, |
| { |
| "completion_length": 226.875, |
| "epoch": 0.6944, |
| "grad_norm": 1.5664644241333008, |
| "kl": 0.051025390625, |
| "learning_rate": 7.3e-07, |
| "loss": 0.0005, |
| "reward": 3.9051342010498047, |
| "reward_std": 0.023595476523041725, |
| "rewards/answer_entity_reward": 0.994463324546814, |
| "rewards/answer_wer_reward": 0.9128024578094482, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978685975074768, |
| "step": 217 |
| }, |
| { |
| "completion_length": 211.96875, |
| "epoch": 0.6976, |
| "grad_norm": 3.6565327644348145, |
| "kl": 0.0567626953125, |
| "learning_rate": 7.2875e-07, |
| "loss": 0.0006, |
| "reward": 3.920815348625183, |
| "reward_std": 0.026728018186986446, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.9297977983951569, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973307251930237, |
| "step": 218 |
| }, |
| { |
| "completion_length": 226.90625, |
| "epoch": 0.7008, |
| "grad_norm": 5.147249221801758, |
| "kl": 0.142333984375, |
| "learning_rate": 7.275e-07, |
| "loss": 0.0014, |
| "reward": 3.887997627258301, |
| "reward_std": 0.017563311383128166, |
| "rewards/answer_entity_reward": 0.9923513829708099, |
| "rewards/answer_wer_reward": 0.8966234028339386, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999022901058197, |
| "step": 219 |
| }, |
| { |
| "completion_length": 196.75, |
| "epoch": 0.704, |
| "grad_norm": 4.334951400756836, |
| "kl": 0.07958984375, |
| "learning_rate": 7.262499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.919954776763916, |
| "reward_std": 0.020561310462653637, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.922648161649704, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997106492519379, |
| "step": 220 |
| }, |
| { |
| "completion_length": 208.15625, |
| "epoch": 0.7072, |
| "grad_norm": 4.896883964538574, |
| "kl": 0.072509765625, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.8171916007995605, |
| "reward_std": 0.044522007927298546, |
| "rewards/answer_entity_reward": 0.9767857491970062, |
| "rewards/answer_wer_reward": 0.9031675159931183, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9372382760047913, |
| "step": 221 |
| }, |
| { |
| "completion_length": 197.71875, |
| "epoch": 0.7104, |
| "grad_norm": 1.9743766784667969, |
| "kl": 0.041259765625, |
| "learning_rate": 7.2375e-07, |
| "loss": 0.0004, |
| "reward": 3.9599783420562744, |
| "reward_std": 0.008235257118940353, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9602223634719849, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999755859375, |
| "step": 222 |
| }, |
| { |
| "completion_length": 181.84375, |
| "epoch": 0.7136, |
| "grad_norm": 6.57908296585083, |
| "kl": 0.07421875, |
| "learning_rate": 7.225e-07, |
| "loss": 0.0007, |
| "reward": 3.826643943786621, |
| "reward_std": 0.06298277154564857, |
| "rewards/answer_entity_reward": 0.9833333194255829, |
| "rewards/answer_wer_reward": 0.9450017511844635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8983088135719299, |
| "step": 223 |
| }, |
| { |
| "completion_length": 177.84375, |
| "epoch": 0.7168, |
| "grad_norm": 13.744032859802246, |
| "kl": 0.078369140625, |
| "learning_rate": 7.212499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.852834939956665, |
| "reward_std": 0.044052885845303535, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9374657571315765, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9177731275558472, |
| "step": 224 |
| }, |
| { |
| "completion_length": 249.3125, |
| "epoch": 0.72, |
| "grad_norm": 1.7395777702331543, |
| "kl": 0.05712890625, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0006, |
| "reward": 3.8659743070602417, |
| "reward_std": 0.03202287387102842, |
| "rewards/answer_entity_reward": 0.9767628312110901, |
| "rewards/answer_wer_reward": 0.8964802920818329, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9927313327789307, |
| "step": 225 |
| }, |
| { |
| "completion_length": 246.75, |
| "epoch": 0.7232, |
| "grad_norm": 1.1522554159164429, |
| "kl": 0.05419921875, |
| "learning_rate": 7.1875e-07, |
| "loss": 0.0005, |
| "reward": 3.868378758430481, |
| "reward_std": 0.02125831786543131, |
| "rewards/answer_entity_reward": 0.9791666567325592, |
| "rewards/answer_wer_reward": 0.8927575647830963, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9964545369148254, |
| "step": 226 |
| }, |
| { |
| "completion_length": 213.6875, |
| "epoch": 0.7264, |
| "grad_norm": 1.6328908205032349, |
| "kl": 0.0452880859375, |
| "learning_rate": 7.175e-07, |
| "loss": 0.0004, |
| "reward": 3.9461253881454468, |
| "reward_std": 0.017373798182234168, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9516011476516724, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989885687828064, |
| "step": 227 |
| }, |
| { |
| "completion_length": 180.25, |
| "epoch": 0.7296, |
| "grad_norm": 1.6245373487472534, |
| "kl": 0.0810546875, |
| "learning_rate": 7.1625e-07, |
| "loss": 0.0008, |
| "reward": 3.92253839969635, |
| "reward_std": 0.009518959443084896, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9421058893203735, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9832733571529388, |
| "step": 228 |
| }, |
| { |
| "completion_length": 211.46875, |
| "epoch": 0.7328, |
| "grad_norm": 2.3507907390594482, |
| "kl": 0.080078125, |
| "learning_rate": 7.149999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.9085057973861694, |
| "reward_std": 0.011625304818153381, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.91986945271492, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 229 |
| }, |
| { |
| "completion_length": 189.78125, |
| "epoch": 0.736, |
| "grad_norm": 2.801975965499878, |
| "kl": 0.068603515625, |
| "learning_rate": 7.137499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.849338173866272, |
| "reward_std": 0.04476720932871103, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9499310851097107, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.90181103348732, |
| "step": 230 |
| }, |
| { |
| "completion_length": 232.46875, |
| "epoch": 0.7392, |
| "grad_norm": 18.121028900146484, |
| "kl": 0.065673828125, |
| "learning_rate": 7.125e-07, |
| "loss": 0.0007, |
| "reward": 3.8422099351882935, |
| "reward_std": 0.05234749615192413, |
| "rewards/answer_entity_reward": 0.9829545617103577, |
| "rewards/answer_wer_reward": 0.8842452466487885, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9750101864337921, |
| "step": 231 |
| }, |
| { |
| "completion_length": 230.90625, |
| "epoch": 0.7424, |
| "grad_norm": 1.374346375465393, |
| "kl": 0.0440673828125, |
| "learning_rate": 7.1125e-07, |
| "loss": 0.0004, |
| "reward": 3.9123170375823975, |
| "reward_std": 0.025476250797510147, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9220384955406189, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9972230195999146, |
| "step": 232 |
| }, |
| { |
| "completion_length": 197.5625, |
| "epoch": 0.7456, |
| "grad_norm": 3.1081960201263428, |
| "kl": 0.067138671875, |
| "learning_rate": 7.1e-07, |
| "loss": 0.0007, |
| "reward": 3.921274781227112, |
| "reward_std": 0.04291347204707563, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9490483999252319, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9722263813018799, |
| "step": 233 |
| }, |
| { |
| "completion_length": 198.90625, |
| "epoch": 0.7488, |
| "grad_norm": 2.3603627681732178, |
| "kl": 0.0550537109375, |
| "learning_rate": 7.0875e-07, |
| "loss": 0.0005, |
| "reward": 3.9125137329101562, |
| "reward_std": 0.03855661302804947, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9429784715175629, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9747799634933472, |
| "step": 234 |
| }, |
| { |
| "completion_length": 220.71875, |
| "epoch": 0.752, |
| "grad_norm": 3.3247504234313965, |
| "kl": 0.070068359375, |
| "learning_rate": 7.075e-07, |
| "loss": 0.0007, |
| "reward": 3.877889394760132, |
| "reward_std": 0.03429079055786133, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9119226932525635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9659668207168579, |
| "step": 235 |
| }, |
| { |
| "completion_length": 194.21875, |
| "epoch": 0.7552, |
| "grad_norm": 5.20084810256958, |
| "kl": 0.067626953125, |
| "learning_rate": 7.0625e-07, |
| "loss": 0.0007, |
| "reward": 3.918747305870056, |
| "reward_std": 0.03475894033908844, |
| "rewards/answer_entity_reward": 0.9929924309253693, |
| "rewards/answer_wer_reward": 0.9448626041412354, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.980892151594162, |
| "step": 236 |
| }, |
| { |
| "completion_length": 222.53125, |
| "epoch": 0.7584, |
| "grad_norm": 3.0105435848236084, |
| "kl": 0.07421875, |
| "learning_rate": 7.049999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9244236946105957, |
| "reward_std": 0.010058181826025248, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9545913934707642, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9783551096916199, |
| "step": 237 |
| }, |
| { |
| "completion_length": 222.5625, |
| "epoch": 0.7616, |
| "grad_norm": 4.065408229827881, |
| "kl": 0.1181640625, |
| "learning_rate": 7.037499999999999e-07, |
| "loss": 0.0012, |
| "reward": 3.873254418373108, |
| "reward_std": 0.0757724829018116, |
| "rewards/answer_entity_reward": 0.9845328330993652, |
| "rewards/answer_wer_reward": 0.936627209186554, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9520943462848663, |
| "step": 238 |
| }, |
| { |
| "completion_length": 184.21875, |
| "epoch": 0.7648, |
| "grad_norm": 1.1628284454345703, |
| "kl": 0.0579833984375, |
| "learning_rate": 7.024999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9432320594787598, |
| "reward_std": 0.010221295058727264, |
| "rewards/answer_entity_reward": 0.9905790388584137, |
| "rewards/answer_wer_reward": 0.953954666852951, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998698353767395, |
| "step": 239 |
| }, |
| { |
| "completion_length": 233.90625, |
| "epoch": 0.768, |
| "grad_norm": 1.4767858982086182, |
| "kl": 0.079345703125, |
| "learning_rate": 7.0125e-07, |
| "loss": 0.0008, |
| "reward": 3.8955001831054688, |
| "reward_std": 0.03214742988348007, |
| "rewards/answer_entity_reward": 0.9854603707790375, |
| "rewards/answer_wer_reward": 0.9112924933433533, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987473487854004, |
| "step": 240 |
| }, |
| { |
| "completion_length": 176.53125, |
| "epoch": 0.7712, |
| "grad_norm": 5.655521869659424, |
| "kl": 0.0872802734375, |
| "learning_rate": 7e-07, |
| "loss": 0.0009, |
| "reward": 3.8957866430282593, |
| "reward_std": 0.02847579075023532, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9654708206653595, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9399311542510986, |
| "step": 241 |
| }, |
| { |
| "completion_length": 252.71875, |
| "epoch": 0.7744, |
| "grad_norm": 3.268174886703491, |
| "kl": 0.073486328125, |
| "learning_rate": 6.9875e-07, |
| "loss": 0.0007, |
| "reward": 3.8414435386657715, |
| "reward_std": 0.08019998762756586, |
| "rewards/answer_entity_reward": 0.9822989404201508, |
| "rewards/answer_wer_reward": 0.8909429609775543, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.968201756477356, |
| "step": 242 |
| }, |
| { |
| "completion_length": 234.875, |
| "epoch": 0.7776, |
| "grad_norm": 3.445681571960449, |
| "kl": 0.15869140625, |
| "learning_rate": 6.975e-07, |
| "loss": 0.0016, |
| "reward": 3.856196165084839, |
| "reward_std": 0.0546736940741539, |
| "rewards/answer_entity_reward": 0.9822468161582947, |
| "rewards/answer_wer_reward": 0.9020899534225464, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9718593955039978, |
| "step": 243 |
| }, |
| { |
| "completion_length": 186.375, |
| "epoch": 0.7808, |
| "grad_norm": 3.4756290912628174, |
| "kl": 0.109130859375, |
| "learning_rate": 6.9625e-07, |
| "loss": 0.0011, |
| "reward": 3.878678798675537, |
| "reward_std": 0.014406855218112469, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9121991693973541, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9693206548690796, |
| "step": 244 |
| }, |
| { |
| "completion_length": 224.46875, |
| "epoch": 0.784, |
| "grad_norm": 2.4778082370758057, |
| "kl": 0.0618896484375, |
| "learning_rate": 6.949999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.890427350997925, |
| "reward_std": 0.013088527135550976, |
| "rewards/answer_entity_reward": 0.9849699139595032, |
| "rewards/answer_wer_reward": 0.9565823972225189, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9488749206066132, |
| "step": 245 |
| }, |
| { |
| "completion_length": 220.0625, |
| "epoch": 0.7872, |
| "grad_norm": 1.7784525156021118, |
| "kl": 0.0592041015625, |
| "learning_rate": 6.937499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9208799600601196, |
| "reward_std": 0.013537504710257053, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9380317628383636, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9924636483192444, |
| "step": 246 |
| }, |
| { |
| "completion_length": 215.03125, |
| "epoch": 0.7904, |
| "grad_norm": 1.7845004796981812, |
| "kl": 0.087158203125, |
| "learning_rate": 6.924999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.874635100364685, |
| "reward_std": 0.047601671889424324, |
| "rewards/answer_entity_reward": 0.9777146875858307, |
| "rewards/answer_wer_reward": 0.9114454984664917, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9854750335216522, |
| "step": 247 |
| }, |
| { |
| "completion_length": 237.0, |
| "epoch": 0.7936, |
| "grad_norm": 1.9031370878219604, |
| "kl": 0.0665283203125, |
| "learning_rate": 6.9125e-07, |
| "loss": 0.0007, |
| "reward": 3.8799991607666016, |
| "reward_std": 0.040791427716612816, |
| "rewards/answer_entity_reward": 0.9725233018398285, |
| "rewards/answer_wer_reward": 0.9113976061344147, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9960784018039703, |
| "step": 248 |
| }, |
| { |
| "completion_length": 247.625, |
| "epoch": 0.7968, |
| "grad_norm": 6.799812316894531, |
| "kl": 0.5244140625, |
| "learning_rate": 6.9e-07, |
| "loss": 0.0052, |
| "reward": 3.9148751497268677, |
| "reward_std": 0.012524784076958895, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9155747294425964, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993003606796265, |
| "step": 249 |
| }, |
| { |
| "completion_length": 202.78125, |
| "epoch": 0.8, |
| "grad_norm": 2.9497642517089844, |
| "kl": 0.108642578125, |
| "learning_rate": 6.8875e-07, |
| "loss": 0.0011, |
| "reward": 3.88541841506958, |
| "reward_std": 0.05846460163593292, |
| "rewards/answer_entity_reward": 0.9898538887500763, |
| "rewards/answer_wer_reward": 0.9265855848789215, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.968979001045227, |
| "step": 250 |
| }, |
| { |
| "completion_length": 230.03125, |
| "epoch": 0.8032, |
| "grad_norm": 3.021209478378296, |
| "kl": 0.064453125, |
| "learning_rate": 6.875e-07, |
| "loss": 0.0006, |
| "reward": 3.9006909132003784, |
| "reward_std": 0.02151984628289938, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9085462689399719, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9956169128417969, |
| "step": 251 |
| }, |
| { |
| "completion_length": 202.0625, |
| "epoch": 0.8064, |
| "grad_norm": 3.288858413696289, |
| "kl": 0.0810546875, |
| "learning_rate": 6.8625e-07, |
| "loss": 0.0008, |
| "reward": 3.9228957891464233, |
| "reward_std": 0.012390648480504751, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9330424964427948, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9898532032966614, |
| "step": 252 |
| }, |
| { |
| "completion_length": 202.84375, |
| "epoch": 0.8096, |
| "grad_norm": 2.384650468826294, |
| "kl": 0.084228515625, |
| "learning_rate": 6.85e-07, |
| "loss": 0.0009, |
| "reward": 3.8598722219467163, |
| "reward_std": 0.03435686323791742, |
| "rewards/answer_entity_reward": 0.9775519669055939, |
| "rewards/answer_wer_reward": 0.9159774780273438, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9663428068161011, |
| "step": 253 |
| }, |
| { |
| "completion_length": 235.15625, |
| "epoch": 0.8128, |
| "grad_norm": 3.9519598484039307, |
| "kl": 0.061767578125, |
| "learning_rate": 6.837499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8161985874176025, |
| "reward_std": 0.06573762744665146, |
| "rewards/answer_entity_reward": 0.9905131459236145, |
| "rewards/answer_wer_reward": 0.8475149571895599, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9781705141067505, |
| "step": 254 |
| }, |
| { |
| "completion_length": 241.125, |
| "epoch": 0.816, |
| "grad_norm": 3.464174509048462, |
| "kl": 0.077392578125, |
| "learning_rate": 6.824999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.894362449645996, |
| "reward_std": 0.025215300731360912, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.9064654111862183, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998313695192337, |
| "step": 255 |
| }, |
| { |
| "completion_length": 177.59375, |
| "epoch": 0.8192, |
| "grad_norm": 1.5625709295272827, |
| "kl": 0.0986328125, |
| "learning_rate": 6.8125e-07, |
| "loss": 0.001, |
| "reward": 3.9517083168029785, |
| "reward_std": 0.01383261731825769, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9637933671474457, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9927226006984711, |
| "step": 256 |
| }, |
| { |
| "completion_length": 191.625, |
| "epoch": 0.8224, |
| "grad_norm": 1.4757704734802246, |
| "kl": 0.0791015625, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0008, |
| "reward": 3.8987783193588257, |
| "reward_std": 0.016407988965511322, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9312387406826019, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9699434041976929, |
| "step": 257 |
| }, |
| { |
| "completion_length": 191.65625, |
| "epoch": 0.8256, |
| "grad_norm": 3.355372428894043, |
| "kl": 0.09033203125, |
| "learning_rate": 6.7875e-07, |
| "loss": 0.0009, |
| "reward": 3.9129350185394287, |
| "reward_std": 0.015536424703896046, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9205312728881836, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979183673858643, |
| "step": 258 |
| }, |
| { |
| "completion_length": 188.65625, |
| "epoch": 0.8288, |
| "grad_norm": 1.917312741279602, |
| "kl": 0.086669921875, |
| "learning_rate": 6.775e-07, |
| "loss": 0.0009, |
| "reward": 3.918121814727783, |
| "reward_std": 0.0268348827958107, |
| "rewards/answer_entity_reward": 0.9890183508396149, |
| "rewards/answer_wer_reward": 0.9294547438621521, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996488690376282, |
| "step": 259 |
| }, |
| { |
| "completion_length": 234.0, |
| "epoch": 0.832, |
| "grad_norm": 1.334208369255066, |
| "kl": 0.0635986328125, |
| "learning_rate": 6.7625e-07, |
| "loss": 0.0006, |
| "reward": 3.924370527267456, |
| "reward_std": 0.02556901052594185, |
| "rewards/answer_entity_reward": 0.980710506439209, |
| "rewards/answer_wer_reward": 0.9436598718166351, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 260 |
| }, |
| { |
| "completion_length": 157.09375, |
| "epoch": 0.8352, |
| "grad_norm": 3.0484063625335693, |
| "kl": 0.093017578125, |
| "learning_rate": 6.75e-07, |
| "loss": 0.0009, |
| "reward": 3.928007483482361, |
| "reward_std": 0.01636551646515727, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9610774517059326, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.969333827495575, |
| "step": 261 |
| }, |
| { |
| "completion_length": 222.6875, |
| "epoch": 0.8384, |
| "grad_norm": 1.5266326665878296, |
| "kl": 0.110595703125, |
| "learning_rate": 6.737499999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.826764225959778, |
| "reward_std": 0.014424358261749148, |
| "rewards/answer_entity_reward": 0.875, |
| "rewards/answer_wer_reward": 0.9528080821037292, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989560842514038, |
| "step": 262 |
| }, |
| { |
| "completion_length": 245.96875, |
| "epoch": 0.8416, |
| "grad_norm": 2.332728624343872, |
| "kl": 0.0777587890625, |
| "learning_rate": 6.724999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.84222412109375, |
| "reward_std": 0.018232629168778658, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.8854961693286896, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9591319561004639, |
| "step": 263 |
| }, |
| { |
| "completion_length": 155.6875, |
| "epoch": 0.8448, |
| "grad_norm": 7.505854606628418, |
| "kl": 0.101806640625, |
| "learning_rate": 6.7125e-07, |
| "loss": 0.001, |
| "reward": 3.875036120414734, |
| "reward_std": 0.07785245403647423, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9335145354270935, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9436048865318298, |
| "step": 264 |
| }, |
| { |
| "completion_length": 249.75, |
| "epoch": 0.848, |
| "grad_norm": 2.8738133907318115, |
| "kl": 0.0516357421875, |
| "learning_rate": 6.7e-07, |
| "loss": 0.0005, |
| "reward": 3.903374195098877, |
| "reward_std": 0.014860059600323439, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9094418883323669, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987399280071259, |
| "step": 265 |
| }, |
| { |
| "completion_length": 199.34375, |
| "epoch": 0.8512, |
| "grad_norm": 8.186075210571289, |
| "kl": 0.074462890625, |
| "learning_rate": 6.6875e-07, |
| "loss": 0.0007, |
| "reward": 3.8564417362213135, |
| "reward_std": 0.06331180594861507, |
| "rewards/answer_entity_reward": 0.9917200803756714, |
| "rewards/answer_wer_reward": 0.9368169605731964, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9279046654701233, |
| "step": 266 |
| }, |
| { |
| "completion_length": 207.75, |
| "epoch": 0.8544, |
| "grad_norm": 1.7668160200119019, |
| "kl": 0.191650390625, |
| "learning_rate": 6.675e-07, |
| "loss": 0.0019, |
| "reward": 3.791893243789673, |
| "reward_std": 0.21384014189243317, |
| "rewards/answer_entity_reward": 0.9642857313156128, |
| "rewards/answer_wer_reward": 0.8976732790470123, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9611842036247253, |
| "step": 267 |
| }, |
| { |
| "completion_length": 232.78125, |
| "epoch": 0.8576, |
| "grad_norm": 3.357858180999756, |
| "kl": 0.0655517578125, |
| "learning_rate": 6.6625e-07, |
| "loss": 0.0006, |
| "reward": 3.849023461341858, |
| "reward_std": 0.07564813643693924, |
| "rewards/answer_entity_reward": 0.981249988079071, |
| "rewards/answer_wer_reward": 0.9172319173812866, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9505417346954346, |
| "step": 268 |
| }, |
| { |
| "completion_length": 159.90625, |
| "epoch": 0.8608, |
| "grad_norm": 8.665388107299805, |
| "kl": 0.083740234375, |
| "learning_rate": 6.65e-07, |
| "loss": 0.0008, |
| "reward": 3.8619388341903687, |
| "reward_std": 0.03842100687325001, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9294092357158661, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9369938969612122, |
| "step": 269 |
| }, |
| { |
| "completion_length": 199.09375, |
| "epoch": 0.864, |
| "grad_norm": 2.6412887573242188, |
| "kl": 0.24951171875, |
| "learning_rate": 6.637499999999999e-07, |
| "loss": 0.0025, |
| "reward": 3.92287015914917, |
| "reward_std": 0.04514491464942694, |
| "rewards/answer_entity_reward": 0.9867424070835114, |
| "rewards/answer_wer_reward": 0.948787659406662, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9873401820659637, |
| "step": 270 |
| }, |
| { |
| "completion_length": 138.625, |
| "epoch": 0.8672, |
| "grad_norm": 5.494461536407471, |
| "kl": 0.1064453125, |
| "learning_rate": 6.624999999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.80997896194458, |
| "reward_std": 0.10453111864626408, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9460954964160919, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8638834655284882, |
| "step": 271 |
| }, |
| { |
| "completion_length": 207.0625, |
| "epoch": 0.8704, |
| "grad_norm": 6.705058574676514, |
| "kl": 0.0904541015625, |
| "learning_rate": 6.6125e-07, |
| "loss": 0.0009, |
| "reward": 3.918370246887207, |
| "reward_std": 0.016086122021079063, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.94427290558815, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9765011072158813, |
| "step": 272 |
| }, |
| { |
| "completion_length": 193.46875, |
| "epoch": 0.8736, |
| "grad_norm": 3.6274845600128174, |
| "kl": 0.16259765625, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0016, |
| "reward": 3.8420186042785645, |
| "reward_std": 0.042743777856230736, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.88405841588974, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9579601883888245, |
| "step": 273 |
| }, |
| { |
| "completion_length": 238.03125, |
| "epoch": 0.8768, |
| "grad_norm": 39.40747833251953, |
| "kl": 0.064453125, |
| "learning_rate": 6.587499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8922038078308105, |
| "reward_std": 0.08438011445105076, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8997087776660919, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9924951493740082, |
| "step": 274 |
| }, |
| { |
| "completion_length": 215.03125, |
| "epoch": 0.88, |
| "grad_norm": 3.786466360092163, |
| "kl": 0.073974609375, |
| "learning_rate": 6.575e-07, |
| "loss": 0.0007, |
| "reward": 3.936691641807556, |
| "reward_std": 0.013240452855825424, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9393938779830933, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993811845779419, |
| "step": 275 |
| }, |
| { |
| "completion_length": 171.84375, |
| "epoch": 0.8832, |
| "grad_norm": 6.402861595153809, |
| "kl": 0.09619140625, |
| "learning_rate": 6.5625e-07, |
| "loss": 0.001, |
| "reward": 3.8171043395996094, |
| "reward_std": 0.07490862905979156, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9142147600650787, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.902889609336853, |
| "step": 276 |
| }, |
| { |
| "completion_length": 202.90625, |
| "epoch": 0.8864, |
| "grad_norm": 1.9027079343795776, |
| "kl": 0.07958984375, |
| "learning_rate": 6.55e-07, |
| "loss": 0.0008, |
| "reward": 3.910063624382019, |
| "reward_std": 0.014503994956612587, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9100635945796967, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 277 |
| }, |
| { |
| "completion_length": 194.90625, |
| "epoch": 0.8896, |
| "grad_norm": 3.430772304534912, |
| "kl": 0.10107421875, |
| "learning_rate": 6.5375e-07, |
| "loss": 0.001, |
| "reward": 3.9086241722106934, |
| "reward_std": 0.011167994700372219, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9395906329154968, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9690335094928741, |
| "step": 278 |
| }, |
| { |
| "completion_length": 214.21875, |
| "epoch": 0.8928, |
| "grad_norm": 1.209375262260437, |
| "kl": 0.07763671875, |
| "learning_rate": 6.524999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.934818387031555, |
| "reward_std": 0.013630851171910763, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9348185062408447, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 279 |
| }, |
| { |
| "completion_length": 210.375, |
| "epoch": 0.896, |
| "grad_norm": 3.4542951583862305, |
| "kl": 0.09619140625, |
| "learning_rate": 6.5125e-07, |
| "loss": 0.001, |
| "reward": 3.8483023643493652, |
| "reward_std": 0.022013184614479542, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9127626419067383, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.935539722442627, |
| "step": 280 |
| }, |
| { |
| "completion_length": 171.96875, |
| "epoch": 0.8992, |
| "grad_norm": 5.6723761558532715, |
| "kl": 0.138671875, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0014, |
| "reward": 3.894706964492798, |
| "reward_std": 0.01279338588938117, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9379555583000183, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9567514657974243, |
| "step": 281 |
| }, |
| { |
| "completion_length": 111.78125, |
| "epoch": 0.9024, |
| "grad_norm": 4.6447954177856445, |
| "kl": 0.1376953125, |
| "learning_rate": 6.4875e-07, |
| "loss": 0.0014, |
| "reward": 3.901338577270508, |
| "reward_std": 0.019952512811869383, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.978780597448349, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9253990054130554, |
| "step": 282 |
| }, |
| { |
| "completion_length": 244.96875, |
| "epoch": 0.9056, |
| "grad_norm": 2.825244665145874, |
| "kl": 0.0611572265625, |
| "learning_rate": 6.474999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9182543754577637, |
| "reward_std": 0.02383749559521675, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9259287714958191, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995370507240295, |
| "step": 283 |
| }, |
| { |
| "completion_length": 218.09375, |
| "epoch": 0.9088, |
| "grad_norm": 2.9246108531951904, |
| "kl": 0.0736083984375, |
| "learning_rate": 6.4625e-07, |
| "loss": 0.0007, |
| "reward": 3.9247629642486572, |
| "reward_std": 0.019582282286137342, |
| "rewards/answer_entity_reward": 0.9866071343421936, |
| "rewards/answer_wer_reward": 0.9388971030712128, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992588460445404, |
| "step": 284 |
| }, |
| { |
| "completion_length": 174.8125, |
| "epoch": 0.912, |
| "grad_norm": 1.4176238775253296, |
| "kl": 0.115478515625, |
| "learning_rate": 6.45e-07, |
| "loss": 0.0012, |
| "reward": 3.9359350204467773, |
| "reward_std": 0.01886278996244073, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9649160206317902, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9731022417545319, |
| "step": 285 |
| }, |
| { |
| "completion_length": 152.40625, |
| "epoch": 0.9152, |
| "grad_norm": 4.273341178894043, |
| "kl": 0.176025390625, |
| "learning_rate": 6.4375e-07, |
| "loss": 0.0018, |
| "reward": 3.850113034248352, |
| "reward_std": 0.07313014380633831, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9544805884361267, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8991046845912933, |
| "step": 286 |
| }, |
| { |
| "completion_length": 221.90625, |
| "epoch": 0.9184, |
| "grad_norm": 3.1975696086883545, |
| "kl": 0.083984375, |
| "learning_rate": 6.424999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.8276385068893433, |
| "reward_std": 0.019742398988455534, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.8953758776187897, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9379445612430573, |
| "step": 287 |
| }, |
| { |
| "completion_length": 203.1875, |
| "epoch": 0.9216, |
| "grad_norm": 4.396200180053711, |
| "kl": 0.1318359375, |
| "learning_rate": 6.4125e-07, |
| "loss": 0.0013, |
| "reward": 3.9295929670333862, |
| "reward_std": 0.022352089174091816, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9394927024841309, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973118305206299, |
| "step": 288 |
| }, |
| { |
| "completion_length": 188.34375, |
| "epoch": 0.9248, |
| "grad_norm": 23.72756004333496, |
| "kl": 0.098388671875, |
| "learning_rate": 6.4e-07, |
| "loss": 0.001, |
| "reward": 3.7452211380004883, |
| "reward_std": 0.12425664439797401, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8480645418167114, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8971565663814545, |
| "step": 289 |
| }, |
| { |
| "completion_length": 233.5625, |
| "epoch": 0.928, |
| "grad_norm": 1.2391304969787598, |
| "kl": 0.068603515625, |
| "learning_rate": 6.3875e-07, |
| "loss": 0.0007, |
| "reward": 3.8707345724105835, |
| "reward_std": 0.03127638017758727, |
| "rewards/answer_entity_reward": 0.989980161190033, |
| "rewards/answer_wer_reward": 0.8822586238384247, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984957575798035, |
| "step": 290 |
| }, |
| { |
| "completion_length": 176.8125, |
| "epoch": 0.9312, |
| "grad_norm": 3.8803555965423584, |
| "kl": 0.14697265625, |
| "learning_rate": 6.374999999999999e-07, |
| "loss": 0.0015, |
| "reward": 3.890373468399048, |
| "reward_std": 0.01580220554023981, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9229053854942322, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9674679934978485, |
| "step": 291 |
| }, |
| { |
| "completion_length": 249.9375, |
| "epoch": 0.9344, |
| "grad_norm": 1.001364827156067, |
| "kl": 0.08447265625, |
| "learning_rate": 6.362499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.8967798948287964, |
| "reward_std": 0.015075822360813618, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.8994384407997131, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994248449802399, |
| "step": 292 |
| }, |
| { |
| "completion_length": 192.125, |
| "epoch": 0.9376, |
| "grad_norm": 7.706722736358643, |
| "kl": 0.12255859375, |
| "learning_rate": 6.35e-07, |
| "loss": 0.0012, |
| "reward": 3.92827308177948, |
| "reward_std": 0.02050976036116481, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9391875863075256, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9890855252742767, |
| "step": 293 |
| }, |
| { |
| "completion_length": 235.125, |
| "epoch": 0.9408, |
| "grad_norm": 1.723900556564331, |
| "kl": 0.0587158203125, |
| "learning_rate": 6.3375e-07, |
| "loss": 0.0006, |
| "reward": 3.9498140811920166, |
| "reward_std": 0.012220169650390744, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9532225430011749, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99842968583107, |
| "step": 294 |
| }, |
| { |
| "completion_length": 205.65625, |
| "epoch": 0.944, |
| "grad_norm": 5.019091606140137, |
| "kl": 0.092041015625, |
| "learning_rate": 6.324999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.72371768951416, |
| "reward_std": 0.03362658293917775, |
| "rewards/answer_entity_reward": 0.988194465637207, |
| "rewards/answer_wer_reward": 0.8201212882995605, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9154019355773926, |
| "step": 295 |
| }, |
| { |
| "completion_length": 229.78125, |
| "epoch": 0.9472, |
| "grad_norm": 2.4262614250183105, |
| "kl": 0.07763671875, |
| "learning_rate": 6.3125e-07, |
| "loss": 0.0008, |
| "reward": 3.9112552404403687, |
| "reward_std": 0.02215595170855522, |
| "rewards/answer_entity_reward": 0.9932383000850677, |
| "rewards/answer_wer_reward": 0.9202675223350525, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977494478225708, |
| "step": 296 |
| }, |
| { |
| "completion_length": 201.4375, |
| "epoch": 0.9504, |
| "grad_norm": 15.131966590881348, |
| "kl": 1.363037109375, |
| "learning_rate": 6.3e-07, |
| "loss": 0.0136, |
| "reward": 3.8845863342285156, |
| "reward_std": 0.025053692050278187, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9146546125411987, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9747393429279327, |
| "step": 297 |
| }, |
| { |
| "completion_length": 172.9375, |
| "epoch": 0.9536, |
| "grad_norm": 0.7034117579460144, |
| "kl": 0.114501953125, |
| "learning_rate": 6.2875e-07, |
| "loss": 0.0011, |
| "reward": 3.9505850076675415, |
| "reward_std": 0.004406077787280083, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9516552090644836, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989297986030579, |
| "step": 298 |
| }, |
| { |
| "completion_length": 226.03125, |
| "epoch": 0.9568, |
| "grad_norm": 10.005863189697266, |
| "kl": 0.099853515625, |
| "learning_rate": 6.274999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.78713595867157, |
| "reward_std": 0.118343286216259, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.8108388781547546, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9807842969894409, |
| "step": 299 |
| }, |
| { |
| "completion_length": 183.0, |
| "epoch": 0.96, |
| "grad_norm": 12.267927169799805, |
| "kl": 0.142578125, |
| "learning_rate": 6.262499999999999e-07, |
| "loss": 0.0014, |
| "reward": 3.7959177494049072, |
| "reward_std": 0.09426255617290735, |
| "rewards/answer_entity_reward": 0.9763257503509521, |
| "rewards/answer_wer_reward": 0.963774561882019, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8558174073696136, |
| "step": 300 |
| }, |
| { |
| "completion_length": 255.28125, |
| "epoch": 0.9632, |
| "grad_norm": 1.5198532342910767, |
| "kl": 0.0638427734375, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8590621948242188, |
| "reward_std": 0.05621089227497578, |
| "rewards/answer_entity_reward": 0.9652777910232544, |
| "rewards/answer_wer_reward": 0.8950084447860718, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987758696079254, |
| "step": 301 |
| }, |
| { |
| "completion_length": 231.09375, |
| "epoch": 0.9664, |
| "grad_norm": 2.063969135284424, |
| "kl": 0.0770263671875, |
| "learning_rate": 6.2375e-07, |
| "loss": 0.0008, |
| "reward": 3.8598477840423584, |
| "reward_std": 0.04335158132016659, |
| "rewards/answer_entity_reward": 0.9843385815620422, |
| "rewards/answer_wer_reward": 0.8991816341876984, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.976327657699585, |
| "step": 302 |
| }, |
| { |
| "completion_length": 214.1875, |
| "epoch": 0.9696, |
| "grad_norm": 4.762388706207275, |
| "kl": 0.09765625, |
| "learning_rate": 6.225000000000001e-07, |
| "loss": 0.001, |
| "reward": 3.86174213886261, |
| "reward_std": 0.03313549840822816, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9338361918926239, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9313782155513763, |
| "step": 303 |
| }, |
| { |
| "completion_length": 232.5625, |
| "epoch": 0.9728, |
| "grad_norm": 2.811995506286621, |
| "kl": 0.10595703125, |
| "learning_rate": 6.2125e-07, |
| "loss": 0.0011, |
| "reward": 3.732570767402649, |
| "reward_std": 0.14181919861584902, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9169972240924835, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.846823513507843, |
| "step": 304 |
| }, |
| { |
| "completion_length": 221.96875, |
| "epoch": 0.976, |
| "grad_norm": 2.424633741378784, |
| "kl": 0.0677490234375, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0007, |
| "reward": 3.9095277786254883, |
| "reward_std": 0.047814636724069715, |
| "rewards/answer_entity_reward": 0.9927884638309479, |
| "rewards/answer_wer_reward": 0.93398517370224, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9827540516853333, |
| "step": 305 |
| }, |
| { |
| "completion_length": 273.8125, |
| "epoch": 0.9792, |
| "grad_norm": 1.3363338708877563, |
| "kl": 0.0654296875, |
| "learning_rate": 6.1875e-07, |
| "loss": 0.0007, |
| "reward": 3.8615630865097046, |
| "reward_std": 0.029406235553324223, |
| "rewards/answer_entity_reward": 0.9869123697280884, |
| "rewards/answer_wer_reward": 0.8774734139442444, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9971773624420166, |
| "step": 306 |
| }, |
| { |
| "completion_length": 243.34375, |
| "epoch": 0.9824, |
| "grad_norm": 3.1950275897979736, |
| "kl": 0.05810546875, |
| "learning_rate": 6.175e-07, |
| "loss": 0.0006, |
| "reward": 3.898465633392334, |
| "reward_std": 0.022021150682121515, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.903068333864212, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998238205909729, |
| "step": 307 |
| }, |
| { |
| "completion_length": 230.375, |
| "epoch": 0.9856, |
| "grad_norm": 1.1819887161254883, |
| "kl": 0.075927734375, |
| "learning_rate": 6.162499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.9233819246292114, |
| "reward_std": 0.01652457471936941, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9294087886810303, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968140125274658, |
| "step": 308 |
| }, |
| { |
| "completion_length": 193.78125, |
| "epoch": 0.9888, |
| "grad_norm": 3.613255739212036, |
| "kl": 0.089111328125, |
| "learning_rate": 6.149999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9530293941497803, |
| "reward_std": 0.013143055606633425, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9536189138889313, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994103908538818, |
| "step": 309 |
| }, |
| { |
| "completion_length": 223.1875, |
| "epoch": 0.992, |
| "grad_norm": 2.9832558631896973, |
| "kl": 0.076904296875, |
| "learning_rate": 6.1375e-07, |
| "loss": 0.0008, |
| "reward": 3.9074047803878784, |
| "reward_std": 0.03526896797120571, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9134717583656311, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9939330518245697, |
| "step": 310 |
| }, |
| { |
| "completion_length": 202.78125, |
| "epoch": 0.9952, |
| "grad_norm": 1.6509346961975098, |
| "kl": 0.100830078125, |
| "learning_rate": 6.125000000000001e-07, |
| "loss": 0.001, |
| "reward": 3.897627115249634, |
| "reward_std": 0.025366032496094704, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9004680216312408, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 311 |
| }, |
| { |
| "completion_length": 227.75, |
| "epoch": 0.9984, |
| "grad_norm": 2.9892170429229736, |
| "kl": 0.091064453125, |
| "learning_rate": 6.1125e-07, |
| "loss": 0.0009, |
| "reward": 3.879219174385071, |
| "reward_std": 0.04558245837688446, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9074902236461639, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9802517294883728, |
| "step": 312 |
| }, |
| { |
| "completion_length": 165.125, |
| "epoch": 1.0, |
| "grad_norm": 1.1831876039505005, |
| "kl": 0.09814453125, |
| "learning_rate": 6.1e-07, |
| "loss": 0.0005, |
| "reward": 3.956197738647461, |
| "reward_std": 0.047231610864400864, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.985044002532959, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9711538553237915, |
| "step": 313 |
| }, |
| { |
| "completion_length": 194.875, |
| "epoch": 1.0032, |
| "grad_norm": 1.1336063146591187, |
| "kl": 0.10302734375, |
| "learning_rate": 6.0875e-07, |
| "loss": 0.001, |
| "reward": 3.955459713935852, |
| "reward_std": 0.010184567421674728, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9638065993785858, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9964607954025269, |
| "step": 314 |
| }, |
| { |
| "completion_length": 172.1875, |
| "epoch": 1.0064, |
| "grad_norm": 7.745497226715088, |
| "kl": 0.099609375, |
| "learning_rate": 6.075e-07, |
| "loss": 0.001, |
| "reward": 3.9203338623046875, |
| "reward_std": 0.005493420176208019, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9503339529037476, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9699999988079071, |
| "step": 315 |
| }, |
| { |
| "completion_length": 216.90625, |
| "epoch": 1.0096, |
| "grad_norm": 5.326587200164795, |
| "kl": 0.076904296875, |
| "learning_rate": 6.062499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.8242450952529907, |
| "reward_std": 0.04496973566710949, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9261577427387238, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8980873227119446, |
| "step": 316 |
| }, |
| { |
| "completion_length": 179.59375, |
| "epoch": 1.0128, |
| "grad_norm": 1.887527346611023, |
| "kl": 0.0675048828125, |
| "learning_rate": 6.049999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9317299127578735, |
| "reward_std": 0.023447751067578793, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9398273527622223, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963668584823608, |
| "step": 317 |
| }, |
| { |
| "completion_length": 215.1875, |
| "epoch": 1.016, |
| "grad_norm": 2.478510618209839, |
| "kl": 0.060791015625, |
| "learning_rate": 6.037499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8788411617279053, |
| "reward_std": 0.020661167800426483, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.8995265662670135, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9862589836120605, |
| "step": 318 |
| }, |
| { |
| "completion_length": 205.5, |
| "epoch": 1.0192, |
| "grad_norm": 1.7058178186416626, |
| "kl": 0.0830078125, |
| "learning_rate": 6.025000000000001e-07, |
| "loss": 0.0008, |
| "reward": 3.807918906211853, |
| "reward_std": 0.04822289012372494, |
| "rewards/answer_entity_reward": 0.9788461625576019, |
| "rewards/answer_wer_reward": 0.8715765476226807, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9574962258338928, |
| "step": 319 |
| }, |
| { |
| "completion_length": 242.46875, |
| "epoch": 1.0224, |
| "grad_norm": 1.7695921659469604, |
| "kl": 0.0859375, |
| "learning_rate": 6.0125e-07, |
| "loss": 0.0009, |
| "reward": 3.9255610704421997, |
| "reward_std": 0.019923360086977482, |
| "rewards/answer_entity_reward": 0.9906516969203949, |
| "rewards/answer_wer_reward": 0.9401695132255554, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9947398900985718, |
| "step": 320 |
| }, |
| { |
| "completion_length": 175.40625, |
| "epoch": 1.0256, |
| "grad_norm": 2.60329270362854, |
| "kl": 0.085693359375, |
| "learning_rate": 6e-07, |
| "loss": 0.0009, |
| "reward": 3.9218677282333374, |
| "reward_std": 0.008750536944717169, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9223886132240295, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994791746139526, |
| "step": 321 |
| }, |
| { |
| "completion_length": 157.875, |
| "epoch": 1.0288, |
| "grad_norm": 5.270680904388428, |
| "kl": 0.120361328125, |
| "learning_rate": 5.9875e-07, |
| "loss": 0.0012, |
| "reward": 3.8664562702178955, |
| "reward_std": 0.03370736539363861, |
| "rewards/answer_entity_reward": 0.9868055582046509, |
| "rewards/answer_wer_reward": 0.9486467838287354, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9310039281845093, |
| "step": 322 |
| }, |
| { |
| "completion_length": 202.15625, |
| "epoch": 1.032, |
| "grad_norm": 0.9677954316139221, |
| "kl": 0.072998046875, |
| "learning_rate": 5.975e-07, |
| "loss": 0.0007, |
| "reward": 3.9512887001037598, |
| "reward_std": 0.008498450508341193, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9516439437866211, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996448755264282, |
| "step": 323 |
| }, |
| { |
| "completion_length": 183.21875, |
| "epoch": 1.0352, |
| "grad_norm": 8.04370403289795, |
| "kl": 0.0908203125, |
| "learning_rate": 5.962499999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.810960531234741, |
| "reward_std": 0.017052859999239445, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9431954920291901, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8816540241241455, |
| "step": 324 |
| }, |
| { |
| "completion_length": 209.125, |
| "epoch": 1.0384, |
| "grad_norm": 1.1835105419158936, |
| "kl": 0.09326171875, |
| "learning_rate": 5.949999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9159555435180664, |
| "reward_std": 0.02768123522400856, |
| "rewards/answer_entity_reward": 0.9866695702075958, |
| "rewards/answer_wer_reward": 0.930209755897522, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990763068199158, |
| "step": 325 |
| }, |
| { |
| "completion_length": 202.15625, |
| "epoch": 1.0416, |
| "grad_norm": 1.198609471321106, |
| "kl": 0.0748291015625, |
| "learning_rate": 5.937499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.85296094417572, |
| "reward_std": 0.19228698359802365, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9154608845710754, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.96875, |
| "step": 326 |
| }, |
| { |
| "completion_length": 187.9375, |
| "epoch": 1.0448, |
| "grad_norm": 3.9246749877929688, |
| "kl": 0.08740234375, |
| "learning_rate": 5.925e-07, |
| "loss": 0.0009, |
| "reward": 3.8706984519958496, |
| "reward_std": 0.046023860573768616, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9316051602363586, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9443379342556, |
| "step": 327 |
| }, |
| { |
| "completion_length": 206.5625, |
| "epoch": 1.048, |
| "grad_norm": 2.1665873527526855, |
| "kl": 0.111083984375, |
| "learning_rate": 5.912500000000001e-07, |
| "loss": 0.0011, |
| "reward": 3.8563778400421143, |
| "reward_std": 0.02296618465334177, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9032285511493683, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9531491994857788, |
| "step": 328 |
| }, |
| { |
| "completion_length": 202.71875, |
| "epoch": 1.0512, |
| "grad_norm": 2.493177890777588, |
| "kl": 0.087646484375, |
| "learning_rate": 5.9e-07, |
| "loss": 0.0009, |
| "reward": 3.8221092224121094, |
| "reward_std": 0.13764610793441534, |
| "rewards/answer_entity_reward": 0.9418402910232544, |
| "rewards/answer_wer_reward": 0.8825558722019196, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977130889892578, |
| "step": 329 |
| }, |
| { |
| "completion_length": 200.0, |
| "epoch": 1.0544, |
| "grad_norm": 1.2568529844284058, |
| "kl": 0.114013671875, |
| "learning_rate": 5.8875e-07, |
| "loss": 0.0011, |
| "reward": 3.934491515159607, |
| "reward_std": 0.012761063873767853, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9390542805194855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9954372346401215, |
| "step": 330 |
| }, |
| { |
| "completion_length": 212.71875, |
| "epoch": 1.0576, |
| "grad_norm": 1.3623089790344238, |
| "kl": 0.086669921875, |
| "learning_rate": 5.875e-07, |
| "loss": 0.0009, |
| "reward": 3.8928335905075073, |
| "reward_std": 0.03161040600389242, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.8996903300285339, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994563460350037, |
| "step": 331 |
| }, |
| { |
| "completion_length": 240.3125, |
| "epoch": 1.0608, |
| "grad_norm": 1.2754676342010498, |
| "kl": 0.0615234375, |
| "learning_rate": 5.8625e-07, |
| "loss": 0.0006, |
| "reward": 3.925002932548523, |
| "reward_std": 0.0067287166602909565, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9267281293869019, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982748329639435, |
| "step": 332 |
| }, |
| { |
| "completion_length": 218.6875, |
| "epoch": 1.064, |
| "grad_norm": 1.989392638206482, |
| "kl": 0.073486328125, |
| "learning_rate": 5.849999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9305100440979004, |
| "reward_std": 0.014313624240458012, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9314764738082886, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999033510684967, |
| "step": 333 |
| }, |
| { |
| "completion_length": 187.125, |
| "epoch": 1.0672, |
| "grad_norm": 4.332698822021484, |
| "kl": 0.11474609375, |
| "learning_rate": 5.837499999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.9111961126327515, |
| "reward_std": 0.017924371175467968, |
| "rewards/answer_entity_reward": 0.9967105388641357, |
| "rewards/answer_wer_reward": 0.9153991043567657, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990865290164948, |
| "step": 334 |
| }, |
| { |
| "completion_length": 240.8125, |
| "epoch": 1.0704, |
| "grad_norm": 0.991020143032074, |
| "kl": 0.0609130859375, |
| "learning_rate": 5.825e-07, |
| "loss": 0.0006, |
| "reward": 3.9502662420272827, |
| "reward_std": 0.006167408544570208, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9529542922973633, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973118305206299, |
| "step": 335 |
| }, |
| { |
| "completion_length": 250.625, |
| "epoch": 1.0735999999999999, |
| "grad_norm": 2.3996546268463135, |
| "kl": 0.06396484375, |
| "learning_rate": 5.8125e-07, |
| "loss": 0.0006, |
| "reward": 3.899760365486145, |
| "reward_std": 0.02179525839164853, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9130350351333618, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9867254495620728, |
| "step": 336 |
| }, |
| { |
| "completion_length": 193.9375, |
| "epoch": 1.0768, |
| "grad_norm": 3.6998724937438965, |
| "kl": 0.090576171875, |
| "learning_rate": 5.8e-07, |
| "loss": 0.0009, |
| "reward": 3.8309794664382935, |
| "reward_std": 0.01553899934515357, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9471099972724915, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.895233154296875, |
| "step": 337 |
| }, |
| { |
| "completion_length": 221.90625, |
| "epoch": 1.08, |
| "grad_norm": 1.1334843635559082, |
| "kl": 0.0587158203125, |
| "learning_rate": 5.7875e-07, |
| "loss": 0.0006, |
| "reward": 3.936136484146118, |
| "reward_std": 0.012863298412412405, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9376117587089539, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985246956348419, |
| "step": 338 |
| }, |
| { |
| "completion_length": 242.125, |
| "epoch": 1.0832, |
| "grad_norm": 1.0358681678771973, |
| "kl": 0.0643310546875, |
| "learning_rate": 5.775e-07, |
| "loss": 0.0007, |
| "reward": 3.887587547302246, |
| "reward_std": 0.0230812830850482, |
| "rewards/answer_entity_reward": 0.9798610806465149, |
| "rewards/answer_wer_reward": 0.9077264070510864, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 339 |
| }, |
| { |
| "completion_length": 213.9375, |
| "epoch": 1.0864, |
| "grad_norm": 24.39422035217285, |
| "kl": 0.080078125, |
| "learning_rate": 5.7625e-07, |
| "loss": 0.0008, |
| "reward": 3.887939691543579, |
| "reward_std": 0.014108296483755112, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8971990048885345, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9907407462596893, |
| "step": 340 |
| }, |
| { |
| "completion_length": 205.375, |
| "epoch": 1.0896, |
| "grad_norm": 1.204923152923584, |
| "kl": 0.1015625, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.819010019302368, |
| "reward_std": 0.24664557841606438, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9135412275791168, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.936718761920929, |
| "step": 341 |
| }, |
| { |
| "completion_length": 231.09375, |
| "epoch": 1.0928, |
| "grad_norm": 0.831721842288971, |
| "kl": 0.06884765625, |
| "learning_rate": 5.737499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9083417654037476, |
| "reward_std": 0.023847888689488173, |
| "rewards/answer_entity_reward": 0.9902146458625793, |
| "rewards/answer_wer_reward": 0.9184364974498749, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999690592288971, |
| "step": 342 |
| }, |
| { |
| "completion_length": 225.28125, |
| "epoch": 1.096, |
| "grad_norm": 1.239318609237671, |
| "kl": 0.070068359375, |
| "learning_rate": 5.725e-07, |
| "loss": 0.0007, |
| "reward": 3.8802337646484375, |
| "reward_std": 0.019388118293136358, |
| "rewards/answer_entity_reward": 0.9895833730697632, |
| "rewards/answer_wer_reward": 0.8906503319740295, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 343 |
| }, |
| { |
| "completion_length": 182.25, |
| "epoch": 1.0992, |
| "grad_norm": 2.810415267944336, |
| "kl": 0.08349609375, |
| "learning_rate": 5.7125e-07, |
| "loss": 0.0008, |
| "reward": 3.8992663621902466, |
| "reward_std": 0.017442656215280294, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9195939302444458, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9825133979320526, |
| "step": 344 |
| }, |
| { |
| "completion_length": 228.9375, |
| "epoch": 1.1024, |
| "grad_norm": 2.4584133625030518, |
| "kl": 0.11376953125, |
| "learning_rate": 5.699999999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.893067240715027, |
| "reward_std": 0.024248626083135605, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9001834988594055, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963560402393341, |
| "step": 345 |
| }, |
| { |
| "completion_length": 154.46875, |
| "epoch": 1.1056, |
| "grad_norm": 2.5888006687164307, |
| "kl": 0.1025390625, |
| "learning_rate": 5.6875e-07, |
| "loss": 0.001, |
| "reward": 3.8254867792129517, |
| "reward_std": 0.031096864491701126, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9297608137130737, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8957259356975555, |
| "step": 346 |
| }, |
| { |
| "completion_length": 174.0625, |
| "epoch": 1.1088, |
| "grad_norm": 2.087509870529175, |
| "kl": 0.12158203125, |
| "learning_rate": 5.675e-07, |
| "loss": 0.0012, |
| "reward": 3.920476198196411, |
| "reward_std": 0.017223183065652847, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9334003627300262, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9870758056640625, |
| "step": 347 |
| }, |
| { |
| "completion_length": 209.3125, |
| "epoch": 1.112, |
| "grad_norm": 1.5391756296157837, |
| "kl": 0.105712890625, |
| "learning_rate": 5.6625e-07, |
| "loss": 0.0011, |
| "reward": 3.9325058460235596, |
| "reward_std": 0.011998760513961315, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9345271587371826, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979787170886993, |
| "step": 348 |
| }, |
| { |
| "completion_length": 211.9375, |
| "epoch": 1.1152, |
| "grad_norm": 2.1449012756347656, |
| "kl": 0.072021484375, |
| "learning_rate": 5.649999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.887805461883545, |
| "reward_std": 0.01465547364205122, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.8914407789707184, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998202919960022, |
| "step": 349 |
| }, |
| { |
| "completion_length": 219.875, |
| "epoch": 1.1184, |
| "grad_norm": 2.7394628524780273, |
| "kl": 0.065185546875, |
| "learning_rate": 5.637499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.905122399330139, |
| "reward_std": 0.014080648310482502, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9162788391113281, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9888435006141663, |
| "step": 350 |
| }, |
| { |
| "completion_length": 191.28125, |
| "epoch": 1.1216, |
| "grad_norm": 2.381448745727539, |
| "kl": 0.0721435546875, |
| "learning_rate": 5.625e-07, |
| "loss": 0.0007, |
| "reward": 3.8880510330200195, |
| "reward_std": 0.04133735504001379, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9348196983337402, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9576955437660217, |
| "step": 351 |
| }, |
| { |
| "completion_length": 263.5625, |
| "epoch": 1.1248, |
| "grad_norm": 1.0376274585723877, |
| "kl": 0.0584716796875, |
| "learning_rate": 5.6125e-07, |
| "loss": 0.0006, |
| "reward": 3.8982614278793335, |
| "reward_std": 0.012545288074761629, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9007040560245514, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999395489692688, |
| "step": 352 |
| }, |
| { |
| "completion_length": 218.59375, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 1.5081944465637207, |
| "kl": 0.10009765625, |
| "learning_rate": 5.6e-07, |
| "loss": 0.001, |
| "reward": 3.9146311283111572, |
| "reward_std": 0.021717723459005356, |
| "rewards/answer_entity_reward": 0.9917200803756714, |
| "rewards/answer_wer_reward": 0.9235903024673462, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993206560611725, |
| "step": 353 |
| }, |
| { |
| "completion_length": 226.84375, |
| "epoch": 1.1312, |
| "grad_norm": 1.0990034341812134, |
| "kl": 0.063720703125, |
| "learning_rate": 5.587499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9005931615829468, |
| "reward_std": 0.018239760771393776, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9203313589096069, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9874734580516815, |
| "step": 354 |
| }, |
| { |
| "completion_length": 238.59375, |
| "epoch": 1.1344, |
| "grad_norm": 10.765813827514648, |
| "kl": 0.056884765625, |
| "learning_rate": 5.575e-07, |
| "loss": 0.0006, |
| "reward": 3.9274662733078003, |
| "reward_std": 0.016329116653651, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9380079507827759, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9894582629203796, |
| "step": 355 |
| }, |
| { |
| "completion_length": 216.1875, |
| "epoch": 1.1376, |
| "grad_norm": 6.097777843475342, |
| "kl": 0.43701171875, |
| "learning_rate": 5.5625e-07, |
| "loss": 0.0044, |
| "reward": 3.6753621101379395, |
| "reward_std": 0.09127287194132805, |
| "rewards/answer_entity_reward": 0.9843385517597198, |
| "rewards/answer_wer_reward": 0.9279595017433167, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.7630640268325806, |
| "step": 356 |
| }, |
| { |
| "completion_length": 233.25, |
| "epoch": 1.1408, |
| "grad_norm": 1.9484727382659912, |
| "kl": 0.07470703125, |
| "learning_rate": 5.55e-07, |
| "loss": 0.0007, |
| "reward": 3.8734058141708374, |
| "reward_std": 0.026476514525711536, |
| "rewards/answer_entity_reward": 0.9829545617103577, |
| "rewards/answer_wer_reward": 0.906408816576004, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9840425550937653, |
| "step": 357 |
| }, |
| { |
| "completion_length": 224.125, |
| "epoch": 1.144, |
| "grad_norm": 1.650207757949829, |
| "kl": 0.071533203125, |
| "learning_rate": 5.5375e-07, |
| "loss": 0.0007, |
| "reward": 3.9309768676757812, |
| "reward_std": 0.016152822878211737, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9357885122299194, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986605942249298, |
| "step": 358 |
| }, |
| { |
| "completion_length": 202.8125, |
| "epoch": 1.1472, |
| "grad_norm": 2.33708119392395, |
| "kl": 0.102294921875, |
| "learning_rate": 5.525e-07, |
| "loss": 0.001, |
| "reward": 3.901100993156433, |
| "reward_std": 0.06198639050126076, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9566735327243805, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9444274306297302, |
| "step": 359 |
| }, |
| { |
| "completion_length": 231.03125, |
| "epoch": 1.1504, |
| "grad_norm": 2.603564977645874, |
| "kl": 0.0662841796875, |
| "learning_rate": 5.5125e-07, |
| "loss": 0.0007, |
| "reward": 3.8539780378341675, |
| "reward_std": 0.04134450480341911, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.8810023069381714, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.975816547870636, |
| "step": 360 |
| }, |
| { |
| "completion_length": 176.8125, |
| "epoch": 1.1536, |
| "grad_norm": 1.9730738401412964, |
| "kl": 0.0673828125, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0007, |
| "reward": 3.946772813796997, |
| "reward_std": 0.007931779837235808, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9499374032020569, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968354403972626, |
| "step": 361 |
| }, |
| { |
| "completion_length": 205.28125, |
| "epoch": 1.1568, |
| "grad_norm": 2.6627304553985596, |
| "kl": 0.0997314453125, |
| "learning_rate": 5.487499999999999e-07, |
| "loss": 0.001, |
| "reward": 3.914576292037964, |
| "reward_std": 0.015826540999114513, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9475591778755188, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9670170843601227, |
| "step": 362 |
| }, |
| { |
| "completion_length": 199.9375, |
| "epoch": 1.16, |
| "grad_norm": 2.073272466659546, |
| "kl": 0.091064453125, |
| "learning_rate": 5.474999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.89456570148468, |
| "reward_std": 0.008259527385234833, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9333997070789337, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9611659348011017, |
| "step": 363 |
| }, |
| { |
| "completion_length": 222.0625, |
| "epoch": 1.1632, |
| "grad_norm": 1.7804555892944336, |
| "kl": 0.1220703125, |
| "learning_rate": 5.4625e-07, |
| "loss": 0.0012, |
| "reward": 3.847594380378723, |
| "reward_std": 0.09885499440133572, |
| "rewards/answer_entity_reward": 0.9692708849906921, |
| "rewards/answer_wer_reward": 0.8783235251903534, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 364 |
| }, |
| { |
| "completion_length": 206.71875, |
| "epoch": 1.1663999999999999, |
| "grad_norm": 1.6756658554077148, |
| "kl": 0.097900390625, |
| "learning_rate": 5.45e-07, |
| "loss": 0.001, |
| "reward": 3.866326928138733, |
| "reward_std": 0.027653913479298353, |
| "rewards/answer_entity_reward": 0.990705132484436, |
| "rewards/answer_wer_reward": 0.9324296712875366, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9431920945644379, |
| "step": 365 |
| }, |
| { |
| "completion_length": 187.125, |
| "epoch": 1.1696, |
| "grad_norm": 1.6528626680374146, |
| "kl": 0.075439453125, |
| "learning_rate": 5.4375e-07, |
| "loss": 0.0008, |
| "reward": 3.821729063987732, |
| "reward_std": 0.14681637566536665, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.896637350320816, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9611493647098541, |
| "step": 366 |
| }, |
| { |
| "completion_length": 180.5625, |
| "epoch": 1.1728, |
| "grad_norm": 2.211965560913086, |
| "kl": 0.10302734375, |
| "learning_rate": 5.425e-07, |
| "loss": 0.001, |
| "reward": 3.857783317565918, |
| "reward_std": 0.13934296648949385, |
| "rewards/answer_entity_reward": 0.9847756326198578, |
| "rewards/answer_wer_reward": 0.9358752965927124, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9683823585510254, |
| "step": 367 |
| }, |
| { |
| "completion_length": 208.21875, |
| "epoch": 1.176, |
| "grad_norm": 2.522264242172241, |
| "kl": 0.060546875, |
| "learning_rate": 5.4125e-07, |
| "loss": 0.0006, |
| "reward": 3.8018884658813477, |
| "reward_std": 0.07955996971577406, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.8091042637825012, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962564706802368, |
| "step": 368 |
| }, |
| { |
| "completion_length": 192.84375, |
| "epoch": 1.1792, |
| "grad_norm": 1.4488089084625244, |
| "kl": 0.0791015625, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0008, |
| "reward": 3.940070152282715, |
| "reward_std": 0.008247917518019676, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9410351514816284, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999035120010376, |
| "step": 369 |
| }, |
| { |
| "completion_length": 244.96875, |
| "epoch": 1.1824, |
| "grad_norm": 5.085299968719482, |
| "kl": 0.109130859375, |
| "learning_rate": 5.387499999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.834069848060608, |
| "reward_std": 0.027521015144884586, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.907810240983963, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9319414496421814, |
| "step": 370 |
| }, |
| { |
| "completion_length": 223.25, |
| "epoch": 1.1856, |
| "grad_norm": 2.248169183731079, |
| "kl": 0.1083984375, |
| "learning_rate": 5.374999999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.9311490058898926, |
| "reward_std": 0.011384843150153756, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.931148886680603, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 371 |
| }, |
| { |
| "completion_length": 237.3125, |
| "epoch": 1.1888, |
| "grad_norm": 1.0549304485321045, |
| "kl": 0.05419921875, |
| "learning_rate": 5.3625e-07, |
| "loss": 0.0005, |
| "reward": 3.890028476715088, |
| "reward_std": 0.012344780378043652, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8915461599826813, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984822571277618, |
| "step": 372 |
| }, |
| { |
| "completion_length": 216.0625, |
| "epoch": 1.192, |
| "grad_norm": 1.3054077625274658, |
| "kl": 0.0694580078125, |
| "learning_rate": 5.35e-07, |
| "loss": 0.0007, |
| "reward": 3.8679678440093994, |
| "reward_std": 0.016808426938951015, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8875625133514404, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.980405330657959, |
| "step": 373 |
| }, |
| { |
| "completion_length": 222.34375, |
| "epoch": 1.1952, |
| "grad_norm": 10.381876945495605, |
| "kl": 0.067626953125, |
| "learning_rate": 5.3375e-07, |
| "loss": 0.0007, |
| "reward": 3.946020483970642, |
| "reward_std": 0.016021378338336945, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9507038593292236, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977203905582428, |
| "step": 374 |
| }, |
| { |
| "completion_length": 208.875, |
| "epoch": 1.1984, |
| "grad_norm": 2.7493553161621094, |
| "kl": 0.13525390625, |
| "learning_rate": 5.325e-07, |
| "loss": 0.0014, |
| "reward": 3.942535161972046, |
| "reward_std": 0.01458098879083991, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.9490944147109985, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997539520263672, |
| "step": 375 |
| }, |
| { |
| "completion_length": 252.09375, |
| "epoch": 1.2016, |
| "grad_norm": 1.9127050638198853, |
| "kl": 0.079345703125, |
| "learning_rate": 5.3125e-07, |
| "loss": 0.0008, |
| "reward": 3.8897405862808228, |
| "reward_std": 0.015877339988946915, |
| "rewards/answer_entity_reward": 0.9888257682323456, |
| "rewards/answer_wer_reward": 0.9078421294689178, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9930725991725922, |
| "step": 376 |
| }, |
| { |
| "completion_length": 186.375, |
| "epoch": 1.2048, |
| "grad_norm": 1.832676887512207, |
| "kl": 0.096435546875, |
| "learning_rate": 5.3e-07, |
| "loss": 0.001, |
| "reward": 3.9009323120117188, |
| "reward_std": 0.013205710332840681, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.940411388874054, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9653284549713135, |
| "step": 377 |
| }, |
| { |
| "completion_length": 224.46875, |
| "epoch": 1.208, |
| "grad_norm": 1.1020106077194214, |
| "kl": 0.0638427734375, |
| "learning_rate": 5.2875e-07, |
| "loss": 0.0006, |
| "reward": 3.9539231061935425, |
| "reward_std": 0.005315458634868264, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9545543491840363, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993686676025391, |
| "step": 378 |
| }, |
| { |
| "completion_length": 158.25, |
| "epoch": 1.2112, |
| "grad_norm": 2.493016481399536, |
| "kl": 0.123779296875, |
| "learning_rate": 5.274999999999999e-07, |
| "loss": 0.0012, |
| "reward": 3.921034097671509, |
| "reward_std": 0.009559540543705225, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9532065689563751, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9678275287151337, |
| "step": 379 |
| }, |
| { |
| "completion_length": 253.4375, |
| "epoch": 1.2144, |
| "grad_norm": 1.1055541038513184, |
| "kl": 0.067626953125, |
| "learning_rate": 5.262499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.8998262882232666, |
| "reward_std": 0.021630683913826942, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9026672542095184, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 380 |
| }, |
| { |
| "completion_length": 211.28125, |
| "epoch": 1.2176, |
| "grad_norm": 2.4898200035095215, |
| "kl": 0.072998046875, |
| "learning_rate": 5.25e-07, |
| "loss": 0.0007, |
| "reward": 3.8961129188537598, |
| "reward_std": 0.02530479012057185, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9441809356212616, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9547730088233948, |
| "step": 381 |
| }, |
| { |
| "completion_length": 241.34375, |
| "epoch": 1.2208, |
| "grad_norm": 1.5863702297210693, |
| "kl": 0.09033203125, |
| "learning_rate": 5.237500000000001e-07, |
| "loss": 0.0009, |
| "reward": 3.9048832654953003, |
| "reward_std": 0.02675863727927208, |
| "rewards/answer_entity_reward": 0.9836346209049225, |
| "rewards/answer_wer_reward": 0.9218496978282928, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993990361690521, |
| "step": 382 |
| }, |
| { |
| "completion_length": 244.90625, |
| "epoch": 1.224, |
| "grad_norm": 1.3265018463134766, |
| "kl": 0.08984375, |
| "learning_rate": 5.225e-07, |
| "loss": 0.0009, |
| "reward": 3.9047261476516724, |
| "reward_std": 0.013275579549372196, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9047262072563171, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 383 |
| }, |
| { |
| "completion_length": 192.3125, |
| "epoch": 1.2272, |
| "grad_norm": 2.3593811988830566, |
| "kl": 0.09521484375, |
| "learning_rate": 5.2125e-07, |
| "loss": 0.0009, |
| "reward": 3.893195629119873, |
| "reward_std": 0.03080725111067295, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9534947872161865, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9441652297973633, |
| "step": 384 |
| }, |
| { |
| "completion_length": 218.65625, |
| "epoch": 1.2304, |
| "grad_norm": 2.7099356651306152, |
| "kl": 0.06982421875, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0007, |
| "reward": 3.8559422492980957, |
| "reward_std": 0.0489511676132679, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9215229749679565, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9344193935394287, |
| "step": 385 |
| }, |
| { |
| "completion_length": 168.375, |
| "epoch": 1.2336, |
| "grad_norm": 3.930095672607422, |
| "kl": 0.109130859375, |
| "learning_rate": 5.1875e-07, |
| "loss": 0.0011, |
| "reward": 3.848017930984497, |
| "reward_std": 0.043564099818468094, |
| "rewards/answer_entity_reward": 0.96875, |
| "rewards/answer_wer_reward": 0.9368032217025757, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9424647688865662, |
| "step": 386 |
| }, |
| { |
| "completion_length": 183.1875, |
| "epoch": 1.2368000000000001, |
| "grad_norm": 7.302414894104004, |
| "kl": 0.1279296875, |
| "learning_rate": 5.174999999999999e-07, |
| "loss": 0.0013, |
| "reward": 3.7856842279434204, |
| "reward_std": 0.026621405966579914, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9465668201446533, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8460619151592255, |
| "step": 387 |
| }, |
| { |
| "completion_length": 246.0, |
| "epoch": 1.24, |
| "grad_norm": 1.0175095796585083, |
| "kl": 0.06591796875, |
| "learning_rate": 5.162499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.923374652862549, |
| "reward_std": 0.011706824880093336, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9246262907981873, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987484514713287, |
| "step": 388 |
| }, |
| { |
| "completion_length": 214.34375, |
| "epoch": 1.2432, |
| "grad_norm": 0.9391213655471802, |
| "kl": 0.072021484375, |
| "learning_rate": 5.149999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9639917612075806, |
| "reward_std": 0.009625846752896905, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9695361256599426, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992632865905762, |
| "step": 389 |
| }, |
| { |
| "completion_length": 237.8125, |
| "epoch": 1.2464, |
| "grad_norm": 1.1664483547210693, |
| "kl": 0.07568359375, |
| "learning_rate": 5.137500000000001e-07, |
| "loss": 0.0008, |
| "reward": 3.935038685798645, |
| "reward_std": 0.018754366785287857, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9395028948783875, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 390 |
| }, |
| { |
| "completion_length": 221.09375, |
| "epoch": 1.2496, |
| "grad_norm": 1.0274744033813477, |
| "kl": 0.06591796875, |
| "learning_rate": 5.125e-07, |
| "loss": 0.0007, |
| "reward": 3.9391175508499146, |
| "reward_std": 0.008871730417013168, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9511756002902985, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993055462837219, |
| "step": 391 |
| }, |
| { |
| "completion_length": 216.53125, |
| "epoch": 1.2528000000000001, |
| "grad_norm": 1.4062410593032837, |
| "kl": 0.0712890625, |
| "learning_rate": 5.1125e-07, |
| "loss": 0.0007, |
| "reward": 3.8631064891815186, |
| "reward_std": 0.02681769710034132, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.9219101965427399, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9516128897666931, |
| "step": 392 |
| }, |
| { |
| "completion_length": 141.9375, |
| "epoch": 1.256, |
| "grad_norm": 9.963582038879395, |
| "kl": 0.12841796875, |
| "learning_rate": 5.1e-07, |
| "loss": 0.0013, |
| "reward": 3.886857271194458, |
| "reward_std": 0.011839461978524923, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9376890957355499, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9520089328289032, |
| "step": 393 |
| }, |
| { |
| "completion_length": 223.75, |
| "epoch": 1.2591999999999999, |
| "grad_norm": 3.129469156265259, |
| "kl": 0.06201171875, |
| "learning_rate": 5.0875e-07, |
| "loss": 0.0006, |
| "reward": 3.8934308290481567, |
| "reward_std": 0.04124835692346096, |
| "rewards/answer_entity_reward": 0.9847756326198578, |
| "rewards/answer_wer_reward": 0.9095006585121155, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991544783115387, |
| "step": 394 |
| }, |
| { |
| "completion_length": 194.21875, |
| "epoch": 1.2624, |
| "grad_norm": 8.187355995178223, |
| "kl": 0.0849609375, |
| "learning_rate": 5.074999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.8118830919265747, |
| "reward_std": 0.03198861540295184, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8248356580734253, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9870474934577942, |
| "step": 395 |
| }, |
| { |
| "completion_length": 216.0625, |
| "epoch": 1.2656, |
| "grad_norm": 1.9981720447540283, |
| "kl": 0.08349609375, |
| "learning_rate": 5.062499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.876628041267395, |
| "reward_std": 0.030061259865760803, |
| "rewards/answer_entity_reward": 0.9899572730064392, |
| "rewards/answer_wer_reward": 0.9325210154056549, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9541498124599457, |
| "step": 396 |
| }, |
| { |
| "completion_length": 244.9375, |
| "epoch": 1.2688, |
| "grad_norm": 1.46060311794281, |
| "kl": 0.08740234375, |
| "learning_rate": 5.049999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9221689701080322, |
| "reward_std": 0.016801749356091022, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9224453568458557, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997234344482422, |
| "step": 397 |
| }, |
| { |
| "completion_length": 171.78125, |
| "epoch": 1.272, |
| "grad_norm": 2.054922342300415, |
| "kl": 0.116455078125, |
| "learning_rate": 5.0375e-07, |
| "loss": 0.0012, |
| "reward": 3.922398328781128, |
| "reward_std": 0.015158042311668396, |
| "rewards/answer_entity_reward": 0.9818181991577148, |
| "rewards/answer_wer_reward": 0.9412411153316498, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993391633033752, |
| "step": 398 |
| }, |
| { |
| "completion_length": 223.6875, |
| "epoch": 1.2752, |
| "grad_norm": 4.638472557067871, |
| "kl": 0.0859375, |
| "learning_rate": 5.025e-07, |
| "loss": 0.0009, |
| "reward": 3.928803563117981, |
| "reward_std": 0.015867930836975574, |
| "rewards/answer_entity_reward": 0.9790209829807281, |
| "rewards/answer_wer_reward": 0.9508891105651855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988935589790344, |
| "step": 399 |
| }, |
| { |
| "completion_length": 180.40625, |
| "epoch": 1.2784, |
| "grad_norm": 17.943954467773438, |
| "kl": 0.09228515625, |
| "learning_rate": 5.0125e-07, |
| "loss": 0.0009, |
| "reward": 3.918807029724121, |
| "reward_std": 0.010303683578968048, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9271402955055237, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 400 |
| }, |
| { |
| "completion_length": 208.59375, |
| "epoch": 1.2816, |
| "grad_norm": 2.634068489074707, |
| "kl": 0.10400390625, |
| "learning_rate": 5e-07, |
| "loss": 0.001, |
| "reward": 3.825340986251831, |
| "reward_std": 0.0303196981549263, |
| "rewards/answer_entity_reward": 0.9871430397033691, |
| "rewards/answer_wer_reward": 0.9115504324436188, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9266475439071655, |
| "step": 401 |
| }, |
| { |
| "completion_length": 203.25, |
| "epoch": 1.2848, |
| "grad_norm": 1.149072289466858, |
| "kl": 0.066162109375, |
| "learning_rate": 4.9875e-07, |
| "loss": 0.0007, |
| "reward": 3.9346535205841064, |
| "reward_std": 0.009479325264692307, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9361503720283508, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985032677650452, |
| "step": 402 |
| }, |
| { |
| "completion_length": 214.875, |
| "epoch": 1.288, |
| "grad_norm": 1.2013689279556274, |
| "kl": 0.08447265625, |
| "learning_rate": 4.975e-07, |
| "loss": 0.0008, |
| "reward": 3.8625338077545166, |
| "reward_std": 0.012592533603310585, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9030886590480804, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9677785038948059, |
| "step": 403 |
| }, |
| { |
| "completion_length": 252.78125, |
| "epoch": 1.2912, |
| "grad_norm": 1.6769248247146606, |
| "kl": 0.066162109375, |
| "learning_rate": 4.9625e-07, |
| "loss": 0.0007, |
| "reward": 3.8860517740249634, |
| "reward_std": 0.034614769741892815, |
| "rewards/answer_entity_reward": 0.9836356937885284, |
| "rewards/answer_wer_reward": 0.9036450088024139, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987711310386658, |
| "step": 404 |
| }, |
| { |
| "completion_length": 213.96875, |
| "epoch": 1.2944, |
| "grad_norm": 1.5894328355789185, |
| "kl": 0.069580078125, |
| "learning_rate": 4.95e-07, |
| "loss": 0.0007, |
| "reward": 3.921362280845642, |
| "reward_std": 0.014703459106385708, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.92447629570961, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992897808551788, |
| "step": 405 |
| }, |
| { |
| "completion_length": 242.53125, |
| "epoch": 1.2976, |
| "grad_norm": 3.458373785018921, |
| "kl": 0.1416015625, |
| "learning_rate": 4.9375e-07, |
| "loss": 0.0014, |
| "reward": 3.7037495374679565, |
| "reward_std": 0.1908966824412346, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.8811471164226532, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.85972860455513, |
| "step": 406 |
| }, |
| { |
| "completion_length": 187.53125, |
| "epoch": 1.3008, |
| "grad_norm": 7.737911224365234, |
| "kl": 0.107666015625, |
| "learning_rate": 4.924999999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.9244754314422607, |
| "reward_std": 0.021069620735943317, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9345695376396179, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9933781623840332, |
| "step": 407 |
| }, |
| { |
| "completion_length": 208.4375, |
| "epoch": 1.304, |
| "grad_norm": 2.0846338272094727, |
| "kl": 0.165771484375, |
| "learning_rate": 4.9125e-07, |
| "loss": 0.0017, |
| "reward": 3.9408286809921265, |
| "reward_std": 0.011463565286248922, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9510546028614044, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9897740483283997, |
| "step": 408 |
| }, |
| { |
| "completion_length": 198.28125, |
| "epoch": 1.3072, |
| "grad_norm": 2.9788646697998047, |
| "kl": 0.089111328125, |
| "learning_rate": 4.9e-07, |
| "loss": 0.0009, |
| "reward": 3.900764584541321, |
| "reward_std": 0.03450075723230839, |
| "rewards/answer_entity_reward": 0.9874475002288818, |
| "rewards/answer_wer_reward": 0.9153684377670288, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979486465454102, |
| "step": 409 |
| }, |
| { |
| "completion_length": 176.28125, |
| "epoch": 1.3104, |
| "grad_norm": 2.856952667236328, |
| "kl": 0.09228515625, |
| "learning_rate": 4.8875e-07, |
| "loss": 0.0009, |
| "reward": 3.9486716985702515, |
| "reward_std": 0.021902556531131268, |
| "rewards/answer_entity_reward": 0.9912830293178558, |
| "rewards/answer_wer_reward": 0.9610228836536407, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963657557964325, |
| "step": 410 |
| }, |
| { |
| "completion_length": 209.28125, |
| "epoch": 1.3136, |
| "grad_norm": 2.0441436767578125, |
| "kl": 0.08642578125, |
| "learning_rate": 4.875e-07, |
| "loss": 0.0009, |
| "reward": 3.916486144065857, |
| "reward_std": 0.018760663457214832, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9207533895969391, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994091391563416, |
| "step": 411 |
| }, |
| { |
| "completion_length": 232.21875, |
| "epoch": 1.3168, |
| "grad_norm": 1.071075201034546, |
| "kl": 0.0633544921875, |
| "learning_rate": 4.8625e-07, |
| "loss": 0.0006, |
| "reward": 3.9308619499206543, |
| "reward_std": 0.018531675916165113, |
| "rewards/answer_entity_reward": 0.9893162250518799, |
| "rewards/answer_wer_reward": 0.943993479013443, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975521564483643, |
| "step": 412 |
| }, |
| { |
| "completion_length": 210.0625, |
| "epoch": 1.32, |
| "grad_norm": 3.82405686378479, |
| "kl": 0.09326171875, |
| "learning_rate": 4.85e-07, |
| "loss": 0.0009, |
| "reward": 3.889458179473877, |
| "reward_std": 0.02208129083737731, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9473488032817841, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9449501633644104, |
| "step": 413 |
| }, |
| { |
| "completion_length": 197.15625, |
| "epoch": 1.3232, |
| "grad_norm": 1.4103983640670776, |
| "kl": 0.0849609375, |
| "learning_rate": 4.8375e-07, |
| "loss": 0.0009, |
| "reward": 3.9459102153778076, |
| "reward_std": 0.014464881271123886, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9481469988822937, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977630972862244, |
| "step": 414 |
| }, |
| { |
| "completion_length": 239.9375, |
| "epoch": 1.3264, |
| "grad_norm": 1.4598060846328735, |
| "kl": 0.06982421875, |
| "learning_rate": 4.824999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.862109899520874, |
| "reward_std": 0.07382148411124945, |
| "rewards/answer_entity_reward": 0.9833333194255829, |
| "rewards/answer_wer_reward": 0.9100264310836792, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.96875, |
| "step": 415 |
| }, |
| { |
| "completion_length": 184.3125, |
| "epoch": 1.3296000000000001, |
| "grad_norm": 0.6735196709632874, |
| "kl": 0.063720703125, |
| "learning_rate": 4.812499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8697965145111084, |
| "reward_std": 0.18503482337109745, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.932296484708786, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.96875, |
| "step": 416 |
| }, |
| { |
| "completion_length": 172.1875, |
| "epoch": 1.3328, |
| "grad_norm": 1.7613649368286133, |
| "kl": 0.11962890625, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0012, |
| "reward": 3.938371181488037, |
| "reward_std": 0.020422414876520634, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9546558260917664, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.988179475069046, |
| "step": 417 |
| }, |
| { |
| "completion_length": 223.09375, |
| "epoch": 1.336, |
| "grad_norm": 3.332552671432495, |
| "kl": 0.12841796875, |
| "learning_rate": 4.7875e-07, |
| "loss": 0.0013, |
| "reward": 3.9398679733276367, |
| "reward_std": 0.018179779406636953, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9447188973426819, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979900419712067, |
| "step": 418 |
| }, |
| { |
| "completion_length": 226.5, |
| "epoch": 1.3392, |
| "grad_norm": 1.4374769926071167, |
| "kl": 0.083984375, |
| "learning_rate": 4.775e-07, |
| "loss": 0.0008, |
| "reward": 3.891066312789917, |
| "reward_std": 0.02610717061907053, |
| "rewards/answer_entity_reward": 0.9841079115867615, |
| "rewards/answer_wer_reward": 0.9078442752361298, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991140365600586, |
| "step": 419 |
| }, |
| { |
| "completion_length": 196.84375, |
| "epoch": 1.3424, |
| "grad_norm": 1.7055010795593262, |
| "kl": 0.100830078125, |
| "learning_rate": 4.7625e-07, |
| "loss": 0.001, |
| "reward": 3.899353504180908, |
| "reward_std": 0.022911718115210533, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9394311308860779, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9684451520442963, |
| "step": 420 |
| }, |
| { |
| "completion_length": 223.34375, |
| "epoch": 1.3456000000000001, |
| "grad_norm": 2.624370574951172, |
| "kl": 0.13427734375, |
| "learning_rate": 4.7499999999999995e-07, |
| "loss": 0.0013, |
| "reward": 3.8897502422332764, |
| "reward_std": 0.06373783992603421, |
| "rewards/answer_entity_reward": 0.9921875, |
| "rewards/answer_wer_reward": 0.9388905465602875, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9586721360683441, |
| "step": 421 |
| }, |
| { |
| "completion_length": 209.0, |
| "epoch": 1.3488, |
| "grad_norm": 2.2683520317077637, |
| "kl": 0.102294921875, |
| "learning_rate": 4.7374999999999996e-07, |
| "loss": 0.001, |
| "reward": 3.960143804550171, |
| "reward_std": 0.006363062420859933, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9613305628299713, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988133013248444, |
| "step": 422 |
| }, |
| { |
| "completion_length": 187.6875, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 1.426279067993164, |
| "kl": 0.130859375, |
| "learning_rate": 4.725e-07, |
| "loss": 0.0013, |
| "reward": 3.904189109802246, |
| "reward_std": 0.017666546627879143, |
| "rewards/answer_entity_reward": 0.9875437021255493, |
| "rewards/answer_wer_reward": 0.9480733275413513, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9685720801353455, |
| "step": 423 |
| }, |
| { |
| "completion_length": 227.96875, |
| "epoch": 1.3552, |
| "grad_norm": 2.3656458854675293, |
| "kl": 0.202880859375, |
| "learning_rate": 4.7125e-07, |
| "loss": 0.002, |
| "reward": 3.8170067071914673, |
| "reward_std": 0.15287955617532134, |
| "rewards/answer_entity_reward": 0.993697464466095, |
| "rewards/answer_wer_reward": 0.9000802934169769, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9544788599014282, |
| "step": 424 |
| }, |
| { |
| "completion_length": 260.375, |
| "epoch": 1.3584, |
| "grad_norm": 19.11045265197754, |
| "kl": 0.0771484375, |
| "learning_rate": 4.6999999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.9173004627227783, |
| "reward_std": 0.02492327243089676, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9262253046035767, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9955623149871826, |
| "step": 425 |
| }, |
| { |
| "completion_length": 244.0625, |
| "epoch": 1.3616, |
| "grad_norm": 3.7110118865966797, |
| "kl": 0.0650634765625, |
| "learning_rate": 4.6874999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.912764072418213, |
| "reward_std": 0.022814412601292133, |
| "rewards/answer_entity_reward": 0.9910714328289032, |
| "rewards/answer_wer_reward": 0.9220215976238251, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996710419654846, |
| "step": 426 |
| }, |
| { |
| "completion_length": 206.0625, |
| "epoch": 1.3648, |
| "grad_norm": 7.218249797821045, |
| "kl": 0.0869140625, |
| "learning_rate": 4.675e-07, |
| "loss": 0.0009, |
| "reward": 3.8915610313415527, |
| "reward_std": 0.020747858565300703, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.913354367017746, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9816789925098419, |
| "step": 427 |
| }, |
| { |
| "completion_length": 229.21875, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 6.419763088226318, |
| "kl": 0.078857421875, |
| "learning_rate": 4.6625e-07, |
| "loss": 0.0008, |
| "reward": 3.7964917421340942, |
| "reward_std": 0.03975658491253853, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9177364408969879, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8901189565658569, |
| "step": 428 |
| }, |
| { |
| "completion_length": 252.96875, |
| "epoch": 1.3712, |
| "grad_norm": 6.5345025062561035, |
| "kl": 0.0782470703125, |
| "learning_rate": 4.65e-07, |
| "loss": 0.0008, |
| "reward": 3.903268814086914, |
| "reward_std": 0.016737705329433084, |
| "rewards/answer_entity_reward": 0.9764957129955292, |
| "rewards/answer_wer_reward": 0.92976513504982, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970079958438873, |
| "step": 429 |
| }, |
| { |
| "completion_length": 240.15625, |
| "epoch": 1.3744, |
| "grad_norm": 2.109302043914795, |
| "kl": 0.078857421875, |
| "learning_rate": 4.6374999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.935005784034729, |
| "reward_std": 0.035214878618717194, |
| "rewards/answer_entity_reward": 0.9908459782600403, |
| "rewards/answer_wer_reward": 0.9483801424503326, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9957797825336456, |
| "step": 430 |
| }, |
| { |
| "completion_length": 204.21875, |
| "epoch": 1.3776, |
| "grad_norm": 2.1557323932647705, |
| "kl": 0.0986328125, |
| "learning_rate": 4.625e-07, |
| "loss": 0.001, |
| "reward": 3.895322561264038, |
| "reward_std": 0.00989355193451047, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9249120354652405, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9704104661941528, |
| "step": 431 |
| }, |
| { |
| "completion_length": 222.375, |
| "epoch": 1.3808, |
| "grad_norm": 1.1159002780914307, |
| "kl": 0.139892578125, |
| "learning_rate": 4.6125e-07, |
| "loss": 0.0014, |
| "reward": 3.909332513809204, |
| "reward_std": 0.02693999744951725, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9155605435371399, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985795617103577, |
| "step": 432 |
| }, |
| { |
| "completion_length": 203.8125, |
| "epoch": 1.384, |
| "grad_norm": 1.4166613817214966, |
| "kl": 0.1220703125, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0012, |
| "reward": 3.8192185163497925, |
| "reward_std": 0.20739353261888027, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.8859462738037109, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9673629999160767, |
| "step": 433 |
| }, |
| { |
| "completion_length": 253.25, |
| "epoch": 1.3872, |
| "grad_norm": 2.674269437789917, |
| "kl": 0.0657958984375, |
| "learning_rate": 4.5874999999999995e-07, |
| "loss": 0.0007, |
| "reward": 3.88591992855072, |
| "reward_std": 0.02829979732632637, |
| "rewards/answer_entity_reward": 0.9763771891593933, |
| "rewards/answer_wer_reward": 0.9098401963710785, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997023940086365, |
| "step": 434 |
| }, |
| { |
| "completion_length": 216.15625, |
| "epoch": 1.3904, |
| "grad_norm": 2.3317995071411133, |
| "kl": 0.1495361328125, |
| "learning_rate": 4.575e-07, |
| "loss": 0.0015, |
| "reward": 3.8120020627975464, |
| "reward_std": 0.0887885820120573, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9127777814865112, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9077470898628235, |
| "step": 435 |
| }, |
| { |
| "completion_length": 210.5625, |
| "epoch": 1.3936, |
| "grad_norm": 8.527549743652344, |
| "kl": 0.13818359375, |
| "learning_rate": 4.5624999999999997e-07, |
| "loss": 0.0014, |
| "reward": 3.802919387817383, |
| "reward_std": 0.015426071360707283, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9406470954418182, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.862272173166275, |
| "step": 436 |
| }, |
| { |
| "completion_length": 194.21875, |
| "epoch": 1.3968, |
| "grad_norm": 3.6950721740722656, |
| "kl": 0.098388671875, |
| "learning_rate": 4.55e-07, |
| "loss": 0.001, |
| "reward": 3.9119696617126465, |
| "reward_std": 0.025569402612745762, |
| "rewards/answer_entity_reward": 0.9852430522441864, |
| "rewards/answer_wer_reward": 0.9274449944496155, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992816150188446, |
| "step": 437 |
| }, |
| { |
| "completion_length": 218.0, |
| "epoch": 1.4, |
| "grad_norm": 2.0461039543151855, |
| "kl": 0.09130859375, |
| "learning_rate": 4.5374999999999994e-07, |
| "loss": 0.0009, |
| "reward": 3.9378126859664917, |
| "reward_std": 0.023795679211616516, |
| "rewards/answer_entity_reward": 0.9909090995788574, |
| "rewards/answer_wer_reward": 0.9559187889099121, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9909848570823669, |
| "step": 438 |
| }, |
| { |
| "completion_length": 166.21875, |
| "epoch": 1.4032, |
| "grad_norm": 6.606758117675781, |
| "kl": 0.099853515625, |
| "learning_rate": 4.525e-07, |
| "loss": 0.001, |
| "reward": 3.7634676694869995, |
| "reward_std": 0.12013816519174725, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9638259708881378, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8079750537872314, |
| "step": 439 |
| }, |
| { |
| "completion_length": 199.0625, |
| "epoch": 1.4064, |
| "grad_norm": 2.7103731632232666, |
| "kl": 0.107666015625, |
| "learning_rate": 4.5124999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.88293993473053, |
| "reward_std": 0.030841628089547157, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9417436718940735, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9411962330341339, |
| "step": 440 |
| }, |
| { |
| "completion_length": 228.5, |
| "epoch": 1.4096, |
| "grad_norm": 16.007980346679688, |
| "kl": 0.090576171875, |
| "learning_rate": 4.5e-07, |
| "loss": 0.0009, |
| "reward": 3.8373541831970215, |
| "reward_std": 0.07324423175305128, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.8474734723567963, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994959831237793, |
| "step": 441 |
| }, |
| { |
| "completion_length": 198.0, |
| "epoch": 1.4128, |
| "grad_norm": 1.5419743061065674, |
| "kl": 0.090576171875, |
| "learning_rate": 4.4874999999999994e-07, |
| "loss": 0.0009, |
| "reward": 3.9394867420196533, |
| "reward_std": 0.020834744907915592, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9435902535915375, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993686676025391, |
| "step": 442 |
| }, |
| { |
| "completion_length": 212.9375, |
| "epoch": 1.416, |
| "grad_norm": 2.2846686840057373, |
| "kl": 0.09375, |
| "learning_rate": 4.475e-07, |
| "loss": 0.0009, |
| "reward": 3.9014971256256104, |
| "reward_std": 0.05675862170755863, |
| "rewards/answer_entity_reward": 0.9917200803756714, |
| "rewards/answer_wer_reward": 0.9370100498199463, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9727669358253479, |
| "step": 443 |
| }, |
| { |
| "completion_length": 228.78125, |
| "epoch": 1.4192, |
| "grad_norm": 1.8499493598937988, |
| "kl": 0.0723876953125, |
| "learning_rate": 4.4624999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.967541456222534, |
| "reward_std": 0.005963538307696581, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9685240089893341, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990174472332001, |
| "step": 444 |
| }, |
| { |
| "completion_length": 224.09375, |
| "epoch": 1.4224, |
| "grad_norm": 5.091113567352295, |
| "kl": 0.0704345703125, |
| "learning_rate": 4.45e-07, |
| "loss": 0.0007, |
| "reward": 3.886088252067566, |
| "reward_std": 0.04133851733058691, |
| "rewards/answer_entity_reward": 0.9886092245578766, |
| "rewards/answer_wer_reward": 0.9384825825691223, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9589964747428894, |
| "step": 445 |
| }, |
| { |
| "completion_length": 228.34375, |
| "epoch": 1.4256, |
| "grad_norm": 57.5860710144043, |
| "kl": 0.142822265625, |
| "learning_rate": 4.4374999999999993e-07, |
| "loss": 0.0014, |
| "reward": 3.9096713066101074, |
| "reward_std": 0.01603887975215912, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9126511812210083, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988585412502289, |
| "step": 446 |
| }, |
| { |
| "completion_length": 202.34375, |
| "epoch": 1.4288, |
| "grad_norm": 2.211174964904785, |
| "kl": 0.056640625, |
| "learning_rate": 4.425e-07, |
| "loss": 0.0006, |
| "reward": 3.893475890159607, |
| "reward_std": 0.046710459515452385, |
| "rewards/answer_entity_reward": 0.9659091234207153, |
| "rewards/answer_wer_reward": 0.9309596717357635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9966072142124176, |
| "step": 447 |
| }, |
| { |
| "completion_length": 186.375, |
| "epoch": 1.432, |
| "grad_norm": 3.053344249725342, |
| "kl": 0.07666015625, |
| "learning_rate": 4.4124999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.941947340965271, |
| "reward_std": 0.009567510336637497, |
| "rewards/answer_entity_reward": 0.9883012771606445, |
| "rewards/answer_wer_reward": 0.964864045381546, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9887820482254028, |
| "step": 448 |
| }, |
| { |
| "completion_length": 170.65625, |
| "epoch": 1.4352, |
| "grad_norm": 2.5118942260742188, |
| "kl": 0.086181640625, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.8258646726608276, |
| "reward_std": 0.011685115285217762, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8330351114273071, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9928297400474548, |
| "step": 449 |
| }, |
| { |
| "completion_length": 199.8125, |
| "epoch": 1.4384000000000001, |
| "grad_norm": 3.3471686840057373, |
| "kl": 0.12109375, |
| "learning_rate": 4.3874999999999993e-07, |
| "loss": 0.0012, |
| "reward": 3.768259644508362, |
| "reward_std": 0.061878617852926254, |
| "rewards/answer_entity_reward": 0.9866071343421936, |
| "rewards/answer_wer_reward": 0.8001611828804016, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9814914762973785, |
| "step": 450 |
| }, |
| { |
| "completion_length": 179.625, |
| "epoch": 1.4416, |
| "grad_norm": 6.58098840713501, |
| "kl": 0.13671875, |
| "learning_rate": 4.375e-07, |
| "loss": 0.0014, |
| "reward": 3.9213969707489014, |
| "reward_std": 0.007897446397691965, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.954677164554596, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.966719776391983, |
| "step": 451 |
| }, |
| { |
| "completion_length": 201.53125, |
| "epoch": 1.4447999999999999, |
| "grad_norm": 2.6606149673461914, |
| "kl": 0.07275390625, |
| "learning_rate": 4.3625e-07, |
| "loss": 0.0007, |
| "reward": 3.930065631866455, |
| "reward_std": 0.016306706704199314, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9411978721618652, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9888678193092346, |
| "step": 452 |
| }, |
| { |
| "completion_length": 218.71875, |
| "epoch": 1.448, |
| "grad_norm": 2.720804452896118, |
| "kl": 0.068115234375, |
| "learning_rate": 4.3499999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.9184677600860596, |
| "reward_std": 0.018319842871278524, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9235129654407501, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994419813156128, |
| "step": 453 |
| }, |
| { |
| "completion_length": 207.65625, |
| "epoch": 1.4512, |
| "grad_norm": 3.4664785861968994, |
| "kl": 0.153564453125, |
| "learning_rate": 4.3375000000000003e-07, |
| "loss": 0.0015, |
| "reward": 3.9119069576263428, |
| "reward_std": 0.017484096810221672, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9505945444107056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9613124430179596, |
| "step": 454 |
| }, |
| { |
| "completion_length": 212.125, |
| "epoch": 1.4544000000000001, |
| "grad_norm": 1.4592719078063965, |
| "kl": 0.081298828125, |
| "learning_rate": 4.325e-07, |
| "loss": 0.0008, |
| "reward": 3.9381325244903564, |
| "reward_std": 0.015622157603502274, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.94671231508255, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938240349292755, |
| "step": 455 |
| }, |
| { |
| "completion_length": 237.8125, |
| "epoch": 1.4576, |
| "grad_norm": 1.2292534112930298, |
| "kl": 0.089111328125, |
| "learning_rate": 4.3125e-07, |
| "loss": 0.0009, |
| "reward": 3.940074920654297, |
| "reward_std": 0.013516389299184084, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.943013072013855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970619082450867, |
| "step": 456 |
| }, |
| { |
| "completion_length": 206.90625, |
| "epoch": 1.4607999999999999, |
| "grad_norm": 2.4139420986175537, |
| "kl": 0.08837890625, |
| "learning_rate": 4.2999999999999996e-07, |
| "loss": 0.0009, |
| "reward": 3.9423701763153076, |
| "reward_std": 0.017034863587468863, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.942692369222641, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999677836894989, |
| "step": 457 |
| }, |
| { |
| "completion_length": 200.21875, |
| "epoch": 1.464, |
| "grad_norm": 1.0297181606292725, |
| "kl": 0.11083984375, |
| "learning_rate": 4.2875e-07, |
| "loss": 0.0011, |
| "reward": 3.9459547996520996, |
| "reward_std": 0.014651869423687458, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9467397332191467, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999215304851532, |
| "step": 458 |
| }, |
| { |
| "completion_length": 219.75, |
| "epoch": 1.4672, |
| "grad_norm": 1.3148033618927002, |
| "kl": 0.10546875, |
| "learning_rate": 4.275e-07, |
| "loss": 0.0011, |
| "reward": 3.9567900896072388, |
| "reward_std": 0.011340227210894227, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9586590230464935, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99813112616539, |
| "step": 459 |
| }, |
| { |
| "completion_length": 181.0625, |
| "epoch": 1.4704, |
| "grad_norm": 2.4274115562438965, |
| "kl": 0.09814453125, |
| "learning_rate": 4.2625e-07, |
| "loss": 0.001, |
| "reward": 3.9310171604156494, |
| "reward_std": 0.017811311408877373, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9543968439102173, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9766204059123993, |
| "step": 460 |
| }, |
| { |
| "completion_length": 205.625, |
| "epoch": 1.4736, |
| "grad_norm": 2.885746717453003, |
| "kl": 0.12548828125, |
| "learning_rate": 4.2499999999999995e-07, |
| "loss": 0.0013, |
| "reward": 3.878596305847168, |
| "reward_std": 0.02481621317565441, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9378580451011658, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9464203119277954, |
| "step": 461 |
| }, |
| { |
| "completion_length": 205.4375, |
| "epoch": 1.4768, |
| "grad_norm": 2.366044521331787, |
| "kl": 0.10595703125, |
| "learning_rate": 4.2375e-07, |
| "loss": 0.0011, |
| "reward": 3.940233826637268, |
| "reward_std": 0.013500516302883625, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9461761116981506, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997395873069763, |
| "step": 462 |
| }, |
| { |
| "completion_length": 229.46875, |
| "epoch": 1.48, |
| "grad_norm": 2.4469070434570312, |
| "kl": 0.078369140625, |
| "learning_rate": 4.225e-07, |
| "loss": 0.0008, |
| "reward": 3.92271089553833, |
| "reward_std": 0.022854273673146963, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9247944056987762, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979166686534882, |
| "step": 463 |
| }, |
| { |
| "completion_length": 225.96875, |
| "epoch": 1.4832, |
| "grad_norm": 11.768393516540527, |
| "kl": 0.1123046875, |
| "learning_rate": 4.2125e-07, |
| "loss": 0.0011, |
| "reward": 3.9518144130706787, |
| "reward_std": 0.010446197353303432, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9525662660598755, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992480874061584, |
| "step": 464 |
| }, |
| { |
| "completion_length": 149.0, |
| "epoch": 1.4864, |
| "grad_norm": 6.672958850860596, |
| "kl": 0.185791015625, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.0019, |
| "reward": 3.944068431854248, |
| "reward_std": 0.02685389667749405, |
| "rewards/answer_entity_reward": 0.9774305522441864, |
| "rewards/answer_wer_reward": 0.9714455008506775, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.995192289352417, |
| "step": 465 |
| }, |
| { |
| "completion_length": 249.625, |
| "epoch": 1.4896, |
| "grad_norm": 1.7048887014389038, |
| "kl": 0.10986328125, |
| "learning_rate": 4.1875e-07, |
| "loss": 0.0011, |
| "reward": 3.902083158493042, |
| "reward_std": 0.011234605684876442, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9024596214294434, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996234774589539, |
| "step": 466 |
| }, |
| { |
| "completion_length": 181.8125, |
| "epoch": 1.4928, |
| "grad_norm": 2.429704189300537, |
| "kl": 0.112548828125, |
| "learning_rate": 4.1749999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.9163752794265747, |
| "reward_std": 0.014369658660143614, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9364789724349976, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9798963963985443, |
| "step": 467 |
| }, |
| { |
| "completion_length": 201.5625, |
| "epoch": 1.496, |
| "grad_norm": 3.5214920043945312, |
| "kl": 0.09912109375, |
| "learning_rate": 4.1625e-07, |
| "loss": 0.001, |
| "reward": 3.9279046058654785, |
| "reward_std": 0.016232089139521122, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9468095898628235, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9827397167682648, |
| "step": 468 |
| }, |
| { |
| "completion_length": 180.09375, |
| "epoch": 1.4992, |
| "grad_norm": 2.471404790878296, |
| "kl": 0.10693359375, |
| "learning_rate": 4.1499999999999994e-07, |
| "loss": 0.0011, |
| "reward": 3.860212564468384, |
| "reward_std": 0.02879812940955162, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9390608966350555, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9296742975711823, |
| "step": 469 |
| }, |
| { |
| "completion_length": 208.1875, |
| "epoch": 1.5024, |
| "grad_norm": 0.9673317074775696, |
| "kl": 0.104248046875, |
| "learning_rate": 4.1375e-07, |
| "loss": 0.001, |
| "reward": 3.944485664367676, |
| "reward_std": 0.01182422018609941, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9444854557514191, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 194.0625, |
| "epoch": 1.5056, |
| "grad_norm": 1.0823942422866821, |
| "kl": 0.096435546875, |
| "learning_rate": 4.1249999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.9105581045150757, |
| "reward_std": 0.015555873978883028, |
| "rewards/answer_entity_reward": 0.9869916439056396, |
| "rewards/answer_wer_reward": 0.9253619015216827, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982046484947205, |
| "step": 471 |
| }, |
| { |
| "completion_length": 221.5625, |
| "epoch": 1.5088, |
| "grad_norm": 4.074758052825928, |
| "kl": 0.077880859375, |
| "learning_rate": 4.1125e-07, |
| "loss": 0.0008, |
| "reward": 3.933529496192932, |
| "reward_std": 0.019466498168185353, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9344994425773621, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990300834178925, |
| "step": 472 |
| }, |
| { |
| "completion_length": 167.3125, |
| "epoch": 1.512, |
| "grad_norm": 2.003244400024414, |
| "kl": 0.10888671875, |
| "learning_rate": 4.0999999999999994e-07, |
| "loss": 0.0011, |
| "reward": 3.927189588546753, |
| "reward_std": 0.00937123317271471, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9625618755817413, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9646276533603668, |
| "step": 473 |
| }, |
| { |
| "completion_length": 253.59375, |
| "epoch": 1.5152, |
| "grad_norm": 1.7125921249389648, |
| "kl": 0.1005859375, |
| "learning_rate": 4.0875e-07, |
| "loss": 0.001, |
| "reward": 3.9120967388153076, |
| "reward_std": 0.020000137854367495, |
| "rewards/answer_entity_reward": 0.9910256266593933, |
| "rewards/answer_wer_reward": 0.9225968718528748, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984740912914276, |
| "step": 474 |
| }, |
| { |
| "completion_length": 168.5625, |
| "epoch": 1.5184, |
| "grad_norm": 2.8377087116241455, |
| "kl": 0.176513671875, |
| "learning_rate": 4.0749999999999996e-07, |
| "loss": 0.0018, |
| "reward": 3.8111839294433594, |
| "reward_std": 0.015397761948406696, |
| "rewards/answer_entity_reward": 0.9882352948188782, |
| "rewards/answer_wer_reward": 0.9508877992630005, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8720609545707703, |
| "step": 475 |
| }, |
| { |
| "completion_length": 180.0625, |
| "epoch": 1.5215999999999998, |
| "grad_norm": 1.8417869806289673, |
| "kl": 0.09814453125, |
| "learning_rate": 4.0625e-07, |
| "loss": 0.001, |
| "reward": 3.9401214122772217, |
| "reward_std": 0.017564786598086357, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.95186448097229, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9924235343933105, |
| "step": 476 |
| }, |
| { |
| "completion_length": 259.5625, |
| "epoch": 1.5248, |
| "grad_norm": 3.1482410430908203, |
| "kl": 0.065673828125, |
| "learning_rate": 4.05e-07, |
| "loss": 0.0007, |
| "reward": 3.8720295429229736, |
| "reward_std": 0.05017535015940666, |
| "rewards/answer_entity_reward": 0.9819904267787933, |
| "rewards/answer_wer_reward": 0.9132304787635803, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9768086373806, |
| "step": 477 |
| }, |
| { |
| "completion_length": 224.25, |
| "epoch": 1.528, |
| "grad_norm": 1.309258222579956, |
| "kl": 0.151123046875, |
| "learning_rate": 4.0375e-07, |
| "loss": 0.0015, |
| "reward": 3.9491400718688965, |
| "reward_std": 0.015128562692552805, |
| "rewards/answer_entity_reward": 0.9905637204647064, |
| "rewards/answer_wer_reward": 0.9590685665607452, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999507874250412, |
| "step": 478 |
| }, |
| { |
| "completion_length": 195.03125, |
| "epoch": 1.5312000000000001, |
| "grad_norm": 2.627673864364624, |
| "kl": 0.10205078125, |
| "learning_rate": 4.025e-07, |
| "loss": 0.001, |
| "reward": 3.8748838901519775, |
| "reward_std": 0.03435908444225788, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.931220144033432, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9471359848976135, |
| "step": 479 |
| }, |
| { |
| "completion_length": 214.09375, |
| "epoch": 1.5344, |
| "grad_norm": 1.328961730003357, |
| "kl": 0.09375, |
| "learning_rate": 4.0124999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.895302414894104, |
| "reward_std": 0.05907848384231329, |
| "rewards/answer_entity_reward": 0.9810912609100342, |
| "rewards/answer_wer_reward": 0.9161643385887146, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998046875, |
| "step": 480 |
| }, |
| { |
| "completion_length": 238.34375, |
| "epoch": 1.5375999999999999, |
| "grad_norm": 1.2219247817993164, |
| "kl": 0.073486328125, |
| "learning_rate": 4e-07, |
| "loss": 0.0007, |
| "reward": 3.9036080837249756, |
| "reward_std": 0.039926802739501, |
| "rewards/answer_entity_reward": 0.9823717474937439, |
| "rewards/answer_wer_reward": 0.9258527159690857, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9953835308551788, |
| "step": 481 |
| }, |
| { |
| "completion_length": 197.90625, |
| "epoch": 1.5408, |
| "grad_norm": 1.6363537311553955, |
| "kl": 0.14306640625, |
| "learning_rate": 3.9875e-07, |
| "loss": 0.0014, |
| "reward": 3.948188543319702, |
| "reward_std": 0.010867676697671413, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9528435170650482, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99951171875, |
| "step": 482 |
| }, |
| { |
| "completion_length": 248.84375, |
| "epoch": 1.544, |
| "grad_norm": 1.3250434398651123, |
| "kl": 0.07421875, |
| "learning_rate": 3.975e-07, |
| "loss": 0.0007, |
| "reward": 3.9251530170440674, |
| "reward_std": 0.011389322113245726, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9254424273967743, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997106492519379, |
| "step": 483 |
| }, |
| { |
| "completion_length": 183.03125, |
| "epoch": 1.5472000000000001, |
| "grad_norm": 1.3042057752609253, |
| "kl": 0.10107421875, |
| "learning_rate": 3.9624999999999996e-07, |
| "loss": 0.001, |
| "reward": 3.8853487968444824, |
| "reward_std": 0.06827400880865753, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9366248250007629, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9487238228321075, |
| "step": 484 |
| }, |
| { |
| "completion_length": 192.0625, |
| "epoch": 1.5504, |
| "grad_norm": 2.232529640197754, |
| "kl": 0.11279296875, |
| "learning_rate": 3.95e-07, |
| "loss": 0.0011, |
| "reward": 3.9339704513549805, |
| "reward_std": 0.011435477063059807, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.935157060623169, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988133013248444, |
| "step": 485 |
| }, |
| { |
| "completion_length": 237.28125, |
| "epoch": 1.5535999999999999, |
| "grad_norm": 1.1462312936782837, |
| "kl": 0.098876953125, |
| "learning_rate": 3.9375e-07, |
| "loss": 0.001, |
| "reward": 3.953871250152588, |
| "reward_std": 0.007947361096739769, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9546429216861725, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992283880710602, |
| "step": 486 |
| }, |
| { |
| "completion_length": 167.8125, |
| "epoch": 1.5568, |
| "grad_norm": 2.324936628341675, |
| "kl": 0.1357421875, |
| "learning_rate": 3.925e-07, |
| "loss": 0.0014, |
| "reward": 3.858751654624939, |
| "reward_std": 0.14167471043765545, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9401543736457825, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9567916989326477, |
| "step": 487 |
| }, |
| { |
| "completion_length": 245.09375, |
| "epoch": 1.56, |
| "grad_norm": 4.512195587158203, |
| "kl": 0.076171875, |
| "learning_rate": 3.9124999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.920499563217163, |
| "reward_std": 0.03615456819534302, |
| "rewards/answer_entity_reward": 0.9908459782600403, |
| "rewards/answer_wer_reward": 0.9309035241603851, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987500607967377, |
| "step": 488 |
| }, |
| { |
| "completion_length": 227.84375, |
| "epoch": 1.5632000000000001, |
| "grad_norm": 10.537569046020508, |
| "kl": 0.0859375, |
| "learning_rate": 3.8999999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.9345154762268066, |
| "reward_std": 0.0299052600748837, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9527814090251923, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9817341864109039, |
| "step": 489 |
| }, |
| { |
| "completion_length": 228.78125, |
| "epoch": 1.5664, |
| "grad_norm": 1.635452151298523, |
| "kl": 0.125, |
| "learning_rate": 3.8875e-07, |
| "loss": 0.0013, |
| "reward": 3.944974184036255, |
| "reward_std": 0.019456470385193825, |
| "rewards/answer_entity_reward": 0.9919143319129944, |
| "rewards/answer_wer_reward": 0.9538533091545105, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99920654296875, |
| "step": 490 |
| }, |
| { |
| "completion_length": 146.78125, |
| "epoch": 1.5695999999999999, |
| "grad_norm": 3.557502031326294, |
| "kl": 0.256103515625, |
| "learning_rate": 3.875e-07, |
| "loss": 0.0026, |
| "reward": 3.865471601486206, |
| "reward_std": 0.05972531996667385, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.972651481628418, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8928201496601105, |
| "step": 491 |
| }, |
| { |
| "completion_length": 207.28125, |
| "epoch": 1.5728, |
| "grad_norm": 1.0813632011413574, |
| "kl": 0.11767578125, |
| "learning_rate": 3.8624999999999995e-07, |
| "loss": 0.0012, |
| "reward": 3.91045343875885, |
| "reward_std": 0.01970634702593088, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9153560400009155, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979382753372192, |
| "step": 492 |
| }, |
| { |
| "completion_length": 219.4375, |
| "epoch": 1.576, |
| "grad_norm": 9.319220542907715, |
| "kl": 0.11181640625, |
| "learning_rate": 3.8499999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.876826047897339, |
| "reward_std": 0.02829575538635254, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9400706589221954, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9436996281147003, |
| "step": 493 |
| }, |
| { |
| "completion_length": 205.34375, |
| "epoch": 1.5792000000000002, |
| "grad_norm": 1.0891739130020142, |
| "kl": 0.091796875, |
| "learning_rate": 3.8375e-07, |
| "loss": 0.0009, |
| "reward": 3.9375252723693848, |
| "reward_std": 0.01566324196755886, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9394660592079163, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980591833591461, |
| "step": 494 |
| }, |
| { |
| "completion_length": 247.4375, |
| "epoch": 1.5824, |
| "grad_norm": 1.313225507736206, |
| "kl": 0.120361328125, |
| "learning_rate": 3.825e-07, |
| "loss": 0.0012, |
| "reward": 3.9278939962387085, |
| "reward_std": 0.01758108288049698, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9288604855537415, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999033510684967, |
| "step": 495 |
| }, |
| { |
| "completion_length": 190.28125, |
| "epoch": 1.5856, |
| "grad_norm": 9.440873146057129, |
| "kl": 0.19677734375, |
| "learning_rate": 3.8124999999999995e-07, |
| "loss": 0.002, |
| "reward": 3.7597657442092896, |
| "reward_std": 0.05097449291497469, |
| "rewards/answer_entity_reward": 0.9879385828971863, |
| "rewards/answer_wer_reward": 0.9360098242759705, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8358173370361328, |
| "step": 496 |
| }, |
| { |
| "completion_length": 165.09375, |
| "epoch": 1.5888, |
| "grad_norm": 3.4180030822753906, |
| "kl": 0.111572265625, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0011, |
| "reward": 3.87969434261322, |
| "reward_std": 0.058905988931655884, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9649176299571991, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9147767424583435, |
| "step": 497 |
| }, |
| { |
| "completion_length": 188.96875, |
| "epoch": 1.592, |
| "grad_norm": 5.278741359710693, |
| "kl": 0.072998046875, |
| "learning_rate": 3.7875e-07, |
| "loss": 0.0007, |
| "reward": 3.9229685068130493, |
| "reward_std": 0.03557159844785929, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9312180578708649, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938337206840515, |
| "step": 498 |
| }, |
| { |
| "completion_length": 223.125, |
| "epoch": 1.5952, |
| "grad_norm": 1.8821698427200317, |
| "kl": 0.100341796875, |
| "learning_rate": 3.775e-07, |
| "loss": 0.001, |
| "reward": 3.85340416431427, |
| "reward_std": 0.1366682257503271, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9250198900699615, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9596343636512756, |
| "step": 499 |
| }, |
| { |
| "completion_length": 206.125, |
| "epoch": 1.5984, |
| "grad_norm": 2.528049945831299, |
| "kl": 0.089111328125, |
| "learning_rate": 3.7624999999999994e-07, |
| "loss": 0.0009, |
| "reward": 3.9384653568267822, |
| "reward_std": 0.011245439760386944, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9415569603443146, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985532462596893, |
| "step": 500 |
| }, |
| { |
| "completion_length": 229.46875, |
| "epoch": 1.6016, |
| "grad_norm": 1.4024198055267334, |
| "kl": 0.082763671875, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0008, |
| "reward": 3.9224425554275513, |
| "reward_std": 0.012164951767772436, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9341387450695038, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996675550937653, |
| "step": 501 |
| }, |
| { |
| "completion_length": 183.09375, |
| "epoch": 1.6048, |
| "grad_norm": 2.642270088195801, |
| "kl": 0.0888671875, |
| "learning_rate": 3.7375e-07, |
| "loss": 0.0009, |
| "reward": 3.872815251350403, |
| "reward_std": 0.04407367669045925, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.953230619430542, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9243922829627991, |
| "step": 502 |
| }, |
| { |
| "completion_length": 242.8125, |
| "epoch": 1.608, |
| "grad_norm": 3.0733675956726074, |
| "kl": 0.1044921875, |
| "learning_rate": 3.725e-07, |
| "loss": 0.001, |
| "reward": 3.9133812189102173, |
| "reward_std": 0.017343452665954828, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9236075580120087, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982964098453522, |
| "step": 503 |
| }, |
| { |
| "completion_length": 234.34375, |
| "epoch": 1.6112, |
| "grad_norm": 1.4146682024002075, |
| "kl": 0.1064453125, |
| "learning_rate": 3.7125e-07, |
| "loss": 0.0011, |
| "reward": 3.941379427909851, |
| "reward_std": 0.011062228586524725, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9433701932430267, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980092346668243, |
| "step": 504 |
| }, |
| { |
| "completion_length": 252.84375, |
| "epoch": 1.6143999999999998, |
| "grad_norm": 1.9019030332565308, |
| "kl": 0.101318359375, |
| "learning_rate": 3.7e-07, |
| "loss": 0.001, |
| "reward": 3.87961208820343, |
| "reward_std": 0.02180068287998438, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8812887370586395, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983234107494354, |
| "step": 505 |
| }, |
| { |
| "completion_length": 184.34375, |
| "epoch": 1.6176, |
| "grad_norm": 3.1965742111206055, |
| "kl": 0.114501953125, |
| "learning_rate": 3.6875e-07, |
| "loss": 0.0011, |
| "reward": 3.650223731994629, |
| "reward_std": 0.16780234314501286, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9241631031036377, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.7573105096817017, |
| "step": 506 |
| }, |
| { |
| "completion_length": 224.1875, |
| "epoch": 1.6208, |
| "grad_norm": 1.8885560035705566, |
| "kl": 0.115966796875, |
| "learning_rate": 3.675e-07, |
| "loss": 0.0012, |
| "reward": 3.9122270345687866, |
| "reward_std": 0.04261860717087984, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9476769864559174, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9697948098182678, |
| "step": 507 |
| }, |
| { |
| "completion_length": 220.25, |
| "epoch": 1.624, |
| "grad_norm": 3.2499520778656006, |
| "kl": 0.1044921875, |
| "learning_rate": 3.6625e-07, |
| "loss": 0.0011, |
| "reward": 3.927606225013733, |
| "reward_std": 0.023842450696974993, |
| "rewards/answer_entity_reward": 0.9879376590251923, |
| "rewards/answer_wer_reward": 0.9439153373241425, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9957531988620758, |
| "step": 508 |
| }, |
| { |
| "completion_length": 227.96875, |
| "epoch": 1.6272, |
| "grad_norm": 2.6528868675231934, |
| "kl": 0.087890625, |
| "learning_rate": 3.65e-07, |
| "loss": 0.0009, |
| "reward": 3.928350806236267, |
| "reward_std": 0.01731124660000205, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9420913755893707, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9976231157779694, |
| "step": 509 |
| }, |
| { |
| "completion_length": 243.5, |
| "epoch": 1.6303999999999998, |
| "grad_norm": 1.618895411491394, |
| "kl": 0.09814453125, |
| "learning_rate": 3.6375e-07, |
| "loss": 0.001, |
| "reward": 3.9476526975631714, |
| "reward_std": 0.011007866356521845, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9514043629169464, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962483048439026, |
| "step": 510 |
| }, |
| { |
| "completion_length": 263.4375, |
| "epoch": 1.6336, |
| "grad_norm": 2.8576741218566895, |
| "kl": 0.104736328125, |
| "learning_rate": 3.6249999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.9101955890655518, |
| "reward_std": 0.01921992190182209, |
| "rewards/answer_entity_reward": 0.9935776889324188, |
| "rewards/answer_wer_reward": 0.9185610413551331, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980569183826447, |
| "step": 511 |
| }, |
| { |
| "completion_length": 184.5625, |
| "epoch": 1.6368, |
| "grad_norm": 6.8555908203125, |
| "kl": 0.113525390625, |
| "learning_rate": 3.6125e-07, |
| "loss": 0.0011, |
| "reward": 3.8663313388824463, |
| "reward_std": 0.10157291498035192, |
| "rewards/answer_entity_reward": 0.9823717772960663, |
| "rewards/answer_wer_reward": 0.9564132988452911, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9275462925434113, |
| "step": 512 |
| }, |
| { |
| "completion_length": 218.25, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 1.9482468366622925, |
| "kl": 0.091064453125, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0009, |
| "reward": 3.8723970651626587, |
| "reward_std": 0.07238492835313082, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9418750703334808, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9362037181854248, |
| "step": 513 |
| }, |
| { |
| "completion_length": 240.90625, |
| "epoch": 1.6432, |
| "grad_norm": 1.2296831607818604, |
| "kl": 0.079345703125, |
| "learning_rate": 3.5875e-07, |
| "loss": 0.0008, |
| "reward": 3.9039018154144287, |
| "reward_std": 0.09914317354559898, |
| "rewards/answer_entity_reward": 0.984375, |
| "rewards/answer_wer_reward": 0.9198593199253082, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996675550937653, |
| "step": 514 |
| }, |
| { |
| "completion_length": 234.4375, |
| "epoch": 1.6463999999999999, |
| "grad_norm": 1.4495328664779663, |
| "kl": 0.1455078125, |
| "learning_rate": 3.5749999999999997e-07, |
| "loss": 0.0015, |
| "reward": 3.9163358211517334, |
| "reward_std": 0.013342800550162792, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9182494282722473, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980863332748413, |
| "step": 515 |
| }, |
| { |
| "completion_length": 251.5, |
| "epoch": 1.6496, |
| "grad_norm": 1.5324357748031616, |
| "kl": 0.085693359375, |
| "learning_rate": 3.5625e-07, |
| "loss": 0.0009, |
| "reward": 3.8444111347198486, |
| "reward_std": 0.19724943954497576, |
| "rewards/answer_entity_reward": 0.9548611044883728, |
| "rewards/answer_wer_reward": 0.9235136210918427, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9972864091396332, |
| "step": 516 |
| }, |
| { |
| "completion_length": 242.40625, |
| "epoch": 1.6528, |
| "grad_norm": 33.44215774536133, |
| "kl": 0.107177734375, |
| "learning_rate": 3.55e-07, |
| "loss": 0.0011, |
| "reward": 3.8674838542938232, |
| "reward_std": 0.024256199598312378, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.931198239326477, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9362856149673462, |
| "step": 517 |
| }, |
| { |
| "completion_length": 222.375, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 2.1098077297210693, |
| "kl": 0.119140625, |
| "learning_rate": 3.5375e-07, |
| "loss": 0.0012, |
| "reward": 3.916640877723694, |
| "reward_std": 0.012934736907482147, |
| "rewards/answer_entity_reward": 0.9841346144676208, |
| "rewards/answer_wer_reward": 0.9368312060832977, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99567511677742, |
| "step": 518 |
| }, |
| { |
| "completion_length": 187.09375, |
| "epoch": 1.6592, |
| "grad_norm": 5.296720504760742, |
| "kl": 0.1220703125, |
| "learning_rate": 3.5249999999999996e-07, |
| "loss": 0.0012, |
| "reward": 3.9440935850143433, |
| "reward_std": 0.02182569820433855, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9536486864089966, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9939171075820923, |
| "step": 519 |
| }, |
| { |
| "completion_length": 199.1875, |
| "epoch": 1.6623999999999999, |
| "grad_norm": 2.8992345333099365, |
| "kl": 0.1083984375, |
| "learning_rate": 3.5124999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.868250846862793, |
| "reward_std": 0.01035462855361402, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.952102780342102, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9196203351020813, |
| "step": 520 |
| }, |
| { |
| "completion_length": 201.03125, |
| "epoch": 1.6656, |
| "grad_norm": 2.3841094970703125, |
| "kl": 0.176025390625, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0018, |
| "reward": 3.8405520915985107, |
| "reward_std": 0.020799917168915272, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9267003536224365, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.913851797580719, |
| "step": 521 |
| }, |
| { |
| "completion_length": 212.84375, |
| "epoch": 1.6688, |
| "grad_norm": 2.3912744522094727, |
| "kl": 0.126953125, |
| "learning_rate": 3.4875e-07, |
| "loss": 0.0013, |
| "reward": 3.894093632698059, |
| "reward_std": 0.027726877480745316, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.935745120048523, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9697121679782867, |
| "step": 522 |
| }, |
| { |
| "completion_length": 235.875, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 3.050795078277588, |
| "kl": 0.109130859375, |
| "learning_rate": 3.4749999999999996e-07, |
| "loss": 0.0011, |
| "reward": 3.8923540115356445, |
| "reward_std": 0.01905027125030756, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9099950790405273, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9851997494697571, |
| "step": 523 |
| }, |
| { |
| "completion_length": 228.25, |
| "epoch": 1.6752, |
| "grad_norm": 1.20732843875885, |
| "kl": 0.09375, |
| "learning_rate": 3.4624999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.936145067214966, |
| "reward_std": 0.009886496467515826, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9416597485542297, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 524 |
| }, |
| { |
| "completion_length": 225.78125, |
| "epoch": 1.6784, |
| "grad_norm": 8.249052047729492, |
| "kl": 0.09326171875, |
| "learning_rate": 3.45e-07, |
| "loss": 0.0009, |
| "reward": 3.922656536102295, |
| "reward_std": 0.030036092270165682, |
| "rewards/answer_entity_reward": 0.9934523701667786, |
| "rewards/answer_wer_reward": 0.9322790205478668, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969250857830048, |
| "step": 525 |
| }, |
| { |
| "completion_length": 181.78125, |
| "epoch": 1.6816, |
| "grad_norm": 3.0338377952575684, |
| "kl": 0.42138671875, |
| "learning_rate": 3.4375e-07, |
| "loss": 0.0042, |
| "reward": 3.9170188903808594, |
| "reward_std": 0.02494343649595976, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9231892824172974, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962334036827087, |
| "step": 526 |
| }, |
| { |
| "completion_length": 197.0625, |
| "epoch": 1.6848, |
| "grad_norm": 1.7836970090866089, |
| "kl": 0.2099609375, |
| "learning_rate": 3.425e-07, |
| "loss": 0.0021, |
| "reward": 3.9194570779800415, |
| "reward_std": 0.03800513781607151, |
| "rewards/answer_entity_reward": 0.9902680516242981, |
| "rewards/answer_wer_reward": 0.9317581951618195, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974307715892792, |
| "step": 527 |
| }, |
| { |
| "completion_length": 210.59375, |
| "epoch": 1.688, |
| "grad_norm": 2.595771074295044, |
| "kl": 0.1103515625, |
| "learning_rate": 3.4124999999999996e-07, |
| "loss": 0.0011, |
| "reward": 3.8902900218963623, |
| "reward_std": 0.03382246592082083, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.959803968667984, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9328898191452026, |
| "step": 528 |
| }, |
| { |
| "completion_length": 220.0, |
| "epoch": 1.6912, |
| "grad_norm": 1.7138639688491821, |
| "kl": 0.1044921875, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.001, |
| "reward": 3.9327096939086914, |
| "reward_std": 0.02261860202997923, |
| "rewards/answer_entity_reward": 0.9938696324825287, |
| "rewards/answer_wer_reward": 0.9420961737632751, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9967438876628876, |
| "step": 529 |
| }, |
| { |
| "completion_length": 196.75, |
| "epoch": 1.6944, |
| "grad_norm": 11.008087158203125, |
| "kl": 0.25732421875, |
| "learning_rate": 3.3875e-07, |
| "loss": 0.0026, |
| "reward": 3.9551256895065308, |
| "reward_std": 0.013849829090759158, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9695225656032562, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9884441494941711, |
| "step": 530 |
| }, |
| { |
| "completion_length": 206.25, |
| "epoch": 1.6976, |
| "grad_norm": 3.295365810394287, |
| "kl": 0.17822265625, |
| "learning_rate": 3.375e-07, |
| "loss": 0.0018, |
| "reward": 3.8593257665634155, |
| "reward_std": 0.03199449460953474, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9447747468948364, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9249675273895264, |
| "step": 531 |
| }, |
| { |
| "completion_length": 202.9375, |
| "epoch": 1.7008, |
| "grad_norm": 1.3525906801223755, |
| "kl": 0.1484375, |
| "learning_rate": 3.3624999999999996e-07, |
| "loss": 0.0015, |
| "reward": 3.9375537633895874, |
| "reward_std": 0.017243665643036366, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9407197833061218, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992378056049347, |
| "step": 532 |
| }, |
| { |
| "completion_length": 243.09375, |
| "epoch": 1.704, |
| "grad_norm": 3.5387661457061768, |
| "kl": 0.074951171875, |
| "learning_rate": 3.35e-07, |
| "loss": 0.0007, |
| "reward": 3.907800793647766, |
| "reward_std": 0.019072275608778, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9087632894515991, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999037504196167, |
| "step": 533 |
| }, |
| { |
| "completion_length": 235.15625, |
| "epoch": 1.7072, |
| "grad_norm": 2.016521453857422, |
| "kl": 0.09326171875, |
| "learning_rate": 3.3375e-07, |
| "loss": 0.0009, |
| "reward": 3.8281819820404053, |
| "reward_std": 0.021804995834827423, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9325411021709442, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9018907845020294, |
| "step": 534 |
| }, |
| { |
| "completion_length": 228.25, |
| "epoch": 1.7104, |
| "grad_norm": 2.274576187133789, |
| "kl": 0.090087890625, |
| "learning_rate": 3.325e-07, |
| "loss": 0.0009, |
| "reward": 3.9243232011795044, |
| "reward_std": 0.02412506751716137, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9507229626178741, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9764412045478821, |
| "step": 535 |
| }, |
| { |
| "completion_length": 224.53125, |
| "epoch": 1.7136, |
| "grad_norm": 2.6043360233306885, |
| "kl": 0.10400390625, |
| "learning_rate": 3.3124999999999995e-07, |
| "loss": 0.001, |
| "reward": 3.876230835914612, |
| "reward_std": 0.07055234862491488, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9505043029785156, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9329380691051483, |
| "step": 536 |
| }, |
| { |
| "completion_length": 225.75, |
| "epoch": 1.7168, |
| "grad_norm": 2.8599207401275635, |
| "kl": 0.09814453125, |
| "learning_rate": 3.3e-07, |
| "loss": 0.001, |
| "reward": 3.8414641618728638, |
| "reward_std": 0.05350587982684374, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9369199872016907, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9061886966228485, |
| "step": 537 |
| }, |
| { |
| "completion_length": 148.6875, |
| "epoch": 1.72, |
| "grad_norm": 1.6326717138290405, |
| "kl": 0.10009765625, |
| "learning_rate": 3.2875e-07, |
| "loss": 0.001, |
| "reward": 3.9361575841903687, |
| "reward_std": 0.004058501799590886, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9676616787910461, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9684959352016449, |
| "step": 538 |
| }, |
| { |
| "completion_length": 195.0625, |
| "epoch": 1.7231999999999998, |
| "grad_norm": 1.9592961072921753, |
| "kl": 0.12841796875, |
| "learning_rate": 3.275e-07, |
| "loss": 0.0013, |
| "reward": 3.772740364074707, |
| "reward_std": 0.1297362227924168, |
| "rewards/answer_entity_reward": 0.8774839639663696, |
| "rewards/answer_wer_reward": 0.953325480222702, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9419309496879578, |
| "step": 539 |
| }, |
| { |
| "completion_length": 234.75, |
| "epoch": 1.7264, |
| "grad_norm": 2.8339364528656006, |
| "kl": 0.09130859375, |
| "learning_rate": 3.2624999999999995e-07, |
| "loss": 0.0009, |
| "reward": 3.9273258447647095, |
| "reward_std": 0.019230290316045284, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.929772675037384, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996366202831268, |
| "step": 540 |
| }, |
| { |
| "completion_length": 202.21875, |
| "epoch": 1.7296, |
| "grad_norm": 1.428126335144043, |
| "kl": 0.11083984375, |
| "learning_rate": 3.25e-07, |
| "loss": 0.0011, |
| "reward": 3.8019243478775024, |
| "reward_std": 0.012322985101491213, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9284610748291016, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8734632432460785, |
| "step": 541 |
| }, |
| { |
| "completion_length": 204.875, |
| "epoch": 1.7328000000000001, |
| "grad_norm": 2.1539251804351807, |
| "kl": 0.11376953125, |
| "learning_rate": 3.2374999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.9308128356933594, |
| "reward_std": 0.03895580768585205, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9500284790992737, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9877286553382874, |
| "step": 542 |
| }, |
| { |
| "completion_length": 237.40625, |
| "epoch": 1.736, |
| "grad_norm": 2.9644949436187744, |
| "kl": 0.091796875, |
| "learning_rate": 3.225e-07, |
| "loss": 0.0009, |
| "reward": 3.8919214010238647, |
| "reward_std": 0.025371606461703777, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9108140766620636, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9883188605308533, |
| "step": 543 |
| }, |
| { |
| "completion_length": 173.5, |
| "epoch": 1.7391999999999999, |
| "grad_norm": 1.8892700672149658, |
| "kl": 0.11376953125, |
| "learning_rate": 3.2124999999999994e-07, |
| "loss": 0.0011, |
| "reward": 3.816041350364685, |
| "reward_std": 0.021231804974377155, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.8221178352832794, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973958432674408, |
| "step": 544 |
| }, |
| { |
| "completion_length": 196.90625, |
| "epoch": 1.7424, |
| "grad_norm": 1.6765927076339722, |
| "kl": 0.103271484375, |
| "learning_rate": 3.2e-07, |
| "loss": 0.001, |
| "reward": 3.825459599494934, |
| "reward_std": 0.1512175016105175, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9306570887565613, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.928135871887207, |
| "step": 545 |
| }, |
| { |
| "completion_length": 245.15625, |
| "epoch": 1.7456, |
| "grad_norm": 2.408535957336426, |
| "kl": 0.100830078125, |
| "learning_rate": 3.1874999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.904157519340515, |
| "reward_std": 0.024684349074959755, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9129303097724915, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999750018119812, |
| "step": 546 |
| }, |
| { |
| "completion_length": 192.15625, |
| "epoch": 1.7488000000000001, |
| "grad_norm": 1.3466379642486572, |
| "kl": 0.1162109375, |
| "learning_rate": 3.175e-07, |
| "loss": 0.0012, |
| "reward": 3.8801496028900146, |
| "reward_std": 0.028854741947725415, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.9211397469043732, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9715098142623901, |
| "step": 547 |
| }, |
| { |
| "completion_length": 199.5, |
| "epoch": 1.752, |
| "grad_norm": 1.6798815727233887, |
| "kl": 0.14404296875, |
| "learning_rate": 3.1624999999999994e-07, |
| "loss": 0.0014, |
| "reward": 3.9099488258361816, |
| "reward_std": 0.01651060301810503, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9137388169765472, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986139535903931, |
| "step": 548 |
| }, |
| { |
| "completion_length": 244.9375, |
| "epoch": 1.7551999999999999, |
| "grad_norm": 1.4050216674804688, |
| "kl": 0.12451171875, |
| "learning_rate": 3.15e-07, |
| "loss": 0.0012, |
| "reward": 3.9373788833618164, |
| "reward_std": 0.015202231705188751, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9471401572227478, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987614154815674, |
| "step": 549 |
| }, |
| { |
| "completion_length": 248.25, |
| "epoch": 1.7584, |
| "grad_norm": 1.4261935949325562, |
| "kl": 0.06884765625, |
| "learning_rate": 3.1374999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.8940224647521973, |
| "reward_std": 0.02191777713596821, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.8968429565429688, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995833337306976, |
| "step": 550 |
| }, |
| { |
| "completion_length": 201.96875, |
| "epoch": 1.7616, |
| "grad_norm": 3.3936564922332764, |
| "kl": 0.10986328125, |
| "learning_rate": 3.1249999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.8392233848571777, |
| "reward_std": 0.054989127907902, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9420890212059021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8999753296375275, |
| "step": 551 |
| }, |
| { |
| "completion_length": 224.28125, |
| "epoch": 1.7648000000000001, |
| "grad_norm": 3.447808027267456, |
| "kl": 0.1162109375, |
| "learning_rate": 3.1125000000000004e-07, |
| "loss": 0.0012, |
| "reward": 3.928855776786804, |
| "reward_std": 0.03860421013087034, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9465460479259491, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9871174991130829, |
| "step": 552 |
| }, |
| { |
| "completion_length": 239.0625, |
| "epoch": 1.768, |
| "grad_norm": 0.9099166989326477, |
| "kl": 0.09228515625, |
| "learning_rate": 3.1e-07, |
| "loss": 0.0008, |
| "reward": 3.946284055709839, |
| "reward_std": 0.0096789482049644, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9500063955783844, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962775409221649, |
| "step": 553 |
| }, |
| { |
| "completion_length": 225.0625, |
| "epoch": 1.7711999999999999, |
| "grad_norm": 5.470230579376221, |
| "kl": 0.0791015625, |
| "learning_rate": 3.0875e-07, |
| "loss": 0.0008, |
| "reward": 3.919348955154419, |
| "reward_std": 0.03945630043745041, |
| "rewards/answer_entity_reward": 0.9685782790184021, |
| "rewards/answer_wer_reward": 0.9539141952991486, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968563914299011, |
| "step": 554 |
| }, |
| { |
| "completion_length": 221.46875, |
| "epoch": 1.7744, |
| "grad_norm": 2.4623939990997314, |
| "kl": 0.091064453125, |
| "learning_rate": 3.0749999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.9319478273391724, |
| "reward_std": 0.020772571209818125, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9341712892055511, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994212985038757, |
| "step": 555 |
| }, |
| { |
| "completion_length": 195.28125, |
| "epoch": 1.7776, |
| "grad_norm": 3.2428677082061768, |
| "kl": 0.1201171875, |
| "learning_rate": 3.0625000000000003e-07, |
| "loss": 0.0012, |
| "reward": 3.8943945169448853, |
| "reward_std": 0.03664180589839816, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9439602494239807, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9561159908771515, |
| "step": 556 |
| }, |
| { |
| "completion_length": 181.4375, |
| "epoch": 1.7808000000000002, |
| "grad_norm": 3.0905327796936035, |
| "kl": 0.1103515625, |
| "learning_rate": 3.05e-07, |
| "loss": 0.0011, |
| "reward": 3.761397957801819, |
| "reward_std": 0.21460139192640781, |
| "rewards/answer_entity_reward": 0.9930555522441864, |
| "rewards/answer_wer_reward": 0.928047776222229, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8715444803237915, |
| "step": 557 |
| }, |
| { |
| "completion_length": 221.625, |
| "epoch": 1.784, |
| "grad_norm": 1.951019525527954, |
| "kl": 0.075927734375, |
| "learning_rate": 3.0375e-07, |
| "loss": 0.0008, |
| "reward": 3.77008855342865, |
| "reward_std": 0.32161275763064623, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8999943733215332, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9325942993164062, |
| "step": 558 |
| }, |
| { |
| "completion_length": 205.28125, |
| "epoch": 1.7872, |
| "grad_norm": 3.277336359024048, |
| "kl": 0.219482421875, |
| "learning_rate": 3.0249999999999996e-07, |
| "loss": 0.0022, |
| "reward": 3.934972047805786, |
| "reward_std": 0.0279585188254714, |
| "rewards/answer_entity_reward": 0.9919143319129944, |
| "rewards/answer_wer_reward": 0.944387674331665, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998670220375061, |
| "step": 559 |
| }, |
| { |
| "completion_length": 228.5625, |
| "epoch": 1.7904, |
| "grad_norm": 1.3801170587539673, |
| "kl": 0.090576171875, |
| "learning_rate": 3.0125000000000003e-07, |
| "loss": 0.0009, |
| "reward": 3.93076229095459, |
| "reward_std": 0.018667958676815033, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9448211789131165, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9887820482254028, |
| "step": 560 |
| }, |
| { |
| "completion_length": 210.71875, |
| "epoch": 1.7936, |
| "grad_norm": 1.791351556777954, |
| "kl": 0.109130859375, |
| "learning_rate": 3e-07, |
| "loss": 0.0011, |
| "reward": 3.883724331855774, |
| "reward_std": 0.061979083344340324, |
| "rewards/answer_entity_reward": 0.9836647808551788, |
| "rewards/answer_wer_reward": 0.9013588726520538, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987007081508636, |
| "step": 561 |
| }, |
| { |
| "completion_length": 205.125, |
| "epoch": 1.7968, |
| "grad_norm": 2.168004274368286, |
| "kl": 0.103271484375, |
| "learning_rate": 2.9875e-07, |
| "loss": 0.001, |
| "reward": 3.8556606769561768, |
| "reward_std": 0.08509537391364574, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9325578808784485, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9279104173183441, |
| "step": 562 |
| }, |
| { |
| "completion_length": 206.78125, |
| "epoch": 1.8, |
| "grad_norm": 1.8020058870315552, |
| "kl": 0.112548828125, |
| "learning_rate": 2.9749999999999996e-07, |
| "loss": 0.0011, |
| "reward": 3.9098552465438843, |
| "reward_std": 0.027897534891963005, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9327702820301056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9818926453590393, |
| "step": 563 |
| }, |
| { |
| "completion_length": 199.84375, |
| "epoch": 1.8032, |
| "grad_norm": 2.1101276874542236, |
| "kl": 0.08056640625, |
| "learning_rate": 2.9625e-07, |
| "loss": 0.0008, |
| "reward": 3.928394079208374, |
| "reward_std": 0.013759741093963385, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9306167364120483, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977773427963257, |
| "step": 564 |
| }, |
| { |
| "completion_length": 198.34375, |
| "epoch": 1.8064, |
| "grad_norm": 1.7468022108078003, |
| "kl": 0.110595703125, |
| "learning_rate": 2.95e-07, |
| "loss": 0.0011, |
| "reward": 3.8688454627990723, |
| "reward_std": 0.01723374053835869, |
| "rewards/answer_entity_reward": 0.9888257682323456, |
| "rewards/answer_wer_reward": 0.9232835471630096, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9567362368106842, |
| "step": 565 |
| }, |
| { |
| "completion_length": 166.21875, |
| "epoch": 1.8096, |
| "grad_norm": 3.4565577507019043, |
| "kl": 0.12451171875, |
| "learning_rate": 2.9375e-07, |
| "loss": 0.0012, |
| "reward": 3.8460001945495605, |
| "reward_std": 0.12010016990825534, |
| "rewards/answer_entity_reward": 0.9685245454311371, |
| "rewards/answer_wer_reward": 0.905397355556488, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9720781445503235, |
| "step": 566 |
| }, |
| { |
| "completion_length": 204.375, |
| "epoch": 1.8128, |
| "grad_norm": 2.109642267227173, |
| "kl": 0.1474609375, |
| "learning_rate": 2.9249999999999995e-07, |
| "loss": 0.0015, |
| "reward": 3.9155898094177246, |
| "reward_std": 0.021943609230220318, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9553306102752686, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9650669693946838, |
| "step": 567 |
| }, |
| { |
| "completion_length": 231.5625, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 1.4336498975753784, |
| "kl": 0.101318359375, |
| "learning_rate": 2.9125e-07, |
| "loss": 0.001, |
| "reward": 3.9311413764953613, |
| "reward_std": 0.012714509852230549, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9397754371166229, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968807101249695, |
| "step": 568 |
| }, |
| { |
| "completion_length": 204.8125, |
| "epoch": 1.8192, |
| "grad_norm": 2.3991148471832275, |
| "kl": 0.0830078125, |
| "learning_rate": 2.9e-07, |
| "loss": 0.0008, |
| "reward": 3.843847155570984, |
| "reward_std": 0.1981589295901358, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9109402298927307, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9641570150852203, |
| "step": 569 |
| }, |
| { |
| "completion_length": 232.46875, |
| "epoch": 1.8224, |
| "grad_norm": 1.6885050535202026, |
| "kl": 0.085693359375, |
| "learning_rate": 2.8875e-07, |
| "loss": 0.0009, |
| "reward": 3.8632709980010986, |
| "reward_std": 0.08977647870779037, |
| "rewards/answer_entity_reward": 0.9838541746139526, |
| "rewards/answer_wer_reward": 0.9250127673149109, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9544041156768799, |
| "step": 570 |
| }, |
| { |
| "completion_length": 277.78125, |
| "epoch": 1.8256000000000001, |
| "grad_norm": 1.6569448709487915, |
| "kl": 0.09619140625, |
| "learning_rate": 2.8749999999999995e-07, |
| "loss": 0.001, |
| "reward": 3.70079243183136, |
| "reward_std": 0.15860513970255852, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.901878148317337, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.8614143133163452, |
| "step": 571 |
| }, |
| { |
| "completion_length": 248.0625, |
| "epoch": 1.8288, |
| "grad_norm": 1.72274649143219, |
| "kl": 0.08251953125, |
| "learning_rate": 2.8625e-07, |
| "loss": 0.0008, |
| "reward": 3.8939234018325806, |
| "reward_std": 0.016017161309719086, |
| "rewards/answer_entity_reward": 0.9803321659564972, |
| "rewards/answer_wer_reward": 0.9138159155845642, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997751712799072, |
| "step": 572 |
| }, |
| { |
| "completion_length": 222.1875, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 5.086897373199463, |
| "kl": 0.3359375, |
| "learning_rate": 2.8499999999999997e-07, |
| "loss": 0.0034, |
| "reward": 3.931598663330078, |
| "reward_std": 0.02384120598435402, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9370801150798798, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997632503509521, |
| "step": 573 |
| }, |
| { |
| "completion_length": 216.3125, |
| "epoch": 1.8352, |
| "grad_norm": 1.5536502599716187, |
| "kl": 0.089599609375, |
| "learning_rate": 2.8375e-07, |
| "loss": 0.0009, |
| "reward": 3.941322922706604, |
| "reward_std": 0.018783860839903355, |
| "rewards/answer_entity_reward": 0.9833333194255829, |
| "rewards/answer_wer_reward": 0.9589866697788239, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990028738975525, |
| "step": 574 |
| }, |
| { |
| "completion_length": 232.9375, |
| "epoch": 1.8384, |
| "grad_norm": 1.4012224674224854, |
| "kl": 0.09033203125, |
| "learning_rate": 2.8249999999999994e-07, |
| "loss": 0.0009, |
| "reward": 3.9058892726898193, |
| "reward_std": 0.05868656514212489, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.9166894555091858, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996165633201599, |
| "step": 575 |
| }, |
| { |
| "completion_length": 202.125, |
| "epoch": 1.8416000000000001, |
| "grad_norm": 3.967221260070801, |
| "kl": 0.098388671875, |
| "learning_rate": 2.8125e-07, |
| "loss": 0.001, |
| "reward": 3.936561346054077, |
| "reward_std": 0.028190571581944823, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9553852677345276, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9811758995056152, |
| "step": 576 |
| }, |
| { |
| "completion_length": 248.25, |
| "epoch": 1.8448, |
| "grad_norm": 3.581430673599243, |
| "kl": 0.19970703125, |
| "learning_rate": 2.8e-07, |
| "loss": 0.002, |
| "reward": 3.854965329170227, |
| "reward_std": 0.08418525848537683, |
| "rewards/answer_entity_reward": 0.9715560376644135, |
| "rewards/answer_wer_reward": 0.9045931100845337, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9788161218166351, |
| "step": 577 |
| }, |
| { |
| "completion_length": 199.25, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 3.7948851585388184, |
| "kl": 0.147705078125, |
| "learning_rate": 2.7875e-07, |
| "loss": 0.0015, |
| "reward": 3.923743486404419, |
| "reward_std": 0.042667020577937365, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9548681676387787, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9688753485679626, |
| "step": 578 |
| }, |
| { |
| "completion_length": 193.34375, |
| "epoch": 1.8512, |
| "grad_norm": 2.4876842498779297, |
| "kl": 0.102294921875, |
| "learning_rate": 2.775e-07, |
| "loss": 0.001, |
| "reward": 3.97040331363678, |
| "reward_std": 0.01486315974034369, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.973244309425354, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 579 |
| }, |
| { |
| "completion_length": 223.3125, |
| "epoch": 1.8544, |
| "grad_norm": 4.710970878601074, |
| "kl": 0.14501953125, |
| "learning_rate": 2.7625e-07, |
| "loss": 0.0015, |
| "reward": 3.9073562622070312, |
| "reward_std": 0.040397388860583305, |
| "rewards/answer_entity_reward": 0.9927884638309479, |
| "rewards/answer_wer_reward": 0.9182255864143372, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963421821594238, |
| "step": 580 |
| }, |
| { |
| "completion_length": 240.75, |
| "epoch": 1.8576000000000001, |
| "grad_norm": 1.0144391059875488, |
| "kl": 0.089599609375, |
| "learning_rate": 2.75e-07, |
| "loss": 0.0009, |
| "reward": 3.8770586252212524, |
| "reward_std": 0.030949956737458706, |
| "rewards/answer_entity_reward": 0.9654052257537842, |
| "rewards/answer_wer_reward": 0.9127066433429718, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989466071128845, |
| "step": 581 |
| }, |
| { |
| "completion_length": 240.46875, |
| "epoch": 1.8608, |
| "grad_norm": 33.290077209472656, |
| "kl": 0.14111328125, |
| "learning_rate": 2.7374999999999997e-07, |
| "loss": 0.0014, |
| "reward": 3.931227445602417, |
| "reward_std": 0.017369844019412994, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9346133172512054, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990180134773254, |
| "step": 582 |
| }, |
| { |
| "completion_length": 247.1875, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 1.7812319993972778, |
| "kl": 0.083984375, |
| "learning_rate": 2.725e-07, |
| "loss": 0.0008, |
| "reward": 3.885915160179138, |
| "reward_std": 0.0845849048346281, |
| "rewards/answer_entity_reward": 0.9893162548542023, |
| "rewards/answer_wer_reward": 0.9280518591403961, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9685470759868622, |
| "step": 583 |
| }, |
| { |
| "completion_length": 208.0625, |
| "epoch": 1.8672, |
| "grad_norm": 3.882129192352295, |
| "kl": 0.133544921875, |
| "learning_rate": 2.7125e-07, |
| "loss": 0.0013, |
| "reward": 3.8892873525619507, |
| "reward_std": 0.04396933689713478, |
| "rewards/answer_entity_reward": 0.9919143319129944, |
| "rewards/answer_wer_reward": 0.9468680024147034, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9505050778388977, |
| "step": 584 |
| }, |
| { |
| "completion_length": 254.6875, |
| "epoch": 1.8704, |
| "grad_norm": 1.1797689199447632, |
| "kl": 0.09619140625, |
| "learning_rate": 2.7e-07, |
| "loss": 0.001, |
| "reward": 3.9246060848236084, |
| "reward_std": 0.021186589263379574, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9278402030467987, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991695880889893, |
| "step": 585 |
| }, |
| { |
| "completion_length": 225.9375, |
| "epoch": 1.8736000000000002, |
| "grad_norm": 1.4846960306167603, |
| "kl": 0.1142578125, |
| "learning_rate": 2.6874999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.9609856605529785, |
| "reward_std": 0.024455342907458544, |
| "rewards/answer_entity_reward": 0.9909090995788574, |
| "rewards/answer_wer_reward": 0.9731672704219818, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969093799591064, |
| "step": 586 |
| }, |
| { |
| "completion_length": 214.03125, |
| "epoch": 1.8768, |
| "grad_norm": 1.4108463525772095, |
| "kl": 0.2236328125, |
| "learning_rate": 2.675e-07, |
| "loss": 0.0022, |
| "reward": 3.9008651971817017, |
| "reward_std": 0.023720702156424522, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9427990317344666, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9615384638309479, |
| "step": 587 |
| }, |
| { |
| "completion_length": 257.34375, |
| "epoch": 1.88, |
| "grad_norm": 2.2120485305786133, |
| "kl": 0.085693359375, |
| "learning_rate": 2.6625e-07, |
| "loss": 0.0009, |
| "reward": 3.905014157295227, |
| "reward_std": 0.02011673618108034, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9055063128471375, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999507874250412, |
| "step": 588 |
| }, |
| { |
| "completion_length": 217.21875, |
| "epoch": 1.8832, |
| "grad_norm": 2.6982715129852295, |
| "kl": 0.08935546875, |
| "learning_rate": 2.65e-07, |
| "loss": 0.0009, |
| "reward": 3.9260659217834473, |
| "reward_std": 0.02971976064145565, |
| "rewards/answer_entity_reward": 0.9871068000793457, |
| "rewards/answer_wer_reward": 0.9389589130878448, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 589 |
| }, |
| { |
| "completion_length": 253.65625, |
| "epoch": 1.8864, |
| "grad_norm": 1.0963667631149292, |
| "kl": 0.081298828125, |
| "learning_rate": 2.6374999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.9269603490829468, |
| "reward_std": 0.02615117933601141, |
| "rewards/answer_entity_reward": 0.9908459782600403, |
| "rewards/answer_wer_reward": 0.9372480809688568, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988662898540497, |
| "step": 590 |
| }, |
| { |
| "completion_length": 204.09375, |
| "epoch": 1.8896, |
| "grad_norm": 2.6849443912506104, |
| "kl": 0.128173828125, |
| "learning_rate": 2.625e-07, |
| "loss": 0.0013, |
| "reward": 3.912359118461609, |
| "reward_std": 0.025913351215422153, |
| "rewards/answer_entity_reward": 0.9798610806465149, |
| "rewards/answer_wer_reward": 0.9567141532897949, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9757837951183319, |
| "step": 591 |
| }, |
| { |
| "completion_length": 229.75, |
| "epoch": 1.8928, |
| "grad_norm": 12.276920318603516, |
| "kl": 0.524169921875, |
| "learning_rate": 2.6125e-07, |
| "loss": 0.0052, |
| "reward": 3.8932021856307983, |
| "reward_std": 0.014225118793547153, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9339624643325806, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9610779881477356, |
| "step": 592 |
| }, |
| { |
| "completion_length": 172.0, |
| "epoch": 1.896, |
| "grad_norm": 3.136312961578369, |
| "kl": 0.197509765625, |
| "learning_rate": 2.6e-07, |
| "loss": 0.002, |
| "reward": 3.927412748336792, |
| "reward_std": 0.02914919052273035, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9748775362968445, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9525350630283356, |
| "step": 593 |
| }, |
| { |
| "completion_length": 231.625, |
| "epoch": 1.8992, |
| "grad_norm": 1.4952311515808105, |
| "kl": 0.0966796875, |
| "learning_rate": 2.5874999999999996e-07, |
| "loss": 0.001, |
| "reward": 3.920572519302368, |
| "reward_std": 0.023940533865243196, |
| "rewards/answer_entity_reward": 0.9852676391601562, |
| "rewards/answer_wer_reward": 0.9361679553985596, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991368651390076, |
| "step": 594 |
| }, |
| { |
| "completion_length": 222.71875, |
| "epoch": 1.9024, |
| "grad_norm": 1.1621572971343994, |
| "kl": 0.10302734375, |
| "learning_rate": 2.5749999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.8978298902511597, |
| "reward_std": 0.07608090154826641, |
| "rewards/answer_entity_reward": 0.9649057686328888, |
| "rewards/answer_wer_reward": 0.9345213770866394, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984027147293091, |
| "step": 595 |
| }, |
| { |
| "completion_length": 254.21875, |
| "epoch": 1.9056, |
| "grad_norm": 0.9472298622131348, |
| "kl": 0.092041015625, |
| "learning_rate": 2.5625e-07, |
| "loss": 0.0009, |
| "reward": 3.916618824005127, |
| "reward_std": 0.015597880817949772, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9208222925662994, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978796541690826, |
| "step": 596 |
| }, |
| { |
| "completion_length": 247.84375, |
| "epoch": 1.9088, |
| "grad_norm": 1.7148473262786865, |
| "kl": 0.106201171875, |
| "learning_rate": 2.55e-07, |
| "loss": 0.0011, |
| "reward": 3.9292455911636353, |
| "reward_std": 0.007026449544355273, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9444275796413422, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987069070339203, |
| "step": 597 |
| }, |
| { |
| "completion_length": 183.21875, |
| "epoch": 1.912, |
| "grad_norm": 1.6502317190170288, |
| "kl": 0.119140625, |
| "learning_rate": 2.5374999999999995e-07, |
| "loss": 0.0012, |
| "reward": 3.9383983612060547, |
| "reward_std": 0.03170687519013882, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9614830911159515, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.981379508972168, |
| "step": 598 |
| }, |
| { |
| "completion_length": 167.625, |
| "epoch": 1.9152, |
| "grad_norm": 1.1803314685821533, |
| "kl": 0.137939453125, |
| "learning_rate": 2.5249999999999996e-07, |
| "loss": 0.0014, |
| "reward": 3.9067423343658447, |
| "reward_std": 0.013731301296502352, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.949960470199585, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9567819237709045, |
| "step": 599 |
| }, |
| { |
| "completion_length": 199.75, |
| "epoch": 1.9184, |
| "grad_norm": 1.3902597427368164, |
| "kl": 0.080322265625, |
| "learning_rate": 2.5125e-07, |
| "loss": 0.0008, |
| "reward": 3.927718758583069, |
| "reward_std": 0.02047483716160059, |
| "rewards/answer_entity_reward": 0.9840544760227203, |
| "rewards/answer_wer_reward": 0.9543785154819489, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9892857074737549, |
| "step": 600 |
| }, |
| { |
| "completion_length": 210.375, |
| "epoch": 1.9216, |
| "grad_norm": 1.122063159942627, |
| "kl": 0.120361328125, |
| "learning_rate": 2.5e-07, |
| "loss": 0.0012, |
| "reward": 3.942714214324951, |
| "reward_std": 0.027352871373295784, |
| "rewards/answer_entity_reward": 0.9909090995788574, |
| "rewards/answer_wer_reward": 0.9525187313556671, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992863833904266, |
| "step": 601 |
| }, |
| { |
| "completion_length": 191.28125, |
| "epoch": 1.9247999999999998, |
| "grad_norm": 1.9480561017990112, |
| "kl": 0.092529296875, |
| "learning_rate": 2.4875e-07, |
| "loss": 0.0009, |
| "reward": 3.8946096897125244, |
| "reward_std": 0.07258242554962635, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9430054724216461, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9629679620265961, |
| "step": 602 |
| }, |
| { |
| "completion_length": 210.0, |
| "epoch": 1.928, |
| "grad_norm": 1.522335171699524, |
| "kl": 0.08203125, |
| "learning_rate": 2.475e-07, |
| "loss": 0.0008, |
| "reward": 3.9419585466384888, |
| "reward_std": 0.02094284538179636, |
| "rewards/answer_entity_reward": 0.9960784316062927, |
| "rewards/answer_wer_reward": 0.9462659358978271, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996141791343689, |
| "step": 603 |
| }, |
| { |
| "completion_length": 194.375, |
| "epoch": 1.9312, |
| "grad_norm": 1.9648785591125488, |
| "kl": 0.2880859375, |
| "learning_rate": 2.4624999999999997e-07, |
| "loss": 0.0029, |
| "reward": 3.9480878114700317, |
| "reward_std": 0.015357580035924911, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9483262896537781, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997614622116089, |
| "step": 604 |
| }, |
| { |
| "completion_length": 220.75, |
| "epoch": 1.9344000000000001, |
| "grad_norm": 2.2600207328796387, |
| "kl": 0.09814453125, |
| "learning_rate": 2.45e-07, |
| "loss": 0.001, |
| "reward": 3.92673122882843, |
| "reward_std": 0.025613561272621155, |
| "rewards/answer_entity_reward": 0.9899475276470184, |
| "rewards/answer_wer_reward": 0.9408612251281738, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9959224164485931, |
| "step": 605 |
| }, |
| { |
| "completion_length": 159.125, |
| "epoch": 1.9376, |
| "grad_norm": 3.3259623050689697, |
| "kl": 0.15869140625, |
| "learning_rate": 2.4375e-07, |
| "loss": 0.0016, |
| "reward": 3.9267284870147705, |
| "reward_std": 0.024998134351335466, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.9395100474357605, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997184872627258, |
| "step": 606 |
| }, |
| { |
| "completion_length": 250.0625, |
| "epoch": 1.9407999999999999, |
| "grad_norm": 1.4518193006515503, |
| "kl": 0.1357421875, |
| "learning_rate": 2.425e-07, |
| "loss": 0.0014, |
| "reward": 3.825323224067688, |
| "reward_std": 0.11214365810155869, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.9456245005130768, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9172618985176086, |
| "step": 607 |
| }, |
| { |
| "completion_length": 219.53125, |
| "epoch": 1.944, |
| "grad_norm": 1.2852040529251099, |
| "kl": 0.081298828125, |
| "learning_rate": 2.4124999999999997e-07, |
| "loss": 0.0008, |
| "reward": 3.9620739221572876, |
| "reward_std": 0.00826547248288989, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9646617472171783, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974121451377869, |
| "step": 608 |
| }, |
| { |
| "completion_length": 232.875, |
| "epoch": 1.9472, |
| "grad_norm": 4.870666027069092, |
| "kl": 0.111328125, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0011, |
| "reward": 3.9366871118545532, |
| "reward_std": 0.022448008647188544, |
| "rewards/answer_entity_reward": 0.9916141629219055, |
| "rewards/answer_wer_reward": 0.9457239210605621, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993489384651184, |
| "step": 609 |
| }, |
| { |
| "completion_length": 241.625, |
| "epoch": 1.9504000000000001, |
| "grad_norm": 1.8101410865783691, |
| "kl": 0.096435546875, |
| "learning_rate": 2.3875e-07, |
| "loss": 0.001, |
| "reward": 3.9368724822998047, |
| "reward_std": 0.022243991494178772, |
| "rewards/answer_entity_reward": 0.9929924309253693, |
| "rewards/answer_wer_reward": 0.943880021572113, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 610 |
| }, |
| { |
| "completion_length": 223.0, |
| "epoch": 1.9536, |
| "grad_norm": 0.8068660497665405, |
| "kl": 0.099609375, |
| "learning_rate": 2.3749999999999998e-07, |
| "loss": 0.001, |
| "reward": 3.9270153045654297, |
| "reward_std": 0.01834964146837592, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.931628555059433, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988590180873871, |
| "step": 611 |
| }, |
| { |
| "completion_length": 259.59375, |
| "epoch": 1.9567999999999999, |
| "grad_norm": 1.522141695022583, |
| "kl": 0.08203125, |
| "learning_rate": 2.3625e-07, |
| "loss": 0.0008, |
| "reward": 3.9466216564178467, |
| "reward_std": 0.015272928401827812, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9516074061393738, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984864592552185, |
| "step": 612 |
| }, |
| { |
| "completion_length": 211.6875, |
| "epoch": 1.96, |
| "grad_norm": 5.929853916168213, |
| "kl": 0.10205078125, |
| "learning_rate": 2.3499999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.912535309791565, |
| "reward_std": 0.026867160573601723, |
| "rewards/answer_entity_reward": 0.9908565580844879, |
| "rewards/answer_wer_reward": 0.9229700565338135, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987086653709412, |
| "step": 613 |
| }, |
| { |
| "completion_length": 222.40625, |
| "epoch": 1.9632, |
| "grad_norm": 2.7727534770965576, |
| "kl": 0.106689453125, |
| "learning_rate": 2.3375e-07, |
| "loss": 0.0011, |
| "reward": 3.9305132627487183, |
| "reward_std": 0.07138971472159028, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9646386206150055, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9658747315406799, |
| "step": 614 |
| }, |
| { |
| "completion_length": 222.34375, |
| "epoch": 1.9664000000000001, |
| "grad_norm": 1.5660823583602905, |
| "kl": 0.09912109375, |
| "learning_rate": 2.325e-07, |
| "loss": 0.001, |
| "reward": 3.935341477394104, |
| "reward_std": 0.01785436598584056, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9401703774929047, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997254341840744, |
| "step": 615 |
| }, |
| { |
| "completion_length": 236.34375, |
| "epoch": 1.9696, |
| "grad_norm": 1.94826340675354, |
| "kl": 0.078857421875, |
| "learning_rate": 2.3125e-07, |
| "loss": 0.0008, |
| "reward": 3.958570718765259, |
| "reward_std": 0.008256069151684642, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9587988257408142, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997718930244446, |
| "step": 616 |
| }, |
| { |
| "completion_length": 143.5, |
| "epoch": 1.9727999999999999, |
| "grad_norm": 2.0813863277435303, |
| "kl": 0.121337890625, |
| "learning_rate": 2.3e-07, |
| "loss": 0.0012, |
| "reward": 3.886753797531128, |
| "reward_std": 0.027786132879555225, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9573519229888916, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9294019639492035, |
| "step": 617 |
| }, |
| { |
| "completion_length": 227.75, |
| "epoch": 1.976, |
| "grad_norm": 8.525589942932129, |
| "kl": 0.097900390625, |
| "learning_rate": 2.2875e-07, |
| "loss": 0.001, |
| "reward": 3.857698917388916, |
| "reward_std": 0.07474052533507347, |
| "rewards/answer_entity_reward": 0.9985119104385376, |
| "rewards/answer_wer_reward": 0.9551934599876404, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9039934873580933, |
| "step": 618 |
| }, |
| { |
| "completion_length": 233.46875, |
| "epoch": 1.9792, |
| "grad_norm": 1.966539978981018, |
| "kl": 0.082763671875, |
| "learning_rate": 2.275e-07, |
| "loss": 0.0008, |
| "reward": 3.9442771673202515, |
| "reward_std": 0.018204713938757777, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9551240801811218, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9948348999023438, |
| "step": 619 |
| }, |
| { |
| "completion_length": 254.59375, |
| "epoch": 1.9824000000000002, |
| "grad_norm": 2.300699234008789, |
| "kl": 0.338134765625, |
| "learning_rate": 2.2625e-07, |
| "loss": 0.0034, |
| "reward": 3.9195804595947266, |
| "reward_std": 0.014696986880153418, |
| "rewards/answer_entity_reward": 0.974116176366806, |
| "rewards/answer_wer_reward": 0.9479033648967743, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975608885288239, |
| "step": 620 |
| }, |
| { |
| "completion_length": 239.71875, |
| "epoch": 1.9856, |
| "grad_norm": 20.98819923400879, |
| "kl": 0.16015625, |
| "learning_rate": 2.25e-07, |
| "loss": 0.0016, |
| "reward": 3.858734607696533, |
| "reward_std": 0.14412511140108109, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9278987050056458, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9683361053466797, |
| "step": 621 |
| }, |
| { |
| "completion_length": 225.1875, |
| "epoch": 1.9888, |
| "grad_norm": 2.4856221675872803, |
| "kl": 0.1181640625, |
| "learning_rate": 2.2375e-07, |
| "loss": 0.0012, |
| "reward": 3.92032527923584, |
| "reward_std": 0.030348293483257294, |
| "rewards/answer_entity_reward": 0.9947916567325592, |
| "rewards/answer_wer_reward": 0.9266910254955292, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988425970077515, |
| "step": 622 |
| }, |
| { |
| "completion_length": 209.4375, |
| "epoch": 1.992, |
| "grad_norm": 2.2857937812805176, |
| "kl": 0.109619140625, |
| "learning_rate": 2.225e-07, |
| "loss": 0.0011, |
| "reward": 3.790956974029541, |
| "reward_std": 0.07298576645553112, |
| "rewards/answer_entity_reward": 0.993697464466095, |
| "rewards/answer_wer_reward": 0.93813356757164, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8591260015964508, |
| "step": 623 |
| }, |
| { |
| "completion_length": 182.90625, |
| "epoch": 1.9952, |
| "grad_norm": 4.463064670562744, |
| "kl": 0.126953125, |
| "learning_rate": 2.2125e-07, |
| "loss": 0.0013, |
| "reward": 3.906226873397827, |
| "reward_std": 0.0698380870744586, |
| "rewards/answer_entity_reward": 0.992799699306488, |
| "rewards/answer_wer_reward": 0.9589782953262329, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9544488191604614, |
| "step": 624 |
| }, |
| { |
| "completion_length": 232.875, |
| "epoch": 1.9984, |
| "grad_norm": 1.20980966091156, |
| "kl": 0.1044921875, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.001, |
| "reward": 3.9167827367782593, |
| "reward_std": 0.01762760616838932, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9183346629142761, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984481334686279, |
| "step": 625 |
| }, |
| { |
| "completion_length": 177.75, |
| "epoch": 2.0, |
| "grad_norm": 2.3776934146881104, |
| "kl": 0.11865234375, |
| "learning_rate": 2.1875e-07, |
| "loss": 0.0006, |
| "reward": 3.8532142639160156, |
| "reward_std": 0.018374208360910416, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9639798402786255, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8929109573364258, |
| "step": 626 |
| }, |
| { |
| "completion_length": 234.6875, |
| "epoch": 2.0032, |
| "grad_norm": 1.611534833908081, |
| "kl": 0.13134765625, |
| "learning_rate": 2.1749999999999998e-07, |
| "loss": 0.0013, |
| "reward": 3.9422988891601562, |
| "reward_std": 0.020460932981222868, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9433672726154327, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989316165447235, |
| "step": 627 |
| }, |
| { |
| "completion_length": 174.4375, |
| "epoch": 2.0064, |
| "grad_norm": 3.054837942123413, |
| "kl": 0.1240234375, |
| "learning_rate": 2.1625e-07, |
| "loss": 0.0012, |
| "reward": 3.958639144897461, |
| "reward_std": 0.009486648719757795, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9633738994598389, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9952651560306549, |
| "step": 628 |
| }, |
| { |
| "completion_length": 222.15625, |
| "epoch": 2.0096, |
| "grad_norm": 4.340118885040283, |
| "kl": 0.093017578125, |
| "learning_rate": 2.1499999999999998e-07, |
| "loss": 0.0009, |
| "reward": 3.8726435899734497, |
| "reward_std": 0.033597009256482124, |
| "rewards/answer_entity_reward": 0.9919143319129944, |
| "rewards/answer_wer_reward": 0.9492302238941193, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9314990937709808, |
| "step": 629 |
| }, |
| { |
| "completion_length": 207.8125, |
| "epoch": 2.0128, |
| "grad_norm": 2.162853717803955, |
| "kl": 0.109130859375, |
| "learning_rate": 2.1375e-07, |
| "loss": 0.0011, |
| "reward": 3.9318206310272217, |
| "reward_std": 0.01979170460253954, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9489758312702179, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9863169491291046, |
| "step": 630 |
| }, |
| { |
| "completion_length": 239.90625, |
| "epoch": 2.016, |
| "grad_norm": 1.676960825920105, |
| "kl": 0.182861328125, |
| "learning_rate": 2.1249999999999998e-07, |
| "loss": 0.0018, |
| "reward": 3.937206506729126, |
| "reward_std": 0.014235546346753836, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9372064471244812, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 631 |
| }, |
| { |
| "completion_length": 194.28125, |
| "epoch": 2.0192, |
| "grad_norm": 5.123164176940918, |
| "kl": 0.110107421875, |
| "learning_rate": 2.1125e-07, |
| "loss": 0.0011, |
| "reward": 3.7548152208328247, |
| "reward_std": 0.10348369181156158, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.8692809343338013, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8912160098552704, |
| "step": 632 |
| }, |
| { |
| "completion_length": 214.5625, |
| "epoch": 2.0224, |
| "grad_norm": 1.3529505729675293, |
| "kl": 0.106689453125, |
| "learning_rate": 2.0999999999999997e-07, |
| "loss": 0.0011, |
| "reward": 3.8920629024505615, |
| "reward_std": 0.01197694381698966, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9270462095737457, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9650167524814606, |
| "step": 633 |
| }, |
| { |
| "completion_length": 194.25, |
| "epoch": 2.0256, |
| "grad_norm": 1.6181930303573608, |
| "kl": 0.109130859375, |
| "learning_rate": 2.0874999999999999e-07, |
| "loss": 0.0011, |
| "reward": 3.9565629959106445, |
| "reward_std": 0.021904858760535717, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9613706469535828, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 634 |
| }, |
| { |
| "completion_length": 206.59375, |
| "epoch": 2.0288, |
| "grad_norm": 2.353773832321167, |
| "kl": 0.0986328125, |
| "learning_rate": 2.0749999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.919626474380493, |
| "reward_std": 0.02727056946605444, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.9333742260932922, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987522065639496, |
| "step": 635 |
| }, |
| { |
| "completion_length": 189.71875, |
| "epoch": 2.032, |
| "grad_norm": 1.6075130701065063, |
| "kl": 0.13720703125, |
| "learning_rate": 2.0624999999999998e-07, |
| "loss": 0.0014, |
| "reward": 3.9046106338500977, |
| "reward_std": 0.025621079374104738, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9407951831817627, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9694972634315491, |
| "step": 636 |
| }, |
| { |
| "completion_length": 230.46875, |
| "epoch": 2.0352, |
| "grad_norm": 5.240235805511475, |
| "kl": 0.10302734375, |
| "learning_rate": 2.0499999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.9211699962615967, |
| "reward_std": 0.017814213410019875, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9250318109989166, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982215464115143, |
| "step": 637 |
| }, |
| { |
| "completion_length": 209.59375, |
| "epoch": 2.0384, |
| "grad_norm": 2.4782729148864746, |
| "kl": 0.083984375, |
| "learning_rate": 2.0374999999999998e-07, |
| "loss": 0.0008, |
| "reward": 3.894644021987915, |
| "reward_std": 0.020965205505490303, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9609961807727814, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9336478412151337, |
| "step": 638 |
| }, |
| { |
| "completion_length": 233.5, |
| "epoch": 2.0416, |
| "grad_norm": 1.102921485900879, |
| "kl": 0.089599609375, |
| "learning_rate": 2.025e-07, |
| "loss": 0.0009, |
| "reward": 3.9374464750289917, |
| "reward_std": 0.015141086652874947, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9409857094287872, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993016719818115, |
| "step": 639 |
| }, |
| { |
| "completion_length": 227.84375, |
| "epoch": 2.0448, |
| "grad_norm": 1.3384666442871094, |
| "kl": 0.0908203125, |
| "learning_rate": 2.0125e-07, |
| "loss": 0.0009, |
| "reward": 3.9045239686965942, |
| "reward_std": 0.12723269453272223, |
| "rewards/answer_entity_reward": 0.96875, |
| "rewards/answer_wer_reward": 0.9367768168449402, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998997151851654, |
| "step": 640 |
| }, |
| { |
| "completion_length": 175.625, |
| "epoch": 2.048, |
| "grad_norm": 0.6850874423980713, |
| "kl": 0.124267578125, |
| "learning_rate": 2e-07, |
| "loss": 0.0012, |
| "reward": 3.929681897163391, |
| "reward_std": 0.008345533395186067, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9302853643894196, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993966221809387, |
| "step": 641 |
| }, |
| { |
| "completion_length": 213.59375, |
| "epoch": 2.0512, |
| "grad_norm": 2.9222097396850586, |
| "kl": 0.101318359375, |
| "learning_rate": 1.9875e-07, |
| "loss": 0.001, |
| "reward": 3.8092339038848877, |
| "reward_std": 0.11687304638326168, |
| "rewards/answer_entity_reward": 0.9529532790184021, |
| "rewards/answer_wer_reward": 0.8997257351875305, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9565548896789551, |
| "step": 642 |
| }, |
| { |
| "completion_length": 205.5, |
| "epoch": 2.0544, |
| "grad_norm": 1.1586568355560303, |
| "kl": 0.092041015625, |
| "learning_rate": 1.975e-07, |
| "loss": 0.0009, |
| "reward": 3.9247117042541504, |
| "reward_std": 0.011728376615792513, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.935352236032486, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9893594086170197, |
| "step": 643 |
| }, |
| { |
| "completion_length": 203.09375, |
| "epoch": 2.0576, |
| "grad_norm": 1.5699268579483032, |
| "kl": 0.09326171875, |
| "learning_rate": 1.9625e-07, |
| "loss": 0.0009, |
| "reward": 3.9444518089294434, |
| "reward_std": 0.020181890577077866, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9571816027164459, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9907424449920654, |
| "step": 644 |
| }, |
| { |
| "completion_length": 203.4375, |
| "epoch": 2.0608, |
| "grad_norm": 1.7927268743515015, |
| "kl": 0.15478515625, |
| "learning_rate": 1.9499999999999999e-07, |
| "loss": 0.0015, |
| "reward": 3.9478741884231567, |
| "reward_std": 0.01690173940733075, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9509572982788086, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997577667236328, |
| "step": 645 |
| }, |
| { |
| "completion_length": 249.40625, |
| "epoch": 2.064, |
| "grad_norm": 1.3610011339187622, |
| "kl": 0.09033203125, |
| "learning_rate": 1.9375e-07, |
| "loss": 0.0009, |
| "reward": 3.817861795425415, |
| "reward_std": 0.1958598094061017, |
| "rewards/answer_entity_reward": 0.990950733423233, |
| "rewards/answer_wer_reward": 0.8910082578659058, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9671527743339539, |
| "step": 646 |
| }, |
| { |
| "completion_length": 211.84375, |
| "epoch": 2.0672, |
| "grad_norm": 1.7078856229782104, |
| "kl": 0.104248046875, |
| "learning_rate": 1.9249999999999998e-07, |
| "loss": 0.001, |
| "reward": 3.9115630388259888, |
| "reward_std": 0.024524363689124584, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9344038963317871, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9800000190734863, |
| "step": 647 |
| }, |
| { |
| "completion_length": 250.3125, |
| "epoch": 2.0704, |
| "grad_norm": 1.6208539009094238, |
| "kl": 0.10205078125, |
| "learning_rate": 1.9125e-07, |
| "loss": 0.001, |
| "reward": 3.8414340019226074, |
| "reward_std": 0.15159638598561287, |
| "rewards/answer_entity_reward": 0.9867424070835114, |
| "rewards/answer_wer_reward": 0.9219352900981903, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.964006245136261, |
| "step": 648 |
| }, |
| { |
| "completion_length": 191.21875, |
| "epoch": 2.0736, |
| "grad_norm": 2.747303009033203, |
| "kl": 0.123046875, |
| "learning_rate": 1.8999999999999998e-07, |
| "loss": 0.0012, |
| "reward": 3.929761052131653, |
| "reward_std": 0.029091503005474806, |
| "rewards/answer_entity_reward": 0.9930555522441864, |
| "rewards/answer_wer_reward": 0.9552291929721832, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9814762473106384, |
| "step": 649 |
| }, |
| { |
| "completion_length": 242.21875, |
| "epoch": 2.0768, |
| "grad_norm": 1.213749647140503, |
| "kl": 0.08203125, |
| "learning_rate": 1.8875e-07, |
| "loss": 0.0008, |
| "reward": 3.9264228343963623, |
| "reward_std": 0.022060640156269073, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9312105178833008, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9972956776618958, |
| "step": 650 |
| }, |
| { |
| "completion_length": 220.65625, |
| "epoch": 2.08, |
| "grad_norm": 6.092029571533203, |
| "kl": 0.100830078125, |
| "learning_rate": 1.875e-07, |
| "loss": 0.001, |
| "reward": 3.9253735542297363, |
| "reward_std": 0.07221902348101139, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9613818228244781, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9668327569961548, |
| "step": 651 |
| }, |
| { |
| "completion_length": 182.5625, |
| "epoch": 2.0832, |
| "grad_norm": 1.7553961277008057, |
| "kl": 0.11279296875, |
| "learning_rate": 1.8625e-07, |
| "loss": 0.0011, |
| "reward": 3.8646005392074585, |
| "reward_std": 0.14707163721323013, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.931645005941391, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9676778018474579, |
| "step": 652 |
| }, |
| { |
| "completion_length": 232.3125, |
| "epoch": 2.0864, |
| "grad_norm": 1.1559618711471558, |
| "kl": 0.091796875, |
| "learning_rate": 1.85e-07, |
| "loss": 0.0009, |
| "reward": 3.957954168319702, |
| "reward_std": 0.013995198532938957, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.963512659072876, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986082315444946, |
| "step": 653 |
| }, |
| { |
| "completion_length": 223.15625, |
| "epoch": 2.0896, |
| "grad_norm": 2.3205788135528564, |
| "kl": 0.10302734375, |
| "learning_rate": 1.8375e-07, |
| "loss": 0.001, |
| "reward": 3.9284400939941406, |
| "reward_std": 0.02101885131560266, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9305233955383301, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 654 |
| }, |
| { |
| "completion_length": 174.03125, |
| "epoch": 2.0928, |
| "grad_norm": 3.2282309532165527, |
| "kl": 0.089599609375, |
| "learning_rate": 1.825e-07, |
| "loss": 0.0009, |
| "reward": 3.913803219795227, |
| "reward_std": 0.06426881160587072, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9792838096618652, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.944936066865921, |
| "step": 655 |
| }, |
| { |
| "completion_length": 209.40625, |
| "epoch": 2.096, |
| "grad_norm": 3.0301027297973633, |
| "kl": 0.15478515625, |
| "learning_rate": 1.8124999999999999e-07, |
| "loss": 0.0015, |
| "reward": 3.7523492574691772, |
| "reward_std": 0.15331693179905415, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9194472134113312, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8641521334648132, |
| "step": 656 |
| }, |
| { |
| "completion_length": 183.46875, |
| "epoch": 2.0992, |
| "grad_norm": 3.859424591064453, |
| "kl": 0.10498046875, |
| "learning_rate": 1.8e-07, |
| "loss": 0.001, |
| "reward": 3.9188934564590454, |
| "reward_std": 0.016068585216999054, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9425098896026611, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9784668385982513, |
| "step": 657 |
| }, |
| { |
| "completion_length": 249.28125, |
| "epoch": 2.1024, |
| "grad_norm": 1.1957335472106934, |
| "kl": 0.0732421875, |
| "learning_rate": 1.7874999999999998e-07, |
| "loss": 0.0007, |
| "reward": 3.920902967453003, |
| "reward_std": 0.009091381449252367, |
| "rewards/answer_entity_reward": 0.982051283121109, |
| "rewards/answer_wer_reward": 0.9388516247272491, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 658 |
| }, |
| { |
| "completion_length": 215.1875, |
| "epoch": 2.1056, |
| "grad_norm": 0.9195266962051392, |
| "kl": 0.08349609375, |
| "learning_rate": 1.775e-07, |
| "loss": 0.0008, |
| "reward": 3.9518920183181763, |
| "reward_std": 0.00956010865047574, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9591186344623566, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9927734434604645, |
| "step": 659 |
| }, |
| { |
| "completion_length": 247.21875, |
| "epoch": 2.1088, |
| "grad_norm": 1.4949768781661987, |
| "kl": 0.109130859375, |
| "learning_rate": 1.7624999999999998e-07, |
| "loss": 0.0011, |
| "reward": 3.9060639142990112, |
| "reward_std": 0.029787511564791203, |
| "rewards/answer_entity_reward": 0.9838598966598511, |
| "rewards/answer_wer_reward": 0.9244924187660217, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977116584777832, |
| "step": 660 |
| }, |
| { |
| "completion_length": 213.15625, |
| "epoch": 2.112, |
| "grad_norm": 2.8325705528259277, |
| "kl": 0.08740234375, |
| "learning_rate": 1.75e-07, |
| "loss": 0.0009, |
| "reward": 3.951107144355774, |
| "reward_std": 0.019360109698027372, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9607862234115601, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9931618571281433, |
| "step": 661 |
| }, |
| { |
| "completion_length": 210.1875, |
| "epoch": 2.1152, |
| "grad_norm": 4.155531883239746, |
| "kl": 0.12890625, |
| "learning_rate": 1.7374999999999998e-07, |
| "loss": 0.0013, |
| "reward": 3.8224732875823975, |
| "reward_std": 0.200032701715827, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.909185916185379, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9480096995830536, |
| "step": 662 |
| }, |
| { |
| "completion_length": 220.59375, |
| "epoch": 2.1184, |
| "grad_norm": 1.299959421157837, |
| "kl": 0.091796875, |
| "learning_rate": 1.725e-07, |
| "loss": 0.0009, |
| "reward": 3.959660768508911, |
| "reward_std": 0.009787917137145996, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9610175788402557, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986431002616882, |
| "step": 663 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 2.1216, |
| "grad_norm": 1.2713404893875122, |
| "kl": 0.0849609375, |
| "learning_rate": 1.7125e-07, |
| "loss": 0.0008, |
| "reward": 3.913174271583557, |
| "reward_std": 0.03005001787096262, |
| "rewards/answer_entity_reward": 0.9858973920345306, |
| "rewards/answer_wer_reward": 0.9283359348773956, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989408254623413, |
| "step": 664 |
| }, |
| { |
| "completion_length": 235.6875, |
| "epoch": 2.1248, |
| "grad_norm": 13.369653701782227, |
| "kl": 0.167236328125, |
| "learning_rate": 1.7000000000000001e-07, |
| "loss": 0.0017, |
| "reward": 3.862402558326721, |
| "reward_std": 0.1546822851523757, |
| "rewards/answer_entity_reward": 0.9921875, |
| "rewards/answer_wer_reward": 0.9333168268203735, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9681483209133148, |
| "step": 665 |
| }, |
| { |
| "completion_length": 154.8125, |
| "epoch": 2.128, |
| "grad_norm": 35.12384033203125, |
| "kl": 0.115966796875, |
| "learning_rate": 1.6875e-07, |
| "loss": 0.0012, |
| "reward": 3.9367175102233887, |
| "reward_std": 0.02194784674793482, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9534772336483002, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9832403361797333, |
| "step": 666 |
| }, |
| { |
| "completion_length": 195.28125, |
| "epoch": 2.1312, |
| "grad_norm": 1.2815937995910645, |
| "kl": 0.107666015625, |
| "learning_rate": 1.675e-07, |
| "loss": 0.0011, |
| "reward": 3.937751293182373, |
| "reward_std": 0.014415924437344074, |
| "rewards/answer_entity_reward": 0.9930555522441864, |
| "rewards/answer_wer_reward": 0.9462690353393555, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984267055988312, |
| "step": 667 |
| }, |
| { |
| "completion_length": 225.90625, |
| "epoch": 2.1344, |
| "grad_norm": 0.840438723564148, |
| "kl": 0.12841796875, |
| "learning_rate": 1.6625e-07, |
| "loss": 0.0013, |
| "reward": 3.9389572143554688, |
| "reward_std": 0.01061929203569889, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.939858615398407, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990985691547394, |
| "step": 668 |
| }, |
| { |
| "completion_length": 186.6875, |
| "epoch": 2.1376, |
| "grad_norm": 1.6506493091583252, |
| "kl": 0.081787109375, |
| "learning_rate": 1.65e-07, |
| "loss": 0.0008, |
| "reward": 3.956157922744751, |
| "reward_std": 0.008805734105408192, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9561578929424286, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 669 |
| }, |
| { |
| "completion_length": 256.84375, |
| "epoch": 2.1408, |
| "grad_norm": 1.2955864667892456, |
| "kl": 0.134765625, |
| "learning_rate": 1.6375e-07, |
| "loss": 0.0013, |
| "reward": 3.924846053123474, |
| "reward_std": 0.016075235791504383, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9267153441905975, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998130738735199, |
| "step": 670 |
| }, |
| { |
| "completion_length": 205.8125, |
| "epoch": 2.144, |
| "grad_norm": 2.9848484992980957, |
| "kl": 0.09375, |
| "learning_rate": 1.625e-07, |
| "loss": 0.0009, |
| "reward": 3.920342206954956, |
| "reward_std": 0.017433147877454758, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9359186589717865, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9844235181808472, |
| "step": 671 |
| }, |
| { |
| "completion_length": 205.25, |
| "epoch": 2.1471999999999998, |
| "grad_norm": 3.0758063793182373, |
| "kl": 0.0888671875, |
| "learning_rate": 1.6125e-07, |
| "loss": 0.0009, |
| "reward": 3.94283390045166, |
| "reward_std": 0.03408639598637819, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9549511075019836, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9878826439380646, |
| "step": 672 |
| }, |
| { |
| "completion_length": 185.3125, |
| "epoch": 2.1504, |
| "grad_norm": 4.493408203125, |
| "kl": 0.1298828125, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0013, |
| "reward": 3.7623226642608643, |
| "reward_std": 0.053579739294946194, |
| "rewards/answer_entity_reward": 0.9799679517745972, |
| "rewards/answer_wer_reward": 0.930513322353363, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8518414497375488, |
| "step": 673 |
| }, |
| { |
| "completion_length": 238.5, |
| "epoch": 2.1536, |
| "grad_norm": 1.0133973360061646, |
| "kl": 0.074462890625, |
| "learning_rate": 1.5875e-07, |
| "loss": 0.0008, |
| "reward": 3.949939250946045, |
| "reward_std": 0.01046135206706822, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9593237638473511, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997968465089798, |
| "step": 674 |
| }, |
| { |
| "completion_length": 204.75, |
| "epoch": 2.1568, |
| "grad_norm": 2.416959762573242, |
| "kl": 0.24658203125, |
| "learning_rate": 1.575e-07, |
| "loss": 0.0025, |
| "reward": 3.8200684785842896, |
| "reward_std": 0.014629668090492487, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9454044103622437, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8746639788150787, |
| "step": 675 |
| }, |
| { |
| "completion_length": 201.0625, |
| "epoch": 2.16, |
| "grad_norm": 1.1082431077957153, |
| "kl": 0.103515625, |
| "learning_rate": 1.5624999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.9603604078292847, |
| "reward_std": 0.013338471297174692, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9646645486354828, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985367655754089, |
| "step": 676 |
| }, |
| { |
| "completion_length": 200.5, |
| "epoch": 2.1632, |
| "grad_norm": 1.243102788925171, |
| "kl": 0.088623046875, |
| "learning_rate": 1.55e-07, |
| "loss": 0.0009, |
| "reward": 3.9476585388183594, |
| "reward_std": 0.014779110439121723, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9507860839366913, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997133016586304, |
| "step": 677 |
| }, |
| { |
| "completion_length": 235.0, |
| "epoch": 2.1664, |
| "grad_norm": 1.9828643798828125, |
| "kl": 0.076171875, |
| "learning_rate": 1.5374999999999998e-07, |
| "loss": 0.0008, |
| "reward": 3.840447187423706, |
| "reward_std": 0.12814121507108212, |
| "rewards/answer_entity_reward": 0.9507211446762085, |
| "rewards/answer_wer_reward": 0.8900850713253021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996408224105835, |
| "step": 678 |
| }, |
| { |
| "completion_length": 209.0625, |
| "epoch": 2.1696, |
| "grad_norm": 1.306552529335022, |
| "kl": 0.09130859375, |
| "learning_rate": 1.525e-07, |
| "loss": 0.0009, |
| "reward": 3.9448314905166626, |
| "reward_std": 0.016167795285582542, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9459536671638489, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988778233528137, |
| "step": 679 |
| }, |
| { |
| "completion_length": 206.84375, |
| "epoch": 2.1728, |
| "grad_norm": 1.7964757680892944, |
| "kl": 0.1064453125, |
| "learning_rate": 1.5124999999999998e-07, |
| "loss": 0.0011, |
| "reward": 3.9424251317977905, |
| "reward_std": 0.0136543451808393, |
| "rewards/answer_entity_reward": 0.9895104765892029, |
| "rewards/answer_wer_reward": 0.9541302621364594, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987844526767731, |
| "step": 680 |
| }, |
| { |
| "completion_length": 244.21875, |
| "epoch": 2.176, |
| "grad_norm": 1.3341420888900757, |
| "kl": 0.08642578125, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0009, |
| "reward": 3.9383710622787476, |
| "reward_std": 0.01660554250702262, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9446630477905273, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9981722235679626, |
| "step": 681 |
| }, |
| { |
| "completion_length": 248.84375, |
| "epoch": 2.1792, |
| "grad_norm": 0.9630815386772156, |
| "kl": 0.0888671875, |
| "learning_rate": 1.4874999999999998e-07, |
| "loss": 0.0009, |
| "reward": 3.949557065963745, |
| "reward_std": 0.01444097189232707, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9514444172382355, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9981126189231873, |
| "step": 682 |
| }, |
| { |
| "completion_length": 222.5625, |
| "epoch": 2.1824, |
| "grad_norm": 1.4436620473861694, |
| "kl": 0.091796875, |
| "learning_rate": 1.475e-07, |
| "loss": 0.0009, |
| "reward": 3.9340105056762695, |
| "reward_std": 0.011837240774184465, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9374523460865021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993990361690521, |
| "step": 683 |
| }, |
| { |
| "completion_length": 220.3125, |
| "epoch": 2.1856, |
| "grad_norm": 1.7951076030731201, |
| "kl": 0.13623046875, |
| "learning_rate": 1.4624999999999998e-07, |
| "loss": 0.0014, |
| "reward": 3.9159233570098877, |
| "reward_std": 0.022063229698687792, |
| "rewards/answer_entity_reward": 0.9825946092605591, |
| "rewards/answer_wer_reward": 0.935338944196701, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979897737503052, |
| "step": 684 |
| }, |
| { |
| "completion_length": 222.71875, |
| "epoch": 2.1888, |
| "grad_norm": 2.693173885345459, |
| "kl": 0.090576171875, |
| "learning_rate": 1.45e-07, |
| "loss": 0.0009, |
| "reward": 3.8880432844161987, |
| "reward_std": 0.03081614337861538, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9077447652816772, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9827025234699249, |
| "step": 685 |
| }, |
| { |
| "completion_length": 263.5, |
| "epoch": 2.192, |
| "grad_norm": 5.544942855834961, |
| "kl": 0.122802734375, |
| "learning_rate": 1.4374999999999997e-07, |
| "loss": 0.0012, |
| "reward": 3.906672716140747, |
| "reward_std": 0.017923741601407528, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9078539311885834, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988189041614532, |
| "step": 686 |
| }, |
| { |
| "completion_length": 219.0625, |
| "epoch": 2.1952, |
| "grad_norm": 1.3066198825836182, |
| "kl": 0.13720703125, |
| "learning_rate": 1.4249999999999999e-07, |
| "loss": 0.0014, |
| "reward": 3.8579492568969727, |
| "reward_std": 0.11750033870339394, |
| "rewards/answer_entity_reward": 0.9445319771766663, |
| "rewards/answer_wer_reward": 0.9204041659832001, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9930130541324615, |
| "step": 687 |
| }, |
| { |
| "completion_length": 203.09375, |
| "epoch": 2.1984, |
| "grad_norm": 2.9115042686462402, |
| "kl": 0.130859375, |
| "learning_rate": 1.4124999999999997e-07, |
| "loss": 0.0013, |
| "reward": 3.929637312889099, |
| "reward_std": 0.07596011366695166, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9678620994091034, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9652473330497742, |
| "step": 688 |
| }, |
| { |
| "completion_length": 205.375, |
| "epoch": 2.2016, |
| "grad_norm": 2.322467803955078, |
| "kl": 0.087158203125, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0009, |
| "reward": 3.8996429443359375, |
| "reward_std": 0.06278708390891552, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.9460262954235077, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.959929883480072, |
| "step": 689 |
| }, |
| { |
| "completion_length": 225.0625, |
| "epoch": 2.2048, |
| "grad_norm": 2.730459213256836, |
| "kl": 0.0791015625, |
| "learning_rate": 1.3875e-07, |
| "loss": 0.0008, |
| "reward": 3.916098475456238, |
| "reward_std": 0.02135017653927207, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9546346664428711, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9677136838436127, |
| "step": 690 |
| }, |
| { |
| "completion_length": 154.09375, |
| "epoch": 2.208, |
| "grad_norm": 2.1384575366973877, |
| "kl": 0.08740234375, |
| "learning_rate": 1.375e-07, |
| "loss": 0.0009, |
| "reward": 3.8095338344573975, |
| "reward_std": 0.02021293295547366, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9482340812683105, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8612997531890869, |
| "step": 691 |
| }, |
| { |
| "completion_length": 160.6875, |
| "epoch": 2.2112, |
| "grad_norm": 1.9878817796707153, |
| "kl": 0.1083984375, |
| "learning_rate": 1.3625e-07, |
| "loss": 0.0011, |
| "reward": 3.8489197492599487, |
| "reward_std": 0.06898931134492159, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.929458349943161, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9194613993167877, |
| "step": 692 |
| }, |
| { |
| "completion_length": 209.0625, |
| "epoch": 2.2144, |
| "grad_norm": 1.895799994468689, |
| "kl": 0.11669921875, |
| "learning_rate": 1.35e-07, |
| "loss": 0.0012, |
| "reward": 3.897484064102173, |
| "reward_std": 0.02146145049482584, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.926066517829895, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9714176058769226, |
| "step": 693 |
| }, |
| { |
| "completion_length": 248.3125, |
| "epoch": 2.2176, |
| "grad_norm": 2.0095603466033936, |
| "kl": 0.1015625, |
| "learning_rate": 1.3375e-07, |
| "loss": 0.001, |
| "reward": 3.9220433235168457, |
| "reward_std": 0.014254164882004261, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9220432937145233, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 694 |
| }, |
| { |
| "completion_length": 223.0, |
| "epoch": 2.2208, |
| "grad_norm": 1.6252143383026123, |
| "kl": 0.12158203125, |
| "learning_rate": 1.325e-07, |
| "loss": 0.0012, |
| "reward": 3.9079580307006836, |
| "reward_std": 0.02597262989729643, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9434219896793365, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9645361006259918, |
| "step": 695 |
| }, |
| { |
| "completion_length": 187.5, |
| "epoch": 2.224, |
| "grad_norm": 1.387984275817871, |
| "kl": 0.0947265625, |
| "learning_rate": 1.3125e-07, |
| "loss": 0.0009, |
| "reward": 3.9440150260925293, |
| "reward_std": 0.015697208931669593, |
| "rewards/answer_entity_reward": 0.9816919267177582, |
| "rewards/answer_wer_reward": 0.9631733596324921, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991496503353119, |
| "step": 696 |
| }, |
| { |
| "completion_length": 203.34375, |
| "epoch": 2.2272, |
| "grad_norm": 2.2257113456726074, |
| "kl": 0.110595703125, |
| "learning_rate": 1.3e-07, |
| "loss": 0.0011, |
| "reward": 3.885230541229248, |
| "reward_std": 0.023641248233616352, |
| "rewards/answer_entity_reward": 0.9805992841720581, |
| "rewards/answer_wer_reward": 0.9495046138763428, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9551265835762024, |
| "step": 697 |
| }, |
| { |
| "completion_length": 201.8125, |
| "epoch": 2.2304, |
| "grad_norm": 1.595376968383789, |
| "kl": 0.076171875, |
| "learning_rate": 1.2874999999999998e-07, |
| "loss": 0.0008, |
| "reward": 3.9703818559646606, |
| "reward_std": 0.01011386327445507, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9731970131397247, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999588817358017, |
| "step": 698 |
| }, |
| { |
| "completion_length": 230.6875, |
| "epoch": 2.2336, |
| "grad_norm": 1.6279692649841309, |
| "kl": 0.12744140625, |
| "learning_rate": 1.275e-07, |
| "loss": 0.0013, |
| "reward": 3.9279537200927734, |
| "reward_std": 0.017515965271741152, |
| "rewards/answer_entity_reward": 0.9880050718784332, |
| "rewards/answer_wer_reward": 0.9430651664733887, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968834519386292, |
| "step": 699 |
| }, |
| { |
| "completion_length": 182.125, |
| "epoch": 2.2368, |
| "grad_norm": 2.8828890323638916, |
| "kl": 0.18505859375, |
| "learning_rate": 1.2624999999999998e-07, |
| "loss": 0.0019, |
| "reward": 3.8859431743621826, |
| "reward_std": 0.142324005253613, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.9579981565475464, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9655081927776337, |
| "step": 700 |
| }, |
| { |
| "completion_length": 209.03125, |
| "epoch": 2.24, |
| "grad_norm": 2.9951937198638916, |
| "kl": 0.12109375, |
| "learning_rate": 1.25e-07, |
| "loss": 0.0012, |
| "reward": 3.7733466625213623, |
| "reward_std": 0.025467259343713522, |
| "rewards/answer_entity_reward": 0.9886675775051117, |
| "rewards/answer_wer_reward": 0.9475079476833344, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8371710479259491, |
| "step": 701 |
| }, |
| { |
| "completion_length": 211.40625, |
| "epoch": 2.2432, |
| "grad_norm": 2.5615384578704834, |
| "kl": 0.14208984375, |
| "learning_rate": 1.2375e-07, |
| "loss": 0.0014, |
| "reward": 3.9001591205596924, |
| "reward_std": 0.03031878173351288, |
| "rewards/answer_entity_reward": 0.9910256266593933, |
| "rewards/answer_wer_reward": 0.957984060049057, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9511493742465973, |
| "step": 702 |
| }, |
| { |
| "completion_length": 240.6875, |
| "epoch": 2.2464, |
| "grad_norm": 1.6149277687072754, |
| "kl": 0.10888671875, |
| "learning_rate": 1.225e-07, |
| "loss": 0.0011, |
| "reward": 3.917873740196228, |
| "reward_std": 0.01580545213073492, |
| "rewards/answer_entity_reward": 0.9787845611572266, |
| "rewards/answer_wer_reward": 0.9405834674835205, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985057711601257, |
| "step": 703 |
| }, |
| { |
| "completion_length": 190.0625, |
| "epoch": 2.2496, |
| "grad_norm": 1.620892882347107, |
| "kl": 0.087646484375, |
| "learning_rate": 1.2125e-07, |
| "loss": 0.0009, |
| "reward": 3.954784393310547, |
| "reward_std": 0.03893708251416683, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.962031751871109, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990026652812958, |
| "step": 704 |
| }, |
| { |
| "completion_length": 170.0, |
| "epoch": 2.2528, |
| "grad_norm": 1.5188636779785156, |
| "kl": 0.111572265625, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0011, |
| "reward": 3.9031065702438354, |
| "reward_std": 0.011392949614673853, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9356338381767273, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9813616275787354, |
| "step": 705 |
| }, |
| { |
| "completion_length": 211.46875, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 2.7718734741210938, |
| "kl": 0.102294921875, |
| "learning_rate": 1.1874999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.936674475669861, |
| "reward_std": 0.021578084211796522, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9491873383522034, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9874871671199799, |
| "step": 706 |
| }, |
| { |
| "completion_length": 255.0, |
| "epoch": 2.2592, |
| "grad_norm": 1.6890819072723389, |
| "kl": 0.099853515625, |
| "learning_rate": 1.1749999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.9247710704803467, |
| "reward_std": 0.013670595828443766, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9279445707798004, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968266189098358, |
| "step": 707 |
| }, |
| { |
| "completion_length": 189.3125, |
| "epoch": 2.2624, |
| "grad_norm": 2.3591725826263428, |
| "kl": 0.111328125, |
| "learning_rate": 1.1625e-07, |
| "loss": 0.0011, |
| "reward": 3.929440498352051, |
| "reward_std": 0.018895008601248264, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9316463768482208, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99779412150383, |
| "step": 708 |
| }, |
| { |
| "completion_length": 202.125, |
| "epoch": 2.2656, |
| "grad_norm": 5.1716766357421875, |
| "kl": 0.142333984375, |
| "learning_rate": 1.15e-07, |
| "loss": 0.0014, |
| "reward": 3.9494107961654663, |
| "reward_std": 0.023188273422420025, |
| "rewards/answer_entity_reward": 0.9902146458625793, |
| "rewards/answer_wer_reward": 0.9693313241004944, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.989864856004715, |
| "step": 709 |
| }, |
| { |
| "completion_length": 241.90625, |
| "epoch": 2.2688, |
| "grad_norm": 2.9082345962524414, |
| "kl": 0.15087890625, |
| "learning_rate": 1.1375e-07, |
| "loss": 0.0015, |
| "reward": 3.877661347389221, |
| "reward_std": 0.08314304798841476, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9211136996746063, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9669643044471741, |
| "step": 710 |
| }, |
| { |
| "completion_length": 222.40625, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 2.9711413383483887, |
| "kl": 0.123779296875, |
| "learning_rate": 1.125e-07, |
| "loss": 0.0012, |
| "reward": 3.9322550296783447, |
| "reward_std": 0.06140775140374899, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9581426084041595, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9775847494602203, |
| "step": 711 |
| }, |
| { |
| "completion_length": 242.6875, |
| "epoch": 2.2752, |
| "grad_norm": 6.453571796417236, |
| "kl": 0.116943359375, |
| "learning_rate": 1.1125e-07, |
| "loss": 0.0012, |
| "reward": 3.874239444732666, |
| "reward_std": 0.017658520489931107, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.8984209299087524, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9782223105430603, |
| "step": 712 |
| }, |
| { |
| "completion_length": 206.0625, |
| "epoch": 2.2784, |
| "grad_norm": 2.0138731002807617, |
| "kl": 0.10205078125, |
| "learning_rate": 1.0999999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.9465200901031494, |
| "reward_std": 0.01707920106127858, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.950833261013031, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985276162624359, |
| "step": 713 |
| }, |
| { |
| "completion_length": 205.90625, |
| "epoch": 2.2816, |
| "grad_norm": 1.6215705871582031, |
| "kl": 0.22216796875, |
| "learning_rate": 1.0874999999999999e-07, |
| "loss": 0.0022, |
| "reward": 3.921483874320984, |
| "reward_std": 0.017741497606039047, |
| "rewards/answer_entity_reward": 0.9818618893623352, |
| "rewards/answer_wer_reward": 0.9403572380542755, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992647171020508, |
| "step": 714 |
| }, |
| { |
| "completion_length": 165.53125, |
| "epoch": 2.2848, |
| "grad_norm": 2.939443349838257, |
| "kl": 0.10302734375, |
| "learning_rate": 1.0749999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.8573367595672607, |
| "reward_std": 0.05941922590136528, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9561320841312408, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9030430614948273, |
| "step": 715 |
| }, |
| { |
| "completion_length": 209.34375, |
| "epoch": 2.288, |
| "grad_norm": 3.167865753173828, |
| "kl": 0.098876953125, |
| "learning_rate": 1.0624999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.9200966358184814, |
| "reward_std": 0.011050965171307325, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9683842360973358, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9517123103141785, |
| "step": 716 |
| }, |
| { |
| "completion_length": 211.78125, |
| "epoch": 2.2912, |
| "grad_norm": 2.83433198928833, |
| "kl": 0.157470703125, |
| "learning_rate": 1.0499999999999999e-07, |
| "loss": 0.0016, |
| "reward": 3.888568639755249, |
| "reward_std": 0.0255763940513134, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9216626286506653, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9669062495231628, |
| "step": 717 |
| }, |
| { |
| "completion_length": 233.21875, |
| "epoch": 2.2944, |
| "grad_norm": 1.1522959470748901, |
| "kl": 0.123046875, |
| "learning_rate": 1.0374999999999999e-07, |
| "loss": 0.0012, |
| "reward": 3.9315165281295776, |
| "reward_std": 0.015323773492127657, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9349887073040009, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 718 |
| }, |
| { |
| "completion_length": 219.6875, |
| "epoch": 2.2976, |
| "grad_norm": 2.8032352924346924, |
| "kl": 0.097900390625, |
| "learning_rate": 1.0249999999999998e-07, |
| "loss": 0.001, |
| "reward": 3.941191077232361, |
| "reward_std": 0.014842316508293152, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9449678063392639, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962232708930969, |
| "step": 719 |
| }, |
| { |
| "completion_length": 247.75, |
| "epoch": 2.3008, |
| "grad_norm": 2.120060682296753, |
| "kl": 0.10791015625, |
| "learning_rate": 1.0125e-07, |
| "loss": 0.0011, |
| "reward": 3.7576488256454468, |
| "reward_std": 0.034239969216287136, |
| "rewards/answer_entity_reward": 0.9883012771606445, |
| "rewards/answer_wer_reward": 0.9174286723136902, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8519188165664673, |
| "step": 720 |
| }, |
| { |
| "completion_length": 148.46875, |
| "epoch": 2.304, |
| "grad_norm": 3.453160047531128, |
| "kl": 0.1357421875, |
| "learning_rate": 1e-07, |
| "loss": 0.0014, |
| "reward": 3.9483038187026978, |
| "reward_std": 0.010362145490944386, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9624904096126556, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9858134686946869, |
| "step": 721 |
| }, |
| { |
| "completion_length": 242.625, |
| "epoch": 2.3072, |
| "grad_norm": 1.0787523984909058, |
| "kl": 0.08740234375, |
| "learning_rate": 9.875e-08, |
| "loss": 0.0009, |
| "reward": 3.8604942560195923, |
| "reward_std": 0.14663540851324797, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9304596483707428, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9675346612930298, |
| "step": 722 |
| }, |
| { |
| "completion_length": 184.15625, |
| "epoch": 2.3104, |
| "grad_norm": 2.8213894367218018, |
| "kl": 0.078857421875, |
| "learning_rate": 9.749999999999999e-08, |
| "loss": 0.0008, |
| "reward": 3.9743396043777466, |
| "reward_std": 0.007034428184852004, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.975724995136261, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986145198345184, |
| "step": 723 |
| }, |
| { |
| "completion_length": 259.78125, |
| "epoch": 2.3136, |
| "grad_norm": 1.6101382970809937, |
| "kl": 0.090087890625, |
| "learning_rate": 9.624999999999999e-08, |
| "loss": 0.0009, |
| "reward": 3.9425781965255737, |
| "reward_std": 0.012072732672095299, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9443398118019104, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982384443283081, |
| "step": 724 |
| }, |
| { |
| "completion_length": 244.0625, |
| "epoch": 2.3168, |
| "grad_norm": 6.2361578941345215, |
| "kl": 0.1103515625, |
| "learning_rate": 9.499999999999999e-08, |
| "loss": 0.0011, |
| "reward": 3.930173873901367, |
| "reward_std": 0.027357542887330055, |
| "rewards/answer_entity_reward": 0.9856617748737335, |
| "rewards/answer_wer_reward": 0.9461617767810822, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983505010604858, |
| "step": 725 |
| }, |
| { |
| "completion_length": 201.59375, |
| "epoch": 2.32, |
| "grad_norm": 1.4726715087890625, |
| "kl": 0.09375, |
| "learning_rate": 9.375e-08, |
| "loss": 0.0009, |
| "reward": 3.932676076889038, |
| "reward_std": 0.018328175880014896, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9490721523761749, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9892857074737549, |
| "step": 726 |
| }, |
| { |
| "completion_length": 224.46875, |
| "epoch": 2.3232, |
| "grad_norm": 1.8913533687591553, |
| "kl": 0.10693359375, |
| "learning_rate": 9.25e-08, |
| "loss": 0.0011, |
| "reward": 3.9074403047561646, |
| "reward_std": 0.03500279039144516, |
| "rewards/answer_entity_reward": 0.9926948249340057, |
| "rewards/answer_wer_reward": 0.9156512916088104, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990941882133484, |
| "step": 727 |
| }, |
| { |
| "completion_length": 193.96875, |
| "epoch": 2.3264, |
| "grad_norm": 3.589576244354248, |
| "kl": 0.095458984375, |
| "learning_rate": 9.125e-08, |
| "loss": 0.001, |
| "reward": 3.9219532012939453, |
| "reward_std": 0.018014353699982166, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9440249502658844, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9807692468166351, |
| "step": 728 |
| }, |
| { |
| "completion_length": 178.4375, |
| "epoch": 2.3296, |
| "grad_norm": 1.4839043617248535, |
| "kl": 0.125244140625, |
| "learning_rate": 9e-08, |
| "loss": 0.0013, |
| "reward": 3.8304929733276367, |
| "reward_std": 0.009818047750741243, |
| "rewards/answer_entity_reward": 0.9844697117805481, |
| "rewards/answer_wer_reward": 0.9768873453140259, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.869135856628418, |
| "step": 729 |
| }, |
| { |
| "completion_length": 199.28125, |
| "epoch": 2.3327999999999998, |
| "grad_norm": 1.497478723526001, |
| "kl": 0.08642578125, |
| "learning_rate": 8.875e-08, |
| "loss": 0.0009, |
| "reward": 3.9690757989883423, |
| "reward_std": 0.009865536354482174, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9690757989883423, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 730 |
| }, |
| { |
| "completion_length": 220.40625, |
| "epoch": 2.336, |
| "grad_norm": 5.609241485595703, |
| "kl": 0.0966796875, |
| "learning_rate": 8.75e-08, |
| "loss": 0.001, |
| "reward": 3.935381293296814, |
| "reward_std": 0.037938917987048626, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9465770721435547, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9888042211532593, |
| "step": 731 |
| }, |
| { |
| "completion_length": 215.75, |
| "epoch": 2.3392, |
| "grad_norm": 3.496508836746216, |
| "kl": 0.13134765625, |
| "learning_rate": 8.625e-08, |
| "loss": 0.0013, |
| "reward": 3.8224092721939087, |
| "reward_std": 0.02808304876089096, |
| "rewards/answer_entity_reward": 0.9923513829708099, |
| "rewards/answer_wer_reward": 0.9497494399547577, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8803083300590515, |
| "step": 732 |
| }, |
| { |
| "completion_length": 230.78125, |
| "epoch": 2.3424, |
| "grad_norm": 27.852195739746094, |
| "kl": 0.087890625, |
| "learning_rate": 8.500000000000001e-08, |
| "loss": 0.0009, |
| "reward": 3.8288865089416504, |
| "reward_std": 0.01470271497964859, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9576314091682434, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8736589848995209, |
| "step": 733 |
| }, |
| { |
| "completion_length": 241.9375, |
| "epoch": 2.3456, |
| "grad_norm": 3.033336639404297, |
| "kl": 0.1015625, |
| "learning_rate": 8.375e-08, |
| "loss": 0.001, |
| "reward": 3.8371338844299316, |
| "reward_std": 0.060717299580574036, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8998311161994934, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9373026490211487, |
| "step": 734 |
| }, |
| { |
| "completion_length": 235.53125, |
| "epoch": 2.3487999999999998, |
| "grad_norm": 1.6953455209732056, |
| "kl": 0.094970703125, |
| "learning_rate": 8.25e-08, |
| "loss": 0.001, |
| "reward": 3.9114056825637817, |
| "reward_std": 0.03203952219337225, |
| "rewards/answer_entity_reward": 0.9862325191497803, |
| "rewards/answer_wer_reward": 0.9290111660957336, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9961620271205902, |
| "step": 735 |
| }, |
| { |
| "completion_length": 170.59375, |
| "epoch": 2.352, |
| "grad_norm": 3.9929087162017822, |
| "kl": 0.096923828125, |
| "learning_rate": 8.125e-08, |
| "loss": 0.001, |
| "reward": 3.7566089630126953, |
| "reward_std": 0.029996749013662338, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8621053397655487, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8945035934448242, |
| "step": 736 |
| }, |
| { |
| "completion_length": 233.125, |
| "epoch": 2.3552, |
| "grad_norm": 4.515742301940918, |
| "kl": 0.129638671875, |
| "learning_rate": 8e-08, |
| "loss": 0.0013, |
| "reward": 3.9159114360809326, |
| "reward_std": 0.03965392196550965, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9617535173892975, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9576301276683807, |
| "step": 737 |
| }, |
| { |
| "completion_length": 202.5, |
| "epoch": 2.3584, |
| "grad_norm": 3.593953847885132, |
| "kl": 0.107177734375, |
| "learning_rate": 7.875e-08, |
| "loss": 0.0011, |
| "reward": 3.9383411407470703, |
| "reward_std": 0.030629536136984825, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9511449038982391, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.994140625, |
| "step": 738 |
| }, |
| { |
| "completion_length": 204.5, |
| "epoch": 2.3616, |
| "grad_norm": 1.8713083267211914, |
| "kl": 0.099365234375, |
| "learning_rate": 7.75e-08, |
| "loss": 0.001, |
| "reward": 3.9490264654159546, |
| "reward_std": 0.017966313287615776, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.95371875166893, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997718930244446, |
| "step": 739 |
| }, |
| { |
| "completion_length": 240.84375, |
| "epoch": 2.3648, |
| "grad_norm": 1.2076594829559326, |
| "kl": 0.087646484375, |
| "learning_rate": 7.625e-08, |
| "loss": 0.0009, |
| "reward": 3.9529651403427124, |
| "reward_std": 0.00970834819599986, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.95296511054039, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 740 |
| }, |
| { |
| "completion_length": 245.0, |
| "epoch": 2.368, |
| "grad_norm": 0.9895936846733093, |
| "kl": 0.10205078125, |
| "learning_rate": 7.5e-08, |
| "loss": 0.001, |
| "reward": 3.9112091064453125, |
| "reward_std": 0.01916833221912384, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9262779057025909, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9953478574752808, |
| "step": 741 |
| }, |
| { |
| "completion_length": 240.09375, |
| "epoch": 2.3712, |
| "grad_norm": 2.48711895942688, |
| "kl": 0.076171875, |
| "learning_rate": 7.375e-08, |
| "loss": 0.0008, |
| "reward": 3.942033529281616, |
| "reward_std": 0.015272341668605804, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9465188384056091, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983557760715485, |
| "step": 742 |
| }, |
| { |
| "completion_length": 201.78125, |
| "epoch": 2.3744, |
| "grad_norm": 2.5322351455688477, |
| "kl": 0.103271484375, |
| "learning_rate": 7.25e-08, |
| "loss": 0.001, |
| "reward": 3.903318166732788, |
| "reward_std": 0.024559098295867443, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9573764503002167, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9459417462348938, |
| "step": 743 |
| }, |
| { |
| "completion_length": 176.8125, |
| "epoch": 2.3776, |
| "grad_norm": 10.369518280029297, |
| "kl": 0.10986328125, |
| "learning_rate": 7.124999999999999e-08, |
| "loss": 0.0011, |
| "reward": 3.9451266527175903, |
| "reward_std": 0.008970791008323431, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.953162282705307, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9919642806053162, |
| "step": 744 |
| }, |
| { |
| "completion_length": 230.625, |
| "epoch": 2.3808, |
| "grad_norm": 1.5272488594055176, |
| "kl": 0.09130859375, |
| "learning_rate": 7e-08, |
| "loss": 0.0009, |
| "reward": 3.9410911798477173, |
| "reward_std": 0.017650599591434002, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9447437524795532, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991883039474487, |
| "step": 745 |
| }, |
| { |
| "completion_length": 193.3125, |
| "epoch": 2.384, |
| "grad_norm": 2.9624199867248535, |
| "kl": 0.165771484375, |
| "learning_rate": 6.875e-08, |
| "loss": 0.0017, |
| "reward": 3.8940484523773193, |
| "reward_std": 0.060107991099357605, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9106853604316711, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9878502786159515, |
| "step": 746 |
| }, |
| { |
| "completion_length": 216.34375, |
| "epoch": 2.3872, |
| "grad_norm": 1.623085379600525, |
| "kl": 0.08544921875, |
| "learning_rate": 6.75e-08, |
| "loss": 0.0009, |
| "reward": 3.9699491262435913, |
| "reward_std": 0.016816058196127415, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9752996861934662, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9946492910385132, |
| "step": 747 |
| }, |
| { |
| "completion_length": 202.125, |
| "epoch": 2.3904, |
| "grad_norm": 1.5331361293792725, |
| "kl": 0.12841796875, |
| "learning_rate": 6.625e-08, |
| "loss": 0.0013, |
| "reward": 3.922391891479492, |
| "reward_std": 0.008891359670087695, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9298486709594727, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9925432503223419, |
| "step": 748 |
| }, |
| { |
| "completion_length": 242.6875, |
| "epoch": 2.3936, |
| "grad_norm": 1.3326294422149658, |
| "kl": 0.095947265625, |
| "learning_rate": 6.5e-08, |
| "loss": 0.001, |
| "reward": 3.95425808429718, |
| "reward_std": 0.009861439000815153, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9546802639961243, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995777010917664, |
| "step": 749 |
| }, |
| { |
| "completion_length": 175.90625, |
| "epoch": 2.3968, |
| "grad_norm": 0.9046992063522339, |
| "kl": 0.112060546875, |
| "learning_rate": 6.375e-08, |
| "loss": 0.0011, |
| "reward": 3.977583885192871, |
| "reward_std": 0.006871582940220833, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9775838255882263, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 750 |
| }, |
| { |
| "completion_length": 215.59375, |
| "epoch": 2.4, |
| "grad_norm": 3.0961620807647705, |
| "kl": 0.089111328125, |
| "learning_rate": 6.25e-08, |
| "loss": 0.0009, |
| "reward": 3.9490163326263428, |
| "reward_std": 0.012763194739818573, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9750434756278992, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9768137633800507, |
| "step": 751 |
| }, |
| { |
| "completion_length": 239.84375, |
| "epoch": 2.4032, |
| "grad_norm": 0.9473263621330261, |
| "kl": 0.102783203125, |
| "learning_rate": 6.125e-08, |
| "loss": 0.001, |
| "reward": 3.953715443611145, |
| "reward_std": 0.018719897605478764, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9566626846790314, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999456524848938, |
| "step": 752 |
| }, |
| { |
| "completion_length": 177.09375, |
| "epoch": 2.4064, |
| "grad_norm": 0.7227364182472229, |
| "kl": 0.115234375, |
| "learning_rate": 6e-08, |
| "loss": 0.0012, |
| "reward": 3.9009724855422974, |
| "reward_std": 0.1057232718449086, |
| "rewards/answer_entity_reward": 0.96875, |
| "rewards/answer_wer_reward": 0.9504120945930481, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.981810450553894, |
| "step": 753 |
| }, |
| { |
| "completion_length": 215.96875, |
| "epoch": 2.4096, |
| "grad_norm": 3.616448163986206, |
| "kl": 0.103759765625, |
| "learning_rate": 5.8749999999999993e-08, |
| "loss": 0.001, |
| "reward": 3.936669707298279, |
| "reward_std": 0.01544360350817442, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9422976672649384, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978442788124084, |
| "step": 754 |
| }, |
| { |
| "completion_length": 222.03125, |
| "epoch": 2.4128, |
| "grad_norm": 5.449378967285156, |
| "kl": 0.08203125, |
| "learning_rate": 5.75e-08, |
| "loss": 0.0008, |
| "reward": 3.9294843673706055, |
| "reward_std": 0.05806633085012436, |
| "rewards/answer_entity_reward": 0.9763257801532745, |
| "rewards/answer_wer_reward": 0.9556067883968353, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975519478321075, |
| "step": 755 |
| }, |
| { |
| "completion_length": 200.3125, |
| "epoch": 2.416, |
| "grad_norm": 2.46901798248291, |
| "kl": 0.119873046875, |
| "learning_rate": 5.625e-08, |
| "loss": 0.0012, |
| "reward": 3.888831377029419, |
| "reward_std": 0.015442279167473316, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9264732301235199, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9651989638805389, |
| "step": 756 |
| }, |
| { |
| "completion_length": 237.21875, |
| "epoch": 2.4192, |
| "grad_norm": 1.3007749319076538, |
| "kl": 0.095947265625, |
| "learning_rate": 5.4999999999999996e-08, |
| "loss": 0.001, |
| "reward": 3.933607816696167, |
| "reward_std": 0.023877738043665886, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9380720853805542, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 757 |
| }, |
| { |
| "completion_length": 151.78125, |
| "epoch": 2.4224, |
| "grad_norm": 0.7467179894447327, |
| "kl": 0.099853515625, |
| "learning_rate": 5.3749999999999995e-08, |
| "loss": 0.001, |
| "reward": 3.9564812183380127, |
| "reward_std": 0.004365669563412666, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9648145437240601, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 758 |
| }, |
| { |
| "completion_length": 232.125, |
| "epoch": 2.4256, |
| "grad_norm": 1.5784400701522827, |
| "kl": 0.1041259765625, |
| "learning_rate": 5.2499999999999994e-08, |
| "loss": 0.001, |
| "reward": 3.9412447214126587, |
| "reward_std": 0.02170270448550582, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9489176869392395, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9967913925647736, |
| "step": 759 |
| }, |
| { |
| "completion_length": 215.4375, |
| "epoch": 2.4288, |
| "grad_norm": 3.9008543491363525, |
| "kl": 0.1298828125, |
| "learning_rate": 5.124999999999999e-08, |
| "loss": 0.0013, |
| "reward": 3.8311843872070312, |
| "reward_std": 0.05369440279901028, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9393357634544373, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8918486833572388, |
| "step": 760 |
| }, |
| { |
| "completion_length": 219.65625, |
| "epoch": 2.432, |
| "grad_norm": 4.4970526695251465, |
| "kl": 0.09765625, |
| "learning_rate": 5e-08, |
| "loss": 0.001, |
| "reward": 3.9511146545410156, |
| "reward_std": 0.01855921559035778, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9531445503234863, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99980828166008, |
| "step": 761 |
| }, |
| { |
| "completion_length": 210.21875, |
| "epoch": 2.4352, |
| "grad_norm": 0.9267875552177429, |
| "kl": 0.096923828125, |
| "learning_rate": 4.8749999999999996e-08, |
| "loss": 0.001, |
| "reward": 3.930490016937256, |
| "reward_std": 0.013515972066670656, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9386539459228516, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987804591655731, |
| "step": 762 |
| }, |
| { |
| "completion_length": 196.625, |
| "epoch": 2.4384, |
| "grad_norm": 2.2344725131988525, |
| "kl": 0.1025390625, |
| "learning_rate": 4.7499999999999995e-08, |
| "loss": 0.001, |
| "reward": 3.9080734252929688, |
| "reward_std": 0.035708663053810596, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9325708150863647, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9779064655303955, |
| "step": 763 |
| }, |
| { |
| "completion_length": 225.09375, |
| "epoch": 2.4416, |
| "grad_norm": 1.588053822517395, |
| "kl": 0.095947265625, |
| "learning_rate": 4.625e-08, |
| "loss": 0.001, |
| "reward": 3.9343831539154053, |
| "reward_std": 0.016630763188004494, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9485695362091064, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9892857074737549, |
| "step": 764 |
| }, |
| { |
| "completion_length": 247.09375, |
| "epoch": 2.4448, |
| "grad_norm": 1.1707122325897217, |
| "kl": 0.09228515625, |
| "learning_rate": 4.5e-08, |
| "loss": 0.0009, |
| "reward": 3.8900914192199707, |
| "reward_std": 0.06134997680783272, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.908464640378952, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9837101101875305, |
| "step": 765 |
| }, |
| { |
| "completion_length": 241.65625, |
| "epoch": 2.448, |
| "grad_norm": 2.8273398876190186, |
| "kl": 0.110595703125, |
| "learning_rate": 4.375e-08, |
| "loss": 0.0011, |
| "reward": 3.890642285346985, |
| "reward_std": 0.021557598374783993, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.8973233997821808, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9949637055397034, |
| "step": 766 |
| }, |
| { |
| "completion_length": 216.96875, |
| "epoch": 2.4512, |
| "grad_norm": 1.1206011772155762, |
| "kl": 0.095947265625, |
| "learning_rate": 4.2500000000000003e-08, |
| "loss": 0.001, |
| "reward": 3.9437450170516968, |
| "reward_std": 0.008607666241005063, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9612680077552795, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9824769496917725, |
| "step": 767 |
| }, |
| { |
| "completion_length": 230.3125, |
| "epoch": 2.4544, |
| "grad_norm": 15.688488960266113, |
| "kl": 0.085693359375, |
| "learning_rate": 4.125e-08, |
| "loss": 0.0009, |
| "reward": 3.9394757747650146, |
| "reward_std": 0.030962621793150902, |
| "rewards/answer_entity_reward": 0.9806547462940216, |
| "rewards/answer_wer_reward": 0.9617869853973389, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970340430736542, |
| "step": 768 |
| }, |
| { |
| "completion_length": 248.46875, |
| "epoch": 2.4576000000000002, |
| "grad_norm": 1.5618577003479004, |
| "kl": 0.16162109375, |
| "learning_rate": 4e-08, |
| "loss": 0.0016, |
| "reward": 3.9416744709014893, |
| "reward_std": 0.016101540066301823, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9428056180477142, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988687336444855, |
| "step": 769 |
| }, |
| { |
| "completion_length": 234.0625, |
| "epoch": 2.4608, |
| "grad_norm": 3.257962226867676, |
| "kl": 0.16259765625, |
| "learning_rate": 3.875e-08, |
| "loss": 0.0016, |
| "reward": 3.9221439361572266, |
| "reward_std": 0.02909655123949051, |
| "rewards/answer_entity_reward": 0.9871794581413269, |
| "rewards/answer_wer_reward": 0.9352968335151672, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996675550937653, |
| "step": 770 |
| }, |
| { |
| "completion_length": 245.78125, |
| "epoch": 2.464, |
| "grad_norm": 2.2879505157470703, |
| "kl": 0.083251953125, |
| "learning_rate": 3.75e-08, |
| "loss": 0.0008, |
| "reward": 3.9263609647750854, |
| "reward_std": 0.022196561098098755, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9454439282417297, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9864316880702972, |
| "step": 771 |
| }, |
| { |
| "completion_length": 158.78125, |
| "epoch": 2.4672, |
| "grad_norm": 2.214250087738037, |
| "kl": 0.1328125, |
| "learning_rate": 3.625e-08, |
| "loss": 0.0013, |
| "reward": 3.883350372314453, |
| "reward_std": 0.04219530359841883, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9803332090377808, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9030172228813171, |
| "step": 772 |
| }, |
| { |
| "completion_length": 229.25, |
| "epoch": 2.4704, |
| "grad_norm": 1.8548256158828735, |
| "kl": 0.10205078125, |
| "learning_rate": 3.5e-08, |
| "loss": 0.001, |
| "reward": 3.9539661407470703, |
| "reward_std": 0.011240935884416103, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9542403221130371, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999725878238678, |
| "step": 773 |
| }, |
| { |
| "completion_length": 227.9375, |
| "epoch": 2.4736000000000002, |
| "grad_norm": 2.2110090255737305, |
| "kl": 0.0927734375, |
| "learning_rate": 3.375e-08, |
| "loss": 0.0009, |
| "reward": 3.9344996213912964, |
| "reward_std": 0.011312551097944379, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9557085335254669, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9787910580635071, |
| "step": 774 |
| }, |
| { |
| "completion_length": 250.9375, |
| "epoch": 2.4768, |
| "grad_norm": 25.519304275512695, |
| "kl": 0.1328125, |
| "learning_rate": 3.25e-08, |
| "loss": 0.0013, |
| "reward": 3.915758967399597, |
| "reward_std": 0.015426212921738625, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9174197912216187, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983391761779785, |
| "step": 775 |
| }, |
| { |
| "completion_length": 223.65625, |
| "epoch": 2.48, |
| "grad_norm": 3.6137807369232178, |
| "kl": 0.115966796875, |
| "learning_rate": 3.125e-08, |
| "loss": 0.0012, |
| "reward": 3.939508318901062, |
| "reward_std": 0.00902418838813901, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9407406747341156, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987677037715912, |
| "step": 776 |
| }, |
| { |
| "completion_length": 248.0, |
| "epoch": 2.4832, |
| "grad_norm": 1.4470294713974, |
| "kl": 0.16015625, |
| "learning_rate": 3e-08, |
| "loss": 0.0016, |
| "reward": 3.8835391998291016, |
| "reward_std": 0.029840022325515747, |
| "rewards/answer_entity_reward": 0.9926948249340057, |
| "rewards/answer_wer_reward": 0.9006942212581635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9901500642299652, |
| "step": 777 |
| }, |
| { |
| "completion_length": 174.3125, |
| "epoch": 2.4864, |
| "grad_norm": 2.8671512603759766, |
| "kl": 0.12353515625, |
| "learning_rate": 2.875e-08, |
| "loss": 0.0012, |
| "reward": 3.9390430450439453, |
| "reward_std": 0.029761829413473606, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9475694894790649, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938772618770599, |
| "step": 778 |
| }, |
| { |
| "completion_length": 217.0, |
| "epoch": 2.4896, |
| "grad_norm": 1.7183799743652344, |
| "kl": 0.095458984375, |
| "learning_rate": 2.7499999999999998e-08, |
| "loss": 0.001, |
| "reward": 3.922086715698242, |
| "reward_std": 0.01339792925864458, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9282321929931641, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938544631004333, |
| "step": 779 |
| }, |
| { |
| "completion_length": 205.09375, |
| "epoch": 2.4928, |
| "grad_norm": 2.424999475479126, |
| "kl": 0.102294921875, |
| "learning_rate": 2.6249999999999997e-08, |
| "loss": 0.001, |
| "reward": 3.944626212120056, |
| "reward_std": 0.038148084189742804, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9640980660915375, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9805281758308411, |
| "step": 780 |
| }, |
| { |
| "completion_length": 220.6875, |
| "epoch": 2.496, |
| "grad_norm": 1.739138126373291, |
| "kl": 0.09765625, |
| "learning_rate": 2.5e-08, |
| "loss": 0.001, |
| "reward": 3.943056583404541, |
| "reward_std": 0.025130684953182936, |
| "rewards/answer_entity_reward": 0.9871794581413269, |
| "rewards/answer_wer_reward": 0.9561585485935211, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997184872627258, |
| "step": 781 |
| }, |
| { |
| "completion_length": 199.375, |
| "epoch": 2.4992, |
| "grad_norm": 1.3409775495529175, |
| "kl": 0.083984375, |
| "learning_rate": 2.3749999999999998e-08, |
| "loss": 0.0008, |
| "reward": 3.9247756004333496, |
| "reward_std": 0.021664155647158623, |
| "rewards/answer_entity_reward": 0.9902146756649017, |
| "rewards/answer_wer_reward": 0.9345609843730927, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 782 |
| }, |
| { |
| "completion_length": 221.28125, |
| "epoch": 2.5023999999999997, |
| "grad_norm": 1.9740352630615234, |
| "kl": 0.099853515625, |
| "learning_rate": 2.25e-08, |
| "loss": 0.001, |
| "reward": 3.955259919166565, |
| "reward_std": 0.010415108175948262, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9565965533256531, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986633956432343, |
| "step": 783 |
| }, |
| { |
| "completion_length": 235.40625, |
| "epoch": 2.5056000000000003, |
| "grad_norm": 7.616406440734863, |
| "kl": 0.144287109375, |
| "learning_rate": 2.1250000000000002e-08, |
| "loss": 0.0014, |
| "reward": 3.9511306285858154, |
| "reward_std": 0.011523132212460041, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9639480412006378, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9871825873851776, |
| "step": 784 |
| }, |
| { |
| "completion_length": 205.84375, |
| "epoch": 2.5088, |
| "grad_norm": 3.1992883682250977, |
| "kl": 0.107421875, |
| "learning_rate": 2e-08, |
| "loss": 0.0011, |
| "reward": 3.92184841632843, |
| "reward_std": 0.016722742468118668, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9461718797683716, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9756765961647034, |
| "step": 785 |
| }, |
| { |
| "completion_length": 225.6875, |
| "epoch": 2.512, |
| "grad_norm": 1.2884989976882935, |
| "kl": 0.139404296875, |
| "learning_rate": 1.875e-08, |
| "loss": 0.0014, |
| "reward": 3.946265697479248, |
| "reward_std": 0.017564056208357215, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9507300853729248, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 786 |
| }, |
| { |
| "completion_length": 176.96875, |
| "epoch": 2.5152, |
| "grad_norm": 3.3580868244171143, |
| "kl": 0.1982421875, |
| "learning_rate": 1.75e-08, |
| "loss": 0.002, |
| "reward": 3.8963418006896973, |
| "reward_std": 0.04480761382728815, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9633896946907043, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9329521358013153, |
| "step": 787 |
| }, |
| { |
| "completion_length": 260.28125, |
| "epoch": 2.5183999999999997, |
| "grad_norm": 1.0715585947036743, |
| "kl": 0.105712890625, |
| "learning_rate": 1.625e-08, |
| "loss": 0.0011, |
| "reward": 3.904552698135376, |
| "reward_std": 0.034514338709414005, |
| "rewards/answer_entity_reward": 0.9831239283084869, |
| "rewards/answer_wer_reward": 0.9217879772186279, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996408224105835, |
| "step": 788 |
| }, |
| { |
| "completion_length": 251.625, |
| "epoch": 2.5216, |
| "grad_norm": 3.5006961822509766, |
| "kl": 0.080322265625, |
| "learning_rate": 1.5e-08, |
| "loss": 0.0008, |
| "reward": 3.9146039485931396, |
| "reward_std": 0.028964843600988388, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9146038293838501, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 789 |
| }, |
| { |
| "completion_length": 201.0625, |
| "epoch": 2.5248, |
| "grad_norm": 5.0292534828186035, |
| "kl": 0.1396484375, |
| "learning_rate": 1.3749999999999999e-08, |
| "loss": 0.0014, |
| "reward": 3.897321939468384, |
| "reward_std": 0.01521459873765707, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9714652001857758, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9258567690849304, |
| "step": 790 |
| }, |
| { |
| "completion_length": 185.03125, |
| "epoch": 2.528, |
| "grad_norm": 2.234839916229248, |
| "kl": 0.1005859375, |
| "learning_rate": 1.25e-08, |
| "loss": 0.001, |
| "reward": 3.930277109146118, |
| "reward_std": 0.026013732887804508, |
| "rewards/answer_entity_reward": 0.9908459782600403, |
| "rewards/answer_wer_reward": 0.9424907863140106, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969403147697449, |
| "step": 791 |
| }, |
| { |
| "completion_length": 185.84375, |
| "epoch": 2.5312, |
| "grad_norm": 0.5959092974662781, |
| "kl": 0.0947265625, |
| "learning_rate": 1.125e-08, |
| "loss": 0.0009, |
| "reward": 3.954566478729248, |
| "reward_std": 0.011145764729008079, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9578571021556854, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995503425598145, |
| "step": 792 |
| }, |
| { |
| "completion_length": 197.84375, |
| "epoch": 2.5343999999999998, |
| "grad_norm": 2.0784664154052734, |
| "kl": 0.114501953125, |
| "learning_rate": 1e-08, |
| "loss": 0.0011, |
| "reward": 3.885765790939331, |
| "reward_std": 0.012588209472596645, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9541250765323639, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.931640625, |
| "step": 793 |
| }, |
| { |
| "completion_length": 190.03125, |
| "epoch": 2.5376, |
| "grad_norm": 1.7104955911636353, |
| "kl": 0.224609375, |
| "learning_rate": 8.75e-09, |
| "loss": 0.0022, |
| "reward": 3.824442148208618, |
| "reward_std": 0.039704530499875546, |
| "rewards/answer_entity_reward": 0.9877451062202454, |
| "rewards/answer_wer_reward": 0.919477641582489, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9172193706035614, |
| "step": 794 |
| }, |
| { |
| "completion_length": 221.4375, |
| "epoch": 2.5408, |
| "grad_norm": 2.524031162261963, |
| "kl": 0.09521484375, |
| "learning_rate": 7.5e-09, |
| "loss": 0.001, |
| "reward": 3.9210238456726074, |
| "reward_std": 0.03186593018472195, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9480306208133698, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9729932844638824, |
| "step": 795 |
| }, |
| { |
| "completion_length": 149.9375, |
| "epoch": 2.544, |
| "grad_norm": 2.592532157897949, |
| "kl": 0.116943359375, |
| "learning_rate": 6.25e-09, |
| "loss": 0.0012, |
| "reward": 3.835923910140991, |
| "reward_std": 0.016047589480876923, |
| "rewards/answer_entity_reward": 0.9942555129528046, |
| "rewards/answer_wer_reward": 0.8419776558876038, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999690592288971, |
| "step": 796 |
| }, |
| { |
| "completion_length": 196.3125, |
| "epoch": 2.5472, |
| "grad_norm": 1.1898647546768188, |
| "kl": 0.0810546875, |
| "learning_rate": 5e-09, |
| "loss": 0.0008, |
| "reward": 3.9655500650405884, |
| "reward_std": 0.012615942629054189, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9718000292778015, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 797 |
| }, |
| { |
| "completion_length": 220.3125, |
| "epoch": 2.5504, |
| "grad_norm": 1.6702154874801636, |
| "kl": 0.093505859375, |
| "learning_rate": 3.75e-09, |
| "loss": 0.0009, |
| "reward": 3.945963501930237, |
| "reward_std": 0.007131826248951256, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9533165395259857, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 798 |
| }, |
| { |
| "completion_length": 223.65625, |
| "epoch": 2.5536, |
| "grad_norm": 46.13692855834961, |
| "kl": 0.11083984375, |
| "learning_rate": 2.5e-09, |
| "loss": 0.0011, |
| "reward": 3.8597129583358765, |
| "reward_std": 0.09108205512166023, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9162732660770416, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9770934879779816, |
| "step": 799 |
| }, |
| { |
| "completion_length": 233.84375, |
| "epoch": 2.5568, |
| "grad_norm": 1.1842632293701172, |
| "kl": 0.108642578125, |
| "learning_rate": 1.25e-09, |
| "loss": 0.0011, |
| "reward": 3.9367305040359497, |
| "reward_std": 0.01876719295978546, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9404171705245972, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983966648578644, |
| "step": 800 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|