{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5568, "eval_steps": 500, "global_step": 800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 175.78125, "epoch": 0.0032, "grad_norm": 5.3713698387146, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 2.691648483276367, "reward_std": 0.9842272102832794, "rewards/answer_entity_reward": 0.8998827934265137, "rewards/answer_wer_reward": 0.6144023239612579, "rewards/format_reward": 0.65625, "rewards/think_ocr_reward": 0.5211134254932404, "step": 1 }, { "completion_length": 205.1875, "epoch": 0.0064, "grad_norm": 12.984394073486328, "kl": 0.000339508056640625, "learning_rate": 9.9875e-07, "loss": 0.0, "reward": 2.8287014961242676, "reward_std": 1.0050830841064453, "rewards/answer_entity_reward": 0.7303222715854645, "rewards/answer_wer_reward": 0.47497838735580444, "rewards/format_reward": 0.875, "rewards/think_ocr_reward": 0.7484009563922882, "step": 2 }, { "completion_length": 203.09375, "epoch": 0.0096, "grad_norm": 5.166553497314453, "kl": 0.00044536590576171875, "learning_rate": 9.975e-07, "loss": 0.0, "reward": 3.498788595199585, "reward_std": 0.2545953020453453, "rewards/answer_entity_reward": 0.9527146220207214, "rewards/answer_wer_reward": 0.7393675744533539, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8067062795162201, "step": 3 }, { "completion_length": 206.1875, "epoch": 0.0128, "grad_norm": 2.356685161590576, "kl": 0.0009002685546875, "learning_rate": 9.9625e-07, "loss": 0.0, "reward": 3.299022078514099, "reward_std": 0.5456227362155914, "rewards/answer_entity_reward": 0.8519714176654816, "rewards/answer_wer_reward": 0.6592651903629303, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8190353512763977, "step": 4 }, { "completion_length": 223.28125, "epoch": 0.016, "grad_norm": 3.5642409324645996, "kl": 0.001827239990234375, "learning_rate": 9.95e-07, "loss": 0.0, "reward": 2.8498330116271973, "reward_std": 0.6001743674278259, "rewards/answer_entity_reward": 0.8803278803825378, "rewards/answer_wer_reward": 0.45287495851516724, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.5478802025318146, "step": 5 }, { "completion_length": 210.28125, "epoch": 0.0192, "grad_norm": 2.062991142272949, "kl": 0.004608154296875, "learning_rate": 9.9375e-07, "loss": 0.0, "reward": 3.345002055168152, "reward_std": 0.5891430526971817, "rewards/answer_entity_reward": 0.8334160447120667, "rewards/answer_wer_reward": 0.7313504219055176, "rewards/format_reward": 0.875, "rewards/think_ocr_reward": 0.9052354693412781, "step": 6 }, { "completion_length": 204.9375, "epoch": 0.0224, "grad_norm": 2.77138090133667, "kl": 0.01922607421875, "learning_rate": 9.925e-07, "loss": 0.0002, "reward": 3.3531779050827026, "reward_std": 0.7286678552627563, "rewards/answer_entity_reward": 0.8474657833576202, "rewards/answer_wer_reward": 0.7306987345218658, "rewards/format_reward": 0.90625, "rewards/think_ocr_reward": 0.8687634468078613, "step": 7 }, { "completion_length": 242.0, "epoch": 0.0256, "grad_norm": 1.9377678632736206, "kl": 0.00897216796875, "learning_rate": 9.912499999999998e-07, "loss": 0.0001, "reward": 3.538244366645813, "reward_std": 0.26357416808605194, "rewards/answer_entity_reward": 0.8956374526023865, "rewards/answer_wer_reward": 0.795194149017334, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8786628246307373, "step": 8 }, { "completion_length": 181.28125, "epoch": 0.0288, "grad_norm": 2.9018149375915527, "kl": 0.0250244140625, "learning_rate": 9.9e-07, "loss": 0.0002, "reward": 3.6827263832092285, "reward_std": 0.21120695769786835, "rewards/answer_entity_reward": 0.9178647994995117, "rewards/answer_wer_reward": 0.8329994082450867, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9318622648715973, "step": 9 }, { "completion_length": 211.1875, "epoch": 0.032, "grad_norm": 3.4354376792907715, "kl": 0.02166748046875, "learning_rate": 9.8875e-07, "loss": 0.0002, "reward": 3.6928374767303467, "reward_std": 0.21010804921388626, "rewards/answer_entity_reward": 0.8995116055011749, "rewards/answer_wer_reward": 0.8549435138702393, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9383824467658997, "step": 10 }, { "completion_length": 165.40625, "epoch": 0.0352, "grad_norm": 5.1537652015686035, "kl": 0.0521240234375, "learning_rate": 9.875e-07, "loss": 0.0005, "reward": 3.500484824180603, "reward_std": 0.5196337550878525, "rewards/answer_entity_reward": 0.9380581974983215, "rewards/answer_wer_reward": 0.7917109727859497, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.833215594291687, "step": 11 }, { "completion_length": 223.8125, "epoch": 0.0384, "grad_norm": 3.7026002407073975, "kl": 0.02813720703125, "learning_rate": 9.862499999999999e-07, "loss": 0.0003, "reward": 3.7366983890533447, "reward_std": 0.19402557611465454, "rewards/answer_entity_reward": 0.9315968751907349, "rewards/answer_wer_reward": 0.836162269115448, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9689393639564514, "step": 12 }, { "completion_length": 201.34375, "epoch": 0.0416, "grad_norm": 4.624758243560791, "kl": 0.0487060546875, "learning_rate": 9.849999999999999e-07, "loss": 0.0005, "reward": 3.6485583782196045, "reward_std": 0.19490989297628403, "rewards/answer_entity_reward": 0.9538419842720032, "rewards/answer_wer_reward": 0.8439803719520569, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8819859325885773, "step": 13 }, { "completion_length": 197.53125, "epoch": 0.0448, "grad_norm": 5.349609375, "kl": 0.03363037109375, "learning_rate": 9.8375e-07, "loss": 0.0003, "reward": 3.579698920249939, "reward_std": 0.12941206991672516, "rewards/answer_entity_reward": 0.9086007177829742, "rewards/answer_wer_reward": 0.8474478721618652, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8236501812934875, "step": 14 }, { "completion_length": 180.5625, "epoch": 0.048, "grad_norm": 5.51423454284668, "kl": 0.0633544921875, "learning_rate": 9.825e-07, "loss": 0.0006, "reward": 3.6973917484283447, "reward_std": 0.15208109095692635, "rewards/answer_entity_reward": 0.9153402149677277, "rewards/answer_wer_reward": 0.8323444426059723, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9497069418430328, "step": 15 }, { "completion_length": 205.03125, "epoch": 0.0512, "grad_norm": 3.2830357551574707, "kl": 0.059326171875, "learning_rate": 9.8125e-07, "loss": 0.0006, "reward": 3.477460026741028, "reward_std": 0.43340209126472473, "rewards/answer_entity_reward": 0.8780590891838074, "rewards/answer_wer_reward": 0.7556597292423248, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9062411189079285, "step": 16 }, { "completion_length": 243.84375, "epoch": 0.0544, "grad_norm": 2.257538080215454, "kl": 0.03240966796875, "learning_rate": 9.8e-07, "loss": 0.0003, "reward": 3.6340386867523193, "reward_std": 0.15337160229682922, "rewards/answer_entity_reward": 0.8995862305164337, "rewards/answer_wer_reward": 0.7731227576732635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9613295793533325, "step": 17 }, { "completion_length": 236.125, "epoch": 0.0576, "grad_norm": 2.133462429046631, "kl": 0.0579833984375, "learning_rate": 9.7875e-07, "loss": 0.0006, "reward": 3.730382204055786, "reward_std": 0.1639438048005104, "rewards/answer_entity_reward": 0.9158936738967896, "rewards/answer_wer_reward": 0.8535431623458862, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9609452486038208, "step": 18 }, { "completion_length": 253.84375, "epoch": 0.0608, "grad_norm": 2.6911232471466064, "kl": 0.042236328125, "learning_rate": 9.775e-07, "loss": 0.0004, "reward": 3.6918214559555054, "reward_std": 0.24240515753626823, "rewards/answer_entity_reward": 0.908495306968689, "rewards/answer_wer_reward": 0.8162411749362946, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9983349442481995, "step": 19 }, { "completion_length": 195.3125, "epoch": 0.064, "grad_norm": 2.856860876083374, "kl": 0.0548095703125, "learning_rate": 9.7625e-07, "loss": 0.0005, "reward": 3.570927858352661, "reward_std": 0.38515634275972843, "rewards/answer_entity_reward": 0.885971337556839, "rewards/answer_wer_reward": 0.7937527894973755, "rewards/format_reward": 0.9375, "rewards/think_ocr_reward": 0.9537037014961243, "step": 20 }, { "completion_length": 200.21875, "epoch": 0.0672, "grad_norm": 2.869398355484009, "kl": 0.059814453125, "learning_rate": 9.75e-07, "loss": 0.0006, "reward": 3.7599644660949707, "reward_std": 0.13445724919438362, "rewards/answer_entity_reward": 0.9744762480258942, "rewards/answer_wer_reward": 0.8406906425952911, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9447975754737854, "step": 21 }, { "completion_length": 228.9375, "epoch": 0.0704, "grad_norm": 2.2584221363067627, "kl": 0.03387451171875, "learning_rate": 9.7375e-07, "loss": 0.0003, "reward": 3.5859320163726807, "reward_std": 0.14986564964056015, "rewards/answer_entity_reward": 0.9357894659042358, "rewards/answer_wer_reward": 0.8099571466445923, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8401854038238525, "step": 22 }, { "completion_length": 219.78125, "epoch": 0.0736, "grad_norm": 2.140197277069092, "kl": 0.0499267578125, "learning_rate": 9.725e-07, "loss": 0.0005, "reward": 3.755205750465393, "reward_std": 0.09474575892090797, "rewards/answer_entity_reward": 0.9487689137458801, "rewards/answer_wer_reward": 0.871625155210495, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9348115921020508, "step": 23 }, { "completion_length": 206.28125, "epoch": 0.0768, "grad_norm": 3.823035478591919, "kl": 0.13916015625, "learning_rate": 9.712499999999998e-07, "loss": 0.0014, "reward": 3.7580984830856323, "reward_std": 0.07033384963870049, "rewards/answer_entity_reward": 0.9635280966758728, "rewards/answer_wer_reward": 0.8670244812965393, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9275458753108978, "step": 24 }, { "completion_length": 141.875, "epoch": 0.08, "grad_norm": 3.9088714122772217, "kl": 0.10791015625, "learning_rate": 9.7e-07, "loss": 0.0011, "reward": 3.7762891054153442, "reward_std": 0.04259665124118328, "rewards/answer_entity_reward": 0.9848519563674927, "rewards/answer_wer_reward": 0.8006402850151062, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9907970130443573, "step": 25 }, { "completion_length": 205.21875, "epoch": 0.0832, "grad_norm": 2.103792905807495, "kl": 0.065185546875, "learning_rate": 9.6875e-07, "loss": 0.0007, "reward": 3.811550498008728, "reward_std": 0.11633584462106228, "rewards/answer_entity_reward": 0.9553370177745819, "rewards/answer_wer_reward": 0.9040265679359436, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9834368824958801, "step": 26 }, { "completion_length": 228.78125, "epoch": 0.0864, "grad_norm": 2.7897403240203857, "kl": 0.0435791015625, "learning_rate": 9.675e-07, "loss": 0.0004, "reward": 3.788088798522949, "reward_std": 0.10910476744174957, "rewards/answer_entity_reward": 0.9546680450439453, "rewards/answer_wer_reward": 0.872740238904953, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9606806039810181, "step": 27 }, { "completion_length": 210.5, "epoch": 0.0896, "grad_norm": 1.2101320028305054, "kl": 0.0552978515625, "learning_rate": 9.6625e-07, "loss": 0.0006, "reward": 3.8938169479370117, "reward_std": 0.04485907219350338, "rewards/answer_entity_reward": 0.974581778049469, "rewards/answer_wer_reward": 0.9207929372787476, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984423518180847, "step": 28 }, { "completion_length": 233.78125, "epoch": 0.0928, "grad_norm": 2.7460684776306152, "kl": 0.035400390625, "learning_rate": 9.649999999999999e-07, "loss": 0.0004, "reward": 3.662728428840637, "reward_std": 0.20339616388082504, "rewards/answer_entity_reward": 0.8774791359901428, "rewards/answer_wer_reward": 0.8000176846981049, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9852316677570343, "step": 29 }, { "completion_length": 199.59375, "epoch": 0.096, "grad_norm": 1.8316643238067627, "kl": 0.0596923828125, "learning_rate": 9.637499999999999e-07, "loss": 0.0006, "reward": 3.890167713165283, "reward_std": 0.037449197843670845, "rewards/answer_entity_reward": 0.96912881731987, "rewards/answer_wer_reward": 0.9220606982707977, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989781975746155, "step": 30 }, { "completion_length": 226.125, "epoch": 0.0992, "grad_norm": 2.0417702198028564, "kl": 0.0440673828125, "learning_rate": 9.624999999999999e-07, "loss": 0.0004, "reward": 3.8260613679885864, "reward_std": 0.07994803786277771, "rewards/answer_entity_reward": 0.9577426314353943, "rewards/answer_wer_reward": 0.902205765247345, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9661130309104919, "step": 31 }, { "completion_length": 214.5, "epoch": 0.1024, "grad_norm": 4.027645111083984, "kl": 0.1015625, "learning_rate": 9.6125e-07, "loss": 0.001, "reward": 3.7394936084747314, "reward_std": 0.10389792174100876, "rewards/answer_entity_reward": 0.9218434691429138, "rewards/answer_wer_reward": 0.8621510863304138, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9554989635944366, "step": 32 }, { "completion_length": 255.28125, "epoch": 0.1056, "grad_norm": 1.527213454246521, "kl": 0.046875, "learning_rate": 9.6e-07, "loss": 0.0005, "reward": 3.8307132720947266, "reward_std": 0.0552691500633955, "rewards/answer_entity_reward": 0.9554121494293213, "rewards/answer_wer_reward": 0.8765550553798676, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987460970878601, "step": 33 }, { "completion_length": 226.0, "epoch": 0.1088, "grad_norm": 1.822529673576355, "kl": 0.0372314453125, "learning_rate": 9.5875e-07, "loss": 0.0004, "reward": 3.8188695907592773, "reward_std": 0.07392234448343515, "rewards/answer_entity_reward": 0.9491736888885498, "rewards/answer_wer_reward": 0.8781739175319672, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.991521954536438, "step": 34 }, { "completion_length": 230.71875, "epoch": 0.112, "grad_norm": 1.96689772605896, "kl": 0.05322265625, "learning_rate": 9.575e-07, "loss": 0.0005, "reward": 3.839812397956848, "reward_std": 0.04108080454170704, "rewards/answer_entity_reward": 0.9491481184959412, "rewards/answer_wer_reward": 0.8918017745018005, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988624751567841, "step": 35 }, { "completion_length": 181.75, "epoch": 0.1152, "grad_norm": 25.535808563232422, "kl": 0.100830078125, "learning_rate": 9.5625e-07, "loss": 0.001, "reward": 3.8188287019729614, "reward_std": 0.1601814702153206, "rewards/answer_entity_reward": 0.9457894563674927, "rewards/answer_wer_reward": 0.9093815982341766, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9636577069759369, "step": 36 }, { "completion_length": 165.375, "epoch": 0.1184, "grad_norm": 2.886183738708496, "kl": 0.0692138671875, "learning_rate": 9.55e-07, "loss": 0.0007, "reward": 3.8752315044403076, "reward_std": 0.04815678671002388, "rewards/answer_entity_reward": 0.994689553976059, "rewards/answer_wer_reward": 0.9401271045207977, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9404149055480957, "step": 37 }, { "completion_length": 250.40625, "epoch": 0.1216, "grad_norm": 2.9052975177764893, "kl": 0.0467529296875, "learning_rate": 9.5375e-07, "loss": 0.0005, "reward": 3.8545405864715576, "reward_std": 0.04892056295648217, "rewards/answer_entity_reward": 0.9534467458724976, "rewards/answer_wer_reward": 0.9035276472568512, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975661933422089, "step": 38 }, { "completion_length": 234.125, "epoch": 0.1248, "grad_norm": 1.5214505195617676, "kl": 0.04010009765625, "learning_rate": 9.525e-07, "loss": 0.0004, "reward": 3.7642624378204346, "reward_std": 0.06860890984535217, "rewards/answer_entity_reward": 0.9330369234085083, "rewards/answer_wer_reward": 0.8348780572414398, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963473677635193, "step": 39 }, { "completion_length": 222.5, "epoch": 0.128, "grad_norm": 1.4751359224319458, "kl": 0.0521240234375, "learning_rate": 9.5125e-07, "loss": 0.0005, "reward": 3.8170441389083862, "reward_std": 0.06563596054911613, "rewards/answer_entity_reward": 0.9340721964836121, "rewards/answer_wer_reward": 0.8999682068824768, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9830037355422974, "step": 40 }, { "completion_length": 201.84375, "epoch": 0.1312, "grad_norm": 20.2832088470459, "kl": 0.038818359375, "learning_rate": 9.499999999999999e-07, "loss": 0.0004, "reward": 3.7008172273635864, "reward_std": 0.039744822308421135, "rewards/answer_entity_reward": 0.9294143319129944, "rewards/answer_wer_reward": 0.890234112739563, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8811687231063843, "step": 41 }, { "completion_length": 192.09375, "epoch": 0.1344, "grad_norm": 3.430189609527588, "kl": 0.0523681640625, "learning_rate": 9.487499999999999e-07, "loss": 0.0005, "reward": 3.8015908002853394, "reward_std": 0.057819752022624016, "rewards/answer_entity_reward": 0.9672390222549438, "rewards/answer_wer_reward": 0.8474858105182648, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9868659377098083, "step": 42 }, { "completion_length": 215.53125, "epoch": 0.1376, "grad_norm": 16.041494369506836, "kl": 0.0418701171875, "learning_rate": 9.474999999999999e-07, "loss": 0.0004, "reward": 3.730579137802124, "reward_std": 0.11731705069541931, "rewards/answer_entity_reward": 0.9560448527336121, "rewards/answer_wer_reward": 0.8699329495429993, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9046012759208679, "step": 43 }, { "completion_length": 236.78125, "epoch": 0.1408, "grad_norm": 1.6949574947357178, "kl": 0.0352783203125, "learning_rate": 9.462499999999999e-07, "loss": 0.0004, "reward": 3.899806261062622, "reward_std": 0.018219145480543375, "rewards/answer_entity_reward": 0.9738267660140991, "rewards/answer_wer_reward": 0.9316939115524292, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9942856431007385, "step": 44 }, { "completion_length": 246.4375, "epoch": 0.144, "grad_norm": 1.3507007360458374, "kl": 0.0330810546875, "learning_rate": 9.45e-07, "loss": 0.0003, "reward": 3.8328453302383423, "reward_std": 0.06314087565988302, "rewards/answer_entity_reward": 0.9711392819881439, "rewards/answer_wer_reward": 0.8670938909053802, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.994612067937851, "step": 45 }, { "completion_length": 170.28125, "epoch": 0.1472, "grad_norm": 2.2585864067077637, "kl": 0.077392578125, "learning_rate": 9.4375e-07, "loss": 0.0008, "reward": 3.902386784553528, "reward_std": 0.035709235817193985, "rewards/answer_entity_reward": 0.9873873591423035, "rewards/answer_wer_reward": 0.9353838264942169, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9796155691146851, "step": 46 }, { "completion_length": 149.8125, "epoch": 0.1504, "grad_norm": 4.581851005554199, "kl": 0.0452880859375, "learning_rate": 9.425e-07, "loss": 0.0005, "reward": 3.6548960208892822, "reward_std": 0.06261088512837887, "rewards/answer_entity_reward": 0.9477430880069733, "rewards/answer_wer_reward": 0.8129006922245026, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8942522406578064, "step": 47 }, { "completion_length": 216.75, "epoch": 0.1536, "grad_norm": 47.897464752197266, "kl": 0.3621826171875, "learning_rate": 9.4125e-07, "loss": 0.0036, "reward": 3.906231164932251, "reward_std": 0.034966002218425274, "rewards/answer_entity_reward": 0.9823353588581085, "rewards/answer_wer_reward": 0.9293725490570068, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9945231974124908, "step": 48 }, { "completion_length": 196.9375, "epoch": 0.1568, "grad_norm": 2.257028579711914, "kl": 0.0465087890625, "learning_rate": 9.399999999999999e-07, "loss": 0.0005, "reward": 3.8652477264404297, "reward_std": 0.03087481390684843, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9092975854873657, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.958791047334671, "step": 49 }, { "completion_length": 196.9375, "epoch": 0.16, "grad_norm": 4.950622081756592, "kl": 0.0345458984375, "learning_rate": 9.387499999999999e-07, "loss": 0.0003, "reward": 3.824746251106262, "reward_std": 0.058218397200107574, "rewards/answer_entity_reward": 0.9825757443904877, "rewards/answer_wer_reward": 0.9601459503173828, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8820245563983917, "step": 50 }, { "completion_length": 174.8125, "epoch": 0.1632, "grad_norm": 7.211401462554932, "kl": 0.0582275390625, "learning_rate": 9.374999999999999e-07, "loss": 0.0006, "reward": 3.8160147666931152, "reward_std": 0.04299969598650932, "rewards/answer_entity_reward": 0.9790209829807281, "rewards/answer_wer_reward": 0.9350173771381378, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.901976466178894, "step": 51 }, { "completion_length": 248.125, "epoch": 0.1664, "grad_norm": 0.9922041893005371, "kl": 0.0201416015625, "learning_rate": 9.3625e-07, "loss": 0.0002, "reward": 3.8918874263763428, "reward_std": 0.029974642675369978, "rewards/answer_entity_reward": 0.9869123697280884, "rewards/answer_wer_reward": 0.9067046940326691, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982704818248749, "step": 52 }, { "completion_length": 251.59375, "epoch": 0.1696, "grad_norm": 0.9144994020462036, "kl": 0.02001953125, "learning_rate": 9.35e-07, "loss": 0.0002, "reward": 3.782878875732422, "reward_std": 0.04338405467569828, "rewards/answer_entity_reward": 0.9685876965522766, "rewards/answer_wer_reward": 0.8232664167881012, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9910247623920441, "step": 53 }, { "completion_length": 224.5625, "epoch": 0.1728, "grad_norm": 0.8014624118804932, "kl": 0.01904296875, "learning_rate": 9.3375e-07, "loss": 0.0002, "reward": 3.804163098335266, "reward_std": 0.02029208466410637, "rewards/answer_entity_reward": 0.9539299309253693, "rewards/answer_wer_reward": 0.8539278209209442, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9963052868843079, "step": 54 }, { "completion_length": 174.59375, "epoch": 0.176, "grad_norm": 2.5315935611724854, "kl": 0.02862548828125, "learning_rate": 9.325e-07, "loss": 0.0003, "reward": 3.8737215995788574, "reward_std": 0.06625958904623985, "rewards/answer_entity_reward": 0.9887503385543823, "rewards/answer_wer_reward": 0.9215180277824402, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9634531438350677, "step": 55 }, { "completion_length": 239.4375, "epoch": 0.1792, "grad_norm": 1.3654975891113281, "kl": 0.0283203125, "learning_rate": 9.3125e-07, "loss": 0.0003, "reward": 3.8753963708877563, "reward_std": 0.04764867387712002, "rewards/answer_entity_reward": 0.9810132682323456, "rewards/answer_wer_reward": 0.8943831324577332, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 56 }, { "completion_length": 214.75, "epoch": 0.1824, "grad_norm": 1.4159584045410156, "kl": 0.02081298828125, "learning_rate": 9.3e-07, "loss": 0.0002, "reward": 3.8986427783966064, "reward_std": 0.031265249475836754, "rewards/answer_entity_reward": 0.9880681931972504, "rewards/answer_wer_reward": 0.9130412340164185, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99753338098526, "step": 57 }, { "completion_length": 240.46875, "epoch": 0.1856, "grad_norm": 1.1824144124984741, "kl": 0.015960693359375, "learning_rate": 9.287499999999999e-07, "loss": 0.0002, "reward": 3.90795361995697, "reward_std": 0.02096135076135397, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9095984101295471, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 58 }, { "completion_length": 238.09375, "epoch": 0.1888, "grad_norm": 1.165099024772644, "kl": 0.026123046875, "learning_rate": 9.274999999999999e-07, "loss": 0.0003, "reward": 3.9033310413360596, "reward_std": 0.03423699922859669, "rewards/answer_entity_reward": 0.9810605943202972, "rewards/answer_wer_reward": 0.9234386384487152, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988317787647247, "step": 59 }, { "completion_length": 221.84375, "epoch": 0.192, "grad_norm": 2.964642286300659, "kl": 0.02587890625, "learning_rate": 9.2625e-07, "loss": 0.0003, "reward": 3.9065024852752686, "reward_std": 0.022342820651829243, "rewards/answer_entity_reward": 0.978426069021225, "rewards/answer_wer_reward": 0.9289742708206177, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991020858287811, "step": 60 }, { "completion_length": 211.1875, "epoch": 0.1952, "grad_norm": 2.225137233734131, "kl": 0.0374755859375, "learning_rate": 9.25e-07, "loss": 0.0004, "reward": 3.6701877117156982, "reward_std": 0.03641202859580517, "rewards/answer_entity_reward": 0.9796620309352875, "rewards/answer_wer_reward": 0.7723922729492188, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9181334376335144, "step": 61 }, { "completion_length": 150.0, "epoch": 0.1984, "grad_norm": 4.289616584777832, "kl": 0.062744140625, "learning_rate": 9.237499999999999e-07, "loss": 0.0006, "reward": 3.769058585166931, "reward_std": 0.060237159952521324, "rewards/answer_entity_reward": 0.842234879732132, "rewards/answer_wer_reward": 0.9324747323989868, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9943490326404572, "step": 62 }, { "completion_length": 172.59375, "epoch": 0.2016, "grad_norm": 0.9226670861244202, "kl": 0.04541015625, "learning_rate": 9.225e-07, "loss": 0.0005, "reward": 3.9475854635238647, "reward_std": 0.009972278494387865, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9488748908042908, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987106323242188, "step": 63 }, { "completion_length": 186.28125, "epoch": 0.2048, "grad_norm": 2.8787524700164795, "kl": 0.02923583984375, "learning_rate": 9.2125e-07, "loss": 0.0003, "reward": 3.8407578468322754, "reward_std": 0.04633911233395338, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9414158165454865, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8993421196937561, "step": 64 }, { "completion_length": 235.21875, "epoch": 0.208, "grad_norm": 3.289802074432373, "kl": 0.02203369140625, "learning_rate": 9.2e-07, "loss": 0.0002, "reward": 3.8516111373901367, "reward_std": 0.05013709142804146, "rewards/answer_entity_reward": 0.9782106876373291, "rewards/answer_wer_reward": 0.8967941999435425, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9766062498092651, "step": 65 }, { "completion_length": 182.9375, "epoch": 0.2112, "grad_norm": 15.17410659790039, "kl": 0.079833984375, "learning_rate": 9.187499999999999e-07, "loss": 0.0008, "reward": 3.7952799797058105, "reward_std": 0.08191402442753315, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9461319446563721, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8543927371501923, "step": 66 }, { "completion_length": 195.3125, "epoch": 0.2144, "grad_norm": 1.6663379669189453, "kl": 0.0638427734375, "learning_rate": 9.174999999999999e-07, "loss": 0.0006, "reward": 3.916337490081787, "reward_std": 0.018936872947961092, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9398471117019653, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.980977475643158, "step": 67 }, { "completion_length": 211.84375, "epoch": 0.2176, "grad_norm": 2.6255111694335938, "kl": 0.05126953125, "learning_rate": 9.1625e-07, "loss": 0.0005, "reward": 3.9224915504455566, "reward_std": 0.01644316827878356, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9280897378921509, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997242659330368, "step": 68 }, { "completion_length": 170.65625, "epoch": 0.2208, "grad_norm": 3.3114447593688965, "kl": 0.0849609375, "learning_rate": 9.15e-07, "loss": 0.0009, "reward": 3.801788806915283, "reward_std": 0.07587217539548874, "rewards/answer_entity_reward": 0.9663097262382507, "rewards/answer_wer_reward": 0.9007239937782288, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9347550868988037, "step": 69 }, { "completion_length": 194.0, "epoch": 0.224, "grad_norm": 0.908227264881134, "kl": 0.0428466796875, "learning_rate": 9.137499999999999e-07, "loss": 0.0004, "reward": 3.908014178276062, "reward_std": 0.015611772891134024, "rewards/answer_entity_reward": 0.9866071343421936, "rewards/answer_wer_reward": 0.9214071035385132, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 70 }, { "completion_length": 235.15625, "epoch": 0.2272, "grad_norm": 6.288023471832275, "kl": 0.0377197265625, "learning_rate": 9.124999999999999e-07, "loss": 0.0004, "reward": 3.8232322931289673, "reward_std": 0.019494441337883472, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9413564205169678, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8853481709957123, "step": 71 }, { "completion_length": 202.84375, "epoch": 0.2304, "grad_norm": 3.666252374649048, "kl": 0.02703857421875, "learning_rate": 9.1125e-07, "loss": 0.0003, "reward": 3.8724911212921143, "reward_std": 0.036418632604181767, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9379763305187225, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.934514731168747, "step": 72 }, { "completion_length": 192.59375, "epoch": 0.2336, "grad_norm": 2.5703845024108887, "kl": 0.04815673828125, "learning_rate": 9.1e-07, "loss": 0.0005, "reward": 3.819400668144226, "reward_std": 0.09702013805508614, "rewards/answer_entity_reward": 0.9749708473682404, "rewards/answer_wer_reward": 0.8958881497383118, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9485417604446411, "step": 73 }, { "completion_length": 233.96875, "epoch": 0.2368, "grad_norm": 5.079833030700684, "kl": 0.03594970703125, "learning_rate": 9.087499999999999e-07, "loss": 0.0004, "reward": 3.87298047542572, "reward_std": 0.04117333237081766, "rewards/answer_entity_reward": 0.979208379983902, "rewards/answer_wer_reward": 0.8985798060894012, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.995192289352417, "step": 74 }, { "completion_length": 232.09375, "epoch": 0.24, "grad_norm": 1.3709529638290405, "kl": 0.0469970703125, "learning_rate": 9.074999999999999e-07, "loss": 0.0005, "reward": 3.8842471837997437, "reward_std": 0.02406489010900259, "rewards/answer_entity_reward": 0.976262629032135, "rewards/answer_wer_reward": 0.9083134233951569, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996710419654846, "step": 75 }, { "completion_length": 134.46875, "epoch": 0.2432, "grad_norm": 1.7917073965072632, "kl": 0.04345703125, "learning_rate": 9.0625e-07, "loss": 0.0004, "reward": 3.9434739351272583, "reward_std": 0.03165043890476227, "rewards/answer_entity_reward": 0.9853896200656891, "rewards/answer_wer_reward": 0.960752934217453, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973313212394714, "step": 76 }, { "completion_length": 260.75, "epoch": 0.2464, "grad_norm": 2.487206220626831, "kl": 0.02789306640625, "learning_rate": 9.05e-07, "loss": 0.0003, "reward": 3.8149930238723755, "reward_std": 0.04638839513063431, "rewards/answer_entity_reward": 0.9494674503803253, "rewards/answer_wer_reward": 0.8663396835327148, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991858303546906, "step": 77 }, { "completion_length": 221.3125, "epoch": 0.2496, "grad_norm": 1.8767852783203125, "kl": 0.017425537109375, "learning_rate": 9.0375e-07, "loss": 0.0002, "reward": 3.8600170612335205, "reward_std": 0.04895954905077815, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.8933806419372559, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9805253744125366, "step": 78 }, { "completion_length": 230.71875, "epoch": 0.2528, "grad_norm": 3.712688684463501, "kl": 0.054931640625, "learning_rate": 9.024999999999999e-07, "loss": 0.0005, "reward": 3.8847248554229736, "reward_std": 0.012873267754912376, "rewards/answer_entity_reward": 0.9855768978595734, "rewards/answer_wer_reward": 0.9019420742988586, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9972057938575745, "step": 79 }, { "completion_length": 199.3125, "epoch": 0.256, "grad_norm": 1.9246958494186401, "kl": 0.054931640625, "learning_rate": 9.0125e-07, "loss": 0.0005, "reward": 3.8006842136383057, "reward_std": 0.052133604884147644, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9017785787582397, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9033928513526917, "step": 80 }, { "completion_length": 250.21875, "epoch": 0.2592, "grad_norm": 1.160876989364624, "kl": 0.0220947265625, "learning_rate": 9e-07, "loss": 0.0002, "reward": 3.8708144426345825, "reward_std": 0.030466954689472914, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.8790038824081421, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980604648590088, "step": 81 }, { "completion_length": 237.125, "epoch": 0.2624, "grad_norm": 5.024093151092529, "kl": 0.0382080078125, "learning_rate": 8.9875e-07, "loss": 0.0004, "reward": 3.9048351049423218, "reward_std": 0.03107828088104725, "rewards/answer_entity_reward": 0.9851398468017578, "rewards/answer_wer_reward": 0.9344828426837921, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9852123558521271, "step": 82 }, { "completion_length": 222.5, "epoch": 0.2656, "grad_norm": 1.6519030332565308, "kl": 0.0380859375, "learning_rate": 8.974999999999999e-07, "loss": 0.0004, "reward": 3.863801956176758, "reward_std": 0.030243747401982546, "rewards/answer_entity_reward": 0.9727078676223755, "rewards/answer_wer_reward": 0.9002127051353455, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9908813536167145, "step": 83 }, { "completion_length": 225.53125, "epoch": 0.2688, "grad_norm": 1.4793689250946045, "kl": 0.0517578125, "learning_rate": 8.9625e-07, "loss": 0.0005, "reward": 3.8814769983291626, "reward_std": 0.029270809143781662, "rewards/answer_entity_reward": 0.9880681931972504, "rewards/answer_wer_reward": 0.8934087753295898, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 84 }, { "completion_length": 235.9375, "epoch": 0.272, "grad_norm": 1.597517728805542, "kl": 0.1016845703125, "learning_rate": 8.95e-07, "loss": 0.001, "reward": 3.8768863677978516, "reward_std": 0.03502520266920328, "rewards/answer_entity_reward": 0.9798878133296967, "rewards/answer_wer_reward": 0.8985857367515564, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984128177165985, "step": 85 }, { "completion_length": 214.4375, "epoch": 0.2752, "grad_norm": 4.483051300048828, "kl": 0.04150390625, "learning_rate": 8.9375e-07, "loss": 0.0004, "reward": 3.903320074081421, "reward_std": 0.019831405603326857, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9384645223617554, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9648554623126984, "step": 86 }, { "completion_length": 217.3125, "epoch": 0.2784, "grad_norm": 2.5979843139648438, "kl": 0.0279541015625, "learning_rate": 8.924999999999999e-07, "loss": 0.0003, "reward": 3.8643628358840942, "reward_std": 0.07706086616963148, "rewards/answer_entity_reward": 0.9751845002174377, "rewards/answer_wer_reward": 0.9189748764038086, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9702034890651703, "step": 87 }, { "completion_length": 209.0625, "epoch": 0.2816, "grad_norm": 2.134483575820923, "kl": 0.0654296875, "learning_rate": 8.912499999999999e-07, "loss": 0.0007, "reward": 3.829586148262024, "reward_std": 0.11678730137646198, "rewards/answer_entity_reward": 0.9327990114688873, "rewards/answer_wer_reward": 0.9185277223587036, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9782594740390778, "step": 88 }, { "completion_length": 202.5625, "epoch": 0.2848, "grad_norm": 2.750098943710327, "kl": 0.0386962890625, "learning_rate": 8.9e-07, "loss": 0.0004, "reward": 3.813106060028076, "reward_std": 0.013170521473512053, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8155100047588348, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975961446762085, "step": 89 }, { "completion_length": 208.21875, "epoch": 0.288, "grad_norm": 1.0419001579284668, "kl": 0.02874755859375, "learning_rate": 8.8875e-07, "loss": 0.0003, "reward": 3.7984471321105957, "reward_std": 0.046625567600131035, "rewards/answer_entity_reward": 0.9813492298126221, "rewards/answer_wer_reward": 0.908283531665802, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9088144898414612, "step": 90 }, { "completion_length": 240.875, "epoch": 0.2912, "grad_norm": 1.406315565109253, "kl": 0.0322265625, "learning_rate": 8.874999999999999e-07, "loss": 0.0003, "reward": 3.917527914047241, "reward_std": 0.018682857509702444, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.919611245393753, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 91 }, { "completion_length": 248.28125, "epoch": 0.2944, "grad_norm": 0.9986963868141174, "kl": 0.034912109375, "learning_rate": 8.8625e-07, "loss": 0.0003, "reward": 3.8824074268341064, "reward_std": 0.027639332227408886, "rewards/answer_entity_reward": 0.9829497039318085, "rewards/answer_wer_reward": 0.8998689651489258, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999588817358017, "step": 92 }, { "completion_length": 166.84375, "epoch": 0.2976, "grad_norm": 1.9086061716079712, "kl": 0.03448486328125, "learning_rate": 8.85e-07, "loss": 0.0003, "reward": 3.9501060247421265, "reward_std": 0.012802016455680132, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9628694355487823, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9872365295886993, "step": 93 }, { "completion_length": 256.1875, "epoch": 0.3008, "grad_norm": 3.4043421745300293, "kl": 0.049072265625, "learning_rate": 8.8375e-07, "loss": 0.0005, "reward": 3.814915657043457, "reward_std": 0.03222915716469288, "rewards/answer_entity_reward": 0.9890734255313873, "rewards/answer_wer_reward": 0.8261894881725311, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999652773141861, "step": 94 }, { "completion_length": 253.4375, "epoch": 0.304, "grad_norm": 0.9184324741363525, "kl": 0.03564453125, "learning_rate": 8.824999999999999e-07, "loss": 0.0004, "reward": 3.8896020650863647, "reward_std": 0.02269437536597252, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.8971993029117584, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9952436983585358, "step": 95 }, { "completion_length": 202.15625, "epoch": 0.3072, "grad_norm": 12.922323226928711, "kl": 0.05908203125, "learning_rate": 8.812499999999999e-07, "loss": 0.0006, "reward": 3.9009629487991333, "reward_std": 0.0202713580802083, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9189554452896118, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9820075929164886, "step": 96 }, { "completion_length": 224.53125, "epoch": 0.3104, "grad_norm": 4.217601299285889, "kl": 0.0465087890625, "learning_rate": 8.799999999999999e-07, "loss": 0.0005, "reward": 3.8913207054138184, "reward_std": 0.014381649438291788, "rewards/answer_entity_reward": 0.9821428656578064, "rewards/answer_wer_reward": 0.9095685184001923, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996093809604645, "step": 97 }, { "completion_length": 206.40625, "epoch": 0.3136, "grad_norm": 2.168041706085205, "kl": 0.0323486328125, "learning_rate": 8.7875e-07, "loss": 0.0003, "reward": 3.8137295246124268, "reward_std": 0.06389336660504341, "rewards/answer_entity_reward": 0.9776169061660767, "rewards/answer_wer_reward": 0.8989610075950623, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9371516108512878, "step": 98 }, { "completion_length": 209.0625, "epoch": 0.3168, "grad_norm": 1.6052436828613281, "kl": 0.0345458984375, "learning_rate": 8.774999999999999e-07, "loss": 0.0003, "reward": 3.828700304031372, "reward_std": 0.019330056384205818, "rewards/answer_entity_reward": 0.9850388169288635, "rewards/answer_wer_reward": 0.846589595079422, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970719814300537, "step": 99 }, { "completion_length": 210.6875, "epoch": 0.32, "grad_norm": 0.9548845887184143, "kl": 0.0341796875, "learning_rate": 8.7625e-07, "loss": 0.0003, "reward": 3.9469358921051025, "reward_std": 0.021241382230073214, "rewards/answer_entity_reward": 0.9851641654968262, "rewards/answer_wer_reward": 0.961771547794342, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 100 }, { "completion_length": 214.28125, "epoch": 0.3232, "grad_norm": 2.8610620498657227, "kl": 0.052734375, "learning_rate": 8.75e-07, "loss": 0.0005, "reward": 3.806527853012085, "reward_std": 0.04471902176737785, "rewards/answer_entity_reward": 0.9853896200656891, "rewards/answer_wer_reward": 0.8547504544258118, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.966387927532196, "step": 101 }, { "completion_length": 223.4375, "epoch": 0.3264, "grad_norm": 0.7780336141586304, "kl": 0.034912109375, "learning_rate": 8.7375e-07, "loss": 0.0003, "reward": 3.880792260169983, "reward_std": 0.022754055447876453, "rewards/answer_entity_reward": 0.989393949508667, "rewards/answer_wer_reward": 0.8913983702659607, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 102 }, { "completion_length": 231.0625, "epoch": 0.3296, "grad_norm": 1.3763070106506348, "kl": 0.024444580078125, "learning_rate": 8.725e-07, "loss": 0.0003, "reward": 3.929618239402771, "reward_std": 0.012849014718085527, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9325020015239716, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987609684467316, "step": 103 }, { "completion_length": 268.96875, "epoch": 0.3328, "grad_norm": 1.7985624074935913, "kl": 0.0289306640625, "learning_rate": 8.712499999999999e-07, "loss": 0.0003, "reward": 3.888875961303711, "reward_std": 0.027541114017367363, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.8925231993198395, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999193549156189, "step": 104 }, { "completion_length": 254.1875, "epoch": 0.336, "grad_norm": 18.920978546142578, "kl": 0.027099609375, "learning_rate": 8.699999999999999e-07, "loss": 0.0003, "reward": 3.860435366630554, "reward_std": 0.030950906220823526, "rewards/answer_entity_reward": 0.9734883308410645, "rewards/answer_wer_reward": 0.8872724771499634, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996744692325592, "step": 105 }, { "completion_length": 163.53125, "epoch": 0.3392, "grad_norm": 2.867141008377075, "kl": 0.03399658203125, "learning_rate": 8.687499999999999e-07, "loss": 0.0003, "reward": 3.9226391315460205, "reward_std": 0.023416020907461643, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9473121762275696, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9866906106472015, "step": 106 }, { "completion_length": 230.5625, "epoch": 0.3424, "grad_norm": 1.7444649934768677, "kl": 0.03302001953125, "learning_rate": 8.675000000000001e-07, "loss": 0.0003, "reward": 3.9037901163101196, "reward_std": 0.013123108074069023, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9062368869781494, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996366202831268, "step": 107 }, { "completion_length": 196.96875, "epoch": 0.3456, "grad_norm": 1.4596710205078125, "kl": 0.0565185546875, "learning_rate": 8.6625e-07, "loss": 0.0006, "reward": 3.927306890487671, "reward_std": 0.017726238816976547, "rewards/answer_entity_reward": 0.9847221970558167, "rewards/answer_wer_reward": 0.9435714483261108, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990131855010986, "step": 108 }, { "completion_length": 204.03125, "epoch": 0.3488, "grad_norm": 21.111600875854492, "kl": 0.259765625, "learning_rate": 8.65e-07, "loss": 0.0026, "reward": 3.878751039505005, "reward_std": 0.09589649271219969, "rewards/answer_entity_reward": 0.9957579076290131, "rewards/answer_wer_reward": 0.9333003461360931, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9496928453445435, "step": 109 }, { "completion_length": 213.25, "epoch": 0.352, "grad_norm": 5.349282264709473, "kl": 0.0455322265625, "learning_rate": 8.6375e-07, "loss": 0.0005, "reward": 3.862163782119751, "reward_std": 0.031207362189888954, "rewards/answer_entity_reward": 0.9892857372760773, "rewards/answer_wer_reward": 0.9074709117412567, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9654072225093842, "step": 110 }, { "completion_length": 220.3125, "epoch": 0.3552, "grad_norm": 3.316596746444702, "kl": 0.03369140625, "learning_rate": 8.625e-07, "loss": 0.0003, "reward": 3.8875255584716797, "reward_std": 0.03998068626970053, "rewards/answer_entity_reward": 0.9902909696102142, "rewards/answer_wer_reward": 0.9038136303424835, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9934210479259491, "step": 111 }, { "completion_length": 250.78125, "epoch": 0.3584, "grad_norm": 2.525360107421875, "kl": 0.03515625, "learning_rate": 8.612499999999999e-07, "loss": 0.0003, "reward": 3.8880720138549805, "reward_std": 0.025330569595098495, "rewards/answer_entity_reward": 0.9918486475944519, "rewards/answer_wer_reward": 0.8981437385082245, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980796277523041, "step": 112 }, { "completion_length": 220.09375, "epoch": 0.3616, "grad_norm": 5.7261433601379395, "kl": 0.038330078125, "learning_rate": 8.599999999999999e-07, "loss": 0.0004, "reward": 3.873054027557373, "reward_std": 0.018459735438227654, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.8848404586315155, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978289306163788, "step": 113 }, { "completion_length": 233.625, "epoch": 0.3648, "grad_norm": 2.1468665599823, "kl": 0.0286865234375, "learning_rate": 8.587499999999999e-07, "loss": 0.0003, "reward": 3.9267923831939697, "reward_std": 0.026638164184987545, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.9341540634632111, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989514350891113, "step": 114 }, { "completion_length": 237.375, "epoch": 0.368, "grad_norm": 14.322599411010742, "kl": 0.04052734375, "learning_rate": 8.575e-07, "loss": 0.0004, "reward": 3.9121710062026978, "reward_std": 0.02902364358305931, "rewards/answer_entity_reward": 0.9908459782600403, "rewards/answer_wer_reward": 0.922933429479599, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983916878700256, "step": 115 }, { "completion_length": 240.15625, "epoch": 0.3712, "grad_norm": 2.0209200382232666, "kl": 0.06103515625, "learning_rate": 8.5625e-07, "loss": 0.0006, "reward": 3.888006567955017, "reward_std": 0.023146681487560272, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.895849883556366, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.994560569524765, "step": 116 }, { "completion_length": 222.4375, "epoch": 0.3744, "grad_norm": 2.933910608291626, "kl": 0.0419921875, "learning_rate": 8.55e-07, "loss": 0.0004, "reward": 3.8359127044677734, "reward_std": 0.058022117242217064, "rewards/answer_entity_reward": 0.9440500438213348, "rewards/answer_wer_reward": 0.894202709197998, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9976600110530853, "step": 117 }, { "completion_length": 214.5625, "epoch": 0.3776, "grad_norm": 7.493628025054932, "kl": 0.064453125, "learning_rate": 8.5375e-07, "loss": 0.0006, "reward": 3.799570918083191, "reward_std": 0.06657508388161659, "rewards/answer_entity_reward": 0.9727430641651154, "rewards/answer_wer_reward": 0.871229887008667, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9555979073047638, "step": 118 }, { "completion_length": 212.625, "epoch": 0.3808, "grad_norm": 2.1899421215057373, "kl": 0.0570068359375, "learning_rate": 8.525e-07, "loss": 0.0006, "reward": 3.9054840803146362, "reward_std": 0.027329989708960056, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9344967901706696, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9795099198818207, "step": 119 }, { "completion_length": 249.8125, "epoch": 0.384, "grad_norm": 2.4804491996765137, "kl": 0.035888671875, "learning_rate": 8.512499999999999e-07, "loss": 0.0004, "reward": 3.8948739767074585, "reward_std": 0.028746116440743208, "rewards/answer_entity_reward": 0.9953208565711975, "rewards/answer_wer_reward": 0.9002179205417633, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993351101875305, "step": 120 }, { "completion_length": 185.34375, "epoch": 0.3872, "grad_norm": 2.305140256881714, "kl": 0.102783203125, "learning_rate": 8.499999999999999e-07, "loss": 0.001, "reward": 3.9010980129241943, "reward_std": 0.021339962724596262, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9222235083580017, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9788744449615479, "step": 121 }, { "completion_length": 204.65625, "epoch": 0.3904, "grad_norm": 1.5420470237731934, "kl": 0.0313720703125, "learning_rate": 8.487499999999999e-07, "loss": 0.0003, "reward": 3.927214741706848, "reward_std": 0.019817203283309937, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.92842698097229, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987878203392029, "step": 122 }, { "completion_length": 216.90625, "epoch": 0.3936, "grad_norm": 8.852909088134766, "kl": 0.0716552734375, "learning_rate": 8.475e-07, "loss": 0.0007, "reward": 3.811018466949463, "reward_std": 0.010543343145400286, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.938366711139679, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8726518452167511, "step": 123 }, { "completion_length": 257.75, "epoch": 0.3968, "grad_norm": 1.4971685409545898, "kl": 0.0330810546875, "learning_rate": 8.462499999999999e-07, "loss": 0.0003, "reward": 3.9272462129592896, "reward_std": 0.01983210165053606, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9303403496742249, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989891648292542, "step": 124 }, { "completion_length": 207.4375, "epoch": 0.4, "grad_norm": 1.9963277578353882, "kl": 0.056396484375, "learning_rate": 8.45e-07, "loss": 0.0006, "reward": 3.9006247520446777, "reward_std": 0.030232679098844528, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.9261119067668915, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9803889393806458, "step": 125 }, { "completion_length": 246.875, "epoch": 0.4032, "grad_norm": 1.1950430870056152, "kl": 0.03369140625, "learning_rate": 8.4375e-07, "loss": 0.0003, "reward": 3.881152391433716, "reward_std": 0.03120280895382166, "rewards/answer_entity_reward": 0.9683753550052643, "rewards/answer_wer_reward": 0.9131445586681366, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996323585510254, "step": 126 }, { "completion_length": 212.15625, "epoch": 0.4064, "grad_norm": 4.167364120483398, "kl": 0.257568359375, "learning_rate": 8.425e-07, "loss": 0.0026, "reward": 3.891525626182556, "reward_std": 0.03758985735476017, "rewards/answer_entity_reward": 0.9853896200656891, "rewards/answer_wer_reward": 0.9100889563560486, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9960470795631409, "step": 127 }, { "completion_length": 215.5625, "epoch": 0.4096, "grad_norm": 1.2758169174194336, "kl": 0.059326171875, "learning_rate": 8.4125e-07, "loss": 0.0006, "reward": 3.8984569311141968, "reward_std": 0.02103353524580598, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.9310561716556549, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9799006283283234, "step": 128 }, { "completion_length": 221.0, "epoch": 0.4128, "grad_norm": 1.6011369228363037, "kl": 0.02734375, "learning_rate": 8.399999999999999e-07, "loss": 0.0003, "reward": 3.907585859298706, "reward_std": 0.024174046237021685, "rewards/answer_entity_reward": 0.9887152910232544, "rewards/answer_wer_reward": 0.9191110134124756, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997596144676208, "step": 129 }, { "completion_length": 189.375, "epoch": 0.416, "grad_norm": 2.7846839427948, "kl": 0.0413818359375, "learning_rate": 8.387499999999999e-07, "loss": 0.0004, "reward": 3.8641178607940674, "reward_std": 0.03212345764040947, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9255104064941406, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9438523054122925, "step": 130 }, { "completion_length": 209.5, "epoch": 0.4192, "grad_norm": 4.144553184509277, "kl": 0.0548095703125, "learning_rate": 8.375e-07, "loss": 0.0006, "reward": 3.8618308305740356, "reward_std": 0.07612445950508118, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9306082725524902, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9312225580215454, "step": 131 }, { "completion_length": 198.5, "epoch": 0.4224, "grad_norm": 2.663985013961792, "kl": 0.04052734375, "learning_rate": 8.3625e-07, "loss": 0.0004, "reward": 3.897012948989868, "reward_std": 0.030758653301745653, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9326047897338867, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9782971143722534, "step": 132 }, { "completion_length": 180.78125, "epoch": 0.4256, "grad_norm": 2.2100954055786133, "kl": 0.0439453125, "learning_rate": 8.349999999999999e-07, "loss": 0.0004, "reward": 3.923304557800293, "reward_std": 0.025213422253727913, "rewards/answer_entity_reward": 0.9882478713989258, "rewards/answer_wer_reward": 0.9360361397266388, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999020516872406, "step": 133 }, { "completion_length": 219.59375, "epoch": 0.4288, "grad_norm": 15.98015022277832, "kl": 0.0645751953125, "learning_rate": 8.3375e-07, "loss": 0.0006, "reward": 3.8721258640289307, "reward_std": 0.02985560242086649, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9070867002010345, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9650392830371857, "step": 134 }, { "completion_length": 239.90625, "epoch": 0.432, "grad_norm": 3.754002332687378, "kl": 0.0419921875, "learning_rate": 8.325e-07, "loss": 0.0004, "reward": 3.8614091873168945, "reward_std": 0.0724228248000145, "rewards/answer_entity_reward": 0.9794008135795593, "rewards/answer_wer_reward": 0.9043296277523041, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9776785373687744, "step": 135 }, { "completion_length": 228.09375, "epoch": 0.4352, "grad_norm": 2.609844207763672, "kl": 0.037841796875, "learning_rate": 8.3125e-07, "loss": 0.0004, "reward": 3.8617947101593018, "reward_std": 0.021692313253879547, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.8795575797557831, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996126115322113, "step": 136 }, { "completion_length": 158.8125, "epoch": 0.4384, "grad_norm": 1.6180543899536133, "kl": 0.055419921875, "learning_rate": 8.299999999999999e-07, "loss": 0.0005, "reward": 3.9137951135635376, "reward_std": 0.020158007740974426, "rewards/answer_entity_reward": 0.970695972442627, "rewards/answer_wer_reward": 0.9480262100696564, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9950730204582214, "step": 137 }, { "completion_length": 231.25, "epoch": 0.4416, "grad_norm": 0.9336134195327759, "kl": 0.03399658203125, "learning_rate": 8.287499999999999e-07, "loss": 0.0003, "reward": 3.9351539611816406, "reward_std": 0.014509289292618632, "rewards/answer_entity_reward": 0.9934294819831848, "rewards/answer_wer_reward": 0.9442258775234222, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974986016750336, "step": 138 }, { "completion_length": 220.34375, "epoch": 0.4448, "grad_norm": 21.355905532836914, "kl": 0.059814453125, "learning_rate": 8.275e-07, "loss": 0.0006, "reward": 3.863122820854187, "reward_std": 0.060401469469070435, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9233364760875702, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9397862255573273, "step": 139 }, { "completion_length": 214.90625, "epoch": 0.448, "grad_norm": 1.280321478843689, "kl": 0.052490234375, "learning_rate": 8.2625e-07, "loss": 0.0005, "reward": 3.9231661558151245, "reward_std": 0.009715312160551548, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9245247840881348, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998641312122345, "step": 140 }, { "completion_length": 211.375, "epoch": 0.4512, "grad_norm": 1.7492412328720093, "kl": 0.062744140625, "learning_rate": 8.249999999999999e-07, "loss": 0.0006, "reward": 3.88791024684906, "reward_std": 0.011862037936225533, "rewards/answer_entity_reward": 0.9832702279090881, "rewards/answer_wer_reward": 0.957579493522644, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9470604658126831, "step": 141 }, { "completion_length": 246.3125, "epoch": 0.4544, "grad_norm": 2.37640118598938, "kl": 0.0369873046875, "learning_rate": 8.2375e-07, "loss": 0.0004, "reward": 3.944279909133911, "reward_std": 0.011443465016782284, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9472803771495819, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969995319843292, "step": 142 }, { "completion_length": 199.90625, "epoch": 0.4576, "grad_norm": 2.8359158039093018, "kl": 0.0540771484375, "learning_rate": 8.225e-07, "loss": 0.0005, "reward": 3.93644380569458, "reward_std": 0.023367811925709248, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9554752707481384, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9893018305301666, "step": 143 }, { "completion_length": 195.28125, "epoch": 0.4608, "grad_norm": 1.723976731300354, "kl": 0.031982421875, "learning_rate": 8.2125e-07, "loss": 0.0003, "reward": 3.9411680698394775, "reward_std": 0.007689078338444233, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.941936582326889, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992315769195557, "step": 144 }, { "completion_length": 223.375, "epoch": 0.464, "grad_norm": 1.08156418800354, "kl": 0.02874755859375, "learning_rate": 8.199999999999999e-07, "loss": 0.0003, "reward": 3.9059054851531982, "reward_std": 0.007867377484217286, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9531411230564117, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9623797535896301, "step": 145 }, { "completion_length": 184.75, "epoch": 0.4672, "grad_norm": 1.7059741020202637, "kl": 0.0400390625, "learning_rate": 8.187499999999999e-07, "loss": 0.0004, "reward": 3.939697027206421, "reward_std": 0.0070332614704966545, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9535529613494873, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9861441254615784, "step": 146 }, { "completion_length": 222.84375, "epoch": 0.4704, "grad_norm": 1.5283204317092896, "kl": 0.072998046875, "learning_rate": 8.175e-07, "loss": 0.0007, "reward": 3.843386173248291, "reward_std": 0.02895416272804141, "rewards/answer_entity_reward": 0.9304008483886719, "rewards/answer_wer_reward": 0.9129853844642639, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 147 }, { "completion_length": 165.25, "epoch": 0.4736, "grad_norm": 2.885890245437622, "kl": 0.04193115234375, "learning_rate": 8.1625e-07, "loss": 0.0004, "reward": 3.8639066219329834, "reward_std": 0.01842296402901411, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9352113604545593, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9339398741722107, "step": 148 }, { "completion_length": 225.8125, "epoch": 0.4768, "grad_norm": 1.5893429517745972, "kl": 0.0615234375, "learning_rate": 8.149999999999999e-07, "loss": 0.0006, "reward": 3.9009220600128174, "reward_std": 0.022383708506822586, "rewards/answer_entity_reward": 0.9967105388641357, "rewards/answer_wer_reward": 0.9052460193634033, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989655911922455, "step": 149 }, { "completion_length": 236.21875, "epoch": 0.48, "grad_norm": 2.1324307918548584, "kl": 0.0377197265625, "learning_rate": 8.137499999999999e-07, "loss": 0.0004, "reward": 3.8904128074645996, "reward_std": 0.02841739635914564, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.8976494371891022, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997079372406006, "step": 150 }, { "completion_length": 213.15625, "epoch": 0.4832, "grad_norm": 0.9698525667190552, "kl": 0.034423828125, "learning_rate": 8.125e-07, "loss": 0.0003, "reward": 3.890373468399048, "reward_std": 0.009532647207379341, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9459290206432343, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9444444477558136, "step": 151 }, { "completion_length": 250.21875, "epoch": 0.4864, "grad_norm": 4.16625452041626, "kl": 0.198486328125, "learning_rate": 8.1125e-07, "loss": 0.002, "reward": 3.8978230953216553, "reward_std": 0.024048997554928064, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.9117782711982727, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985446929931641, "step": 152 }, { "completion_length": 174.15625, "epoch": 0.4896, "grad_norm": 2.9183833599090576, "kl": 0.0716552734375, "learning_rate": 8.1e-07, "loss": 0.0007, "reward": 3.908216118812561, "reward_std": 0.032137976959347725, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.9441157281398773, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9745170772075653, "step": 153 }, { "completion_length": 187.03125, "epoch": 0.4928, "grad_norm": 1.039563536643982, "kl": 0.0535888671875, "learning_rate": 8.087499999999999e-07, "loss": 0.0005, "reward": 3.940076231956482, "reward_std": 0.014994107652455568, "rewards/answer_entity_reward": 0.9910714626312256, "rewards/answer_wer_reward": 0.9499542117118835, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990506172180176, "step": 154 }, { "completion_length": 214.125, "epoch": 0.496, "grad_norm": 2.49003267288208, "kl": 0.0635986328125, "learning_rate": 8.075e-07, "loss": 0.0006, "reward": 3.850375175476074, "reward_std": 0.026249381713569164, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8511867821216583, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991883039474487, "step": 155 }, { "completion_length": 214.8125, "epoch": 0.4992, "grad_norm": 2.7330820560455322, "kl": 0.03717041015625, "learning_rate": 8.0625e-07, "loss": 0.0004, "reward": 3.9070980548858643, "reward_std": 0.04327901639044285, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9249836802482605, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9934781193733215, "step": 156 }, { "completion_length": 195.0625, "epoch": 0.5024, "grad_norm": 2.878744602203369, "kl": 0.0828857421875, "learning_rate": 8.05e-07, "loss": 0.0008, "reward": 3.9139277935028076, "reward_std": 0.022999857552349567, "rewards/answer_entity_reward": 0.9947916567325592, "rewards/answer_wer_reward": 0.9313595592975616, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9877763986587524, "step": 157 }, { "completion_length": 216.625, "epoch": 0.5056, "grad_norm": 1.1287983655929565, "kl": 0.049072265625, "learning_rate": 8.037499999999999e-07, "loss": 0.0005, "reward": 3.9037948846817017, "reward_std": 0.011531218886375427, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9081907570362091, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9956042170524597, "step": 158 }, { "completion_length": 200.21875, "epoch": 0.5088, "grad_norm": 1.5555959939956665, "kl": 0.0369873046875, "learning_rate": 8.024999999999999e-07, "loss": 0.0004, "reward": 3.9110556840896606, "reward_std": 0.019422957440838218, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.9354503750801086, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9814814925193787, "step": 159 }, { "completion_length": 202.875, "epoch": 0.512, "grad_norm": 13.22675895690918, "kl": 0.084228515625, "learning_rate": 8.0125e-07, "loss": 0.0008, "reward": 3.8508609533309937, "reward_std": 0.037849435582756996, "rewards/answer_entity_reward": 0.9867424070835114, "rewards/answer_wer_reward": 0.9194300472736359, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9446885287761688, "step": 160 }, { "completion_length": 187.5625, "epoch": 0.5152, "grad_norm": 1.9724727869033813, "kl": 0.05126953125, "learning_rate": 8e-07, "loss": 0.0005, "reward": 3.9261248111724854, "reward_std": 0.02531399577856064, "rewards/answer_entity_reward": 0.9882478713989258, "rewards/answer_wer_reward": 0.9410728812217712, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9968039691448212, "step": 161 }, { "completion_length": 254.84375, "epoch": 0.5184, "grad_norm": 2.3500356674194336, "kl": 0.05340576171875, "learning_rate": 7.9875e-07, "loss": 0.0005, "reward": 3.910772919654846, "reward_std": 0.04009111411869526, "rewards/answer_entity_reward": 0.9747862815856934, "rewards/answer_wer_reward": 0.9362366199493408, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999750018119812, "step": 162 }, { "completion_length": 206.625, "epoch": 0.5216, "grad_norm": 6.3654890060424805, "kl": 0.069580078125, "learning_rate": 7.975e-07, "loss": 0.0007, "reward": 3.805917978286743, "reward_std": 0.052407728508114815, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9451808631420135, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8642092943191528, "step": 163 }, { "completion_length": 212.71875, "epoch": 0.5248, "grad_norm": 1.921622633934021, "kl": 0.09283447265625, "learning_rate": 7.9625e-07, "loss": 0.0009, "reward": 3.9235308170318604, "reward_std": 0.022881922777742147, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.9401760995388031, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9896678924560547, "step": 164 }, { "completion_length": 234.5625, "epoch": 0.528, "grad_norm": 1.4160696268081665, "kl": 0.061767578125, "learning_rate": 7.95e-07, "loss": 0.0006, "reward": 3.890324354171753, "reward_std": 0.014382836874574423, "rewards/answer_entity_reward": 0.9653846025466919, "rewards/answer_wer_reward": 0.9249398708343506, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 165 }, { "completion_length": 223.0, "epoch": 0.5312, "grad_norm": 1.2775448560714722, "kl": 0.0582275390625, "learning_rate": 7.937499999999999e-07, "loss": 0.0006, "reward": 3.9478421211242676, "reward_std": 0.011931413784623146, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9481260776519775, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997159242630005, "step": 166 }, { "completion_length": 214.65625, "epoch": 0.5344, "grad_norm": 1.287255883216858, "kl": 0.052734375, "learning_rate": 7.924999999999999e-07, "loss": 0.0005, "reward": 3.9042768478393555, "reward_std": 0.02827941346913576, "rewards/answer_entity_reward": 0.9787962138652802, "rewards/answer_wer_reward": 0.925747811794281, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997329115867615, "step": 167 }, { "completion_length": 224.59375, "epoch": 0.5376, "grad_norm": 1.7952959537506104, "kl": 0.0364990234375, "learning_rate": 7.912499999999999e-07, "loss": 0.0004, "reward": 3.935611605644226, "reward_std": 0.027386673726141453, "rewards/answer_entity_reward": 0.9919143319129944, "rewards/answer_wer_reward": 0.9439473152160645, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999750018119812, "step": 168 }, { "completion_length": 183.28125, "epoch": 0.5408, "grad_norm": 8.36503791809082, "kl": 0.0848388671875, "learning_rate": 7.9e-07, "loss": 0.0008, "reward": 3.8025405406951904, "reward_std": 0.04630524106323719, "rewards/answer_entity_reward": 0.9862637221813202, "rewards/answer_wer_reward": 0.8270655274391174, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9892113208770752, "step": 169 }, { "completion_length": 235.15625, "epoch": 0.544, "grad_norm": 2.2816457748413086, "kl": 0.0296630859375, "learning_rate": 7.8875e-07, "loss": 0.0003, "reward": 3.934034824371338, "reward_std": 0.009957955218851566, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9344717264175415, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995629191398621, "step": 170 }, { "completion_length": 247.53125, "epoch": 0.5472, "grad_norm": 1.6856052875518799, "kl": 0.13134765625, "learning_rate": 7.875e-07, "loss": 0.0013, "reward": 3.896223545074463, "reward_std": 0.015339810401201248, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9109295010566711, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991828203201294, "step": 171 }, { "completion_length": 245.03125, "epoch": 0.5504, "grad_norm": 4.956347465515137, "kl": 0.044921875, "learning_rate": 7.8625e-07, "loss": 0.0005, "reward": 3.7271645069122314, "reward_std": 0.21888091787695885, "rewards/answer_entity_reward": 0.9630681872367859, "rewards/answer_wer_reward": 0.8937070369720459, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9016393423080444, "step": 172 }, { "completion_length": 211.3125, "epoch": 0.5536, "grad_norm": 1.1714370250701904, "kl": 0.0323486328125, "learning_rate": 7.85e-07, "loss": 0.0003, "reward": 3.913045883178711, "reward_std": 0.04143238253891468, "rewards/answer_entity_reward": 0.9870130121707916, "rewards/answer_wer_reward": 0.9331351518630981, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9928977191448212, "step": 173 }, { "completion_length": 272.1875, "epoch": 0.5568, "grad_norm": 1.2012341022491455, "kl": 0.0413818359375, "learning_rate": 7.837499999999999e-07, "loss": 0.0004, "reward": 3.876948356628418, "reward_std": 0.03149130195379257, "rewards/answer_entity_reward": 0.9889954328536987, "rewards/answer_wer_reward": 0.9271560311317444, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9607969224452972, "step": 174 }, { "completion_length": 200.3125, "epoch": 0.56, "grad_norm": 2.998842477798462, "kl": 0.067138671875, "learning_rate": 7.824999999999999e-07, "loss": 0.0007, "reward": 3.8472641706466675, "reward_std": 0.04471721313893795, "rewards/answer_entity_reward": 0.9902146458625793, "rewards/answer_wer_reward": 0.9358225166797638, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.921226978302002, "step": 175 }, { "completion_length": 207.03125, "epoch": 0.5632, "grad_norm": 10.961363792419434, "kl": 0.0789794921875, "learning_rate": 7.812499999999999e-07, "loss": 0.0008, "reward": 3.9478721618652344, "reward_std": 0.027662259992212057, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9600406885147095, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9902353584766388, "step": 176 }, { "completion_length": 221.59375, "epoch": 0.5664, "grad_norm": 1.341109275817871, "kl": 0.065185546875, "learning_rate": 7.799999999999999e-07, "loss": 0.0006, "reward": 3.8582847118377686, "reward_std": 0.041704089380800724, "rewards/answer_entity_reward": 0.9775640964508057, "rewards/answer_wer_reward": 0.9368657767772675, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9438548386096954, "step": 177 }, { "completion_length": 239.90625, "epoch": 0.5696, "grad_norm": 1.4057974815368652, "kl": 0.045166015625, "learning_rate": 7.787500000000001e-07, "loss": 0.0005, "reward": 3.9274110794067383, "reward_std": 0.02352920500561595, "rewards/answer_entity_reward": 0.9946895241737366, "rewards/answer_wer_reward": 0.9349404275417328, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977810680866241, "step": 178 }, { "completion_length": 211.78125, "epoch": 0.5728, "grad_norm": 2.9184887409210205, "kl": 0.031982421875, "learning_rate": 7.775e-07, "loss": 0.0003, "reward": 3.945718765258789, "reward_std": 0.01779081765562296, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9512039721012115, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973557591438293, "step": 179 }, { "completion_length": 204.375, "epoch": 0.576, "grad_norm": 113.12403869628906, "kl": 0.05322265625, "learning_rate": 7.7625e-07, "loss": 0.0005, "reward": 3.8825124502182007, "reward_std": 0.07031127344816923, "rewards/answer_entity_reward": 0.9926734566688538, "rewards/answer_wer_reward": 0.9367940425872803, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9530448913574219, "step": 180 }, { "completion_length": 214.75, "epoch": 0.5792, "grad_norm": 1.3515021800994873, "kl": 0.0609130859375, "learning_rate": 7.75e-07, "loss": 0.0006, "reward": 3.920071840286255, "reward_std": 0.011316743912175298, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9225669503211975, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975048303604126, "step": 181 }, { "completion_length": 205.0625, "epoch": 0.5824, "grad_norm": 1.5749711990356445, "kl": 0.054443359375, "learning_rate": 7.7375e-07, "loss": 0.0005, "reward": 3.921678900718689, "reward_std": 0.013327162247151136, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9460242688655853, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9780585169792175, "step": 182 }, { "completion_length": 217.75, "epoch": 0.5856, "grad_norm": 0.7737219929695129, "kl": 0.0469970703125, "learning_rate": 7.724999999999999e-07, "loss": 0.0005, "reward": 3.9334832429885864, "reward_std": 0.020406807772815228, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.938728004693985, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 183 }, { "completion_length": 231.59375, "epoch": 0.5888, "grad_norm": 1.6825175285339355, "kl": 0.0543212890625, "learning_rate": 7.712499999999999e-07, "loss": 0.0005, "reward": 3.938681125640869, "reward_std": 0.017365658190101385, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9413779377937317, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991414844989777, "step": 184 }, { "completion_length": 239.71875, "epoch": 0.592, "grad_norm": 1.3427449464797974, "kl": 0.058837890625, "learning_rate": 7.699999999999999e-07, "loss": 0.0006, "reward": 3.9066988229751587, "reward_std": 0.020341036841273308, "rewards/answer_entity_reward": 0.9776557087898254, "rewards/answer_wer_reward": 0.929761528968811, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992816150188446, "step": 185 }, { "completion_length": 133.90625, "epoch": 0.5952, "grad_norm": 4.991705417633057, "kl": 0.0623779296875, "learning_rate": 7.6875e-07, "loss": 0.0006, "reward": 3.926753878593445, "reward_std": 0.023914007004350424, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9629489779472351, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9638049006462097, "step": 186 }, { "completion_length": 234.625, "epoch": 0.5984, "grad_norm": 2.8712401390075684, "kl": 0.096435546875, "learning_rate": 7.675e-07, "loss": 0.001, "reward": 3.872377395629883, "reward_std": 0.06525835767388344, "rewards/answer_entity_reward": 0.9841803908348083, "rewards/answer_wer_reward": 0.9093597233295441, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9788371920585632, "step": 187 }, { "completion_length": 225.4375, "epoch": 0.6016, "grad_norm": 2.3115170001983643, "kl": 0.055419921875, "learning_rate": 7.6625e-07, "loss": 0.0006, "reward": 3.9362770318984985, "reward_std": 0.019690027460455894, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9422614872455597, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997487872838974, "step": 188 }, { "completion_length": 214.15625, "epoch": 0.6048, "grad_norm": 3.583329677581787, "kl": 0.0550537109375, "learning_rate": 7.65e-07, "loss": 0.0005, "reward": 3.9327969551086426, "reward_std": 0.014218965079635382, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9424121379852295, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 189 }, { "completion_length": 249.15625, "epoch": 0.608, "grad_norm": 1.4651848077774048, "kl": 0.052001953125, "learning_rate": 7.6375e-07, "loss": 0.0005, "reward": 3.941069722175598, "reward_std": 0.009663278236985207, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9507163166999817, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977063536643982, "step": 190 }, { "completion_length": 197.84375, "epoch": 0.6112, "grad_norm": 1.4688224792480469, "kl": 0.0577392578125, "learning_rate": 7.624999999999999e-07, "loss": 0.0006, "reward": 3.9300395250320435, "reward_std": 0.014806594932451844, "rewards/answer_entity_reward": 0.984722226858139, "rewards/answer_wer_reward": 0.9455022215843201, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9998151063919067, "step": 191 }, { "completion_length": 254.6875, "epoch": 0.6144, "grad_norm": 1.1648938655853271, "kl": 0.0589599609375, "learning_rate": 7.612499999999999e-07, "loss": 0.0006, "reward": 3.9228453636169434, "reward_std": 0.026355463080108166, "rewards/answer_entity_reward": 0.9819444715976715, "rewards/answer_wer_reward": 0.9418983161449432, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990026652812958, "step": 192 }, { "completion_length": 264.34375, "epoch": 0.6176, "grad_norm": 1.2595146894454956, "kl": 0.0635986328125, "learning_rate": 7.599999999999999e-07, "loss": 0.0006, "reward": 3.9068782329559326, "reward_std": 0.02374061942100525, "rewards/answer_entity_reward": 0.9758522510528564, "rewards/answer_wer_reward": 0.9392839670181274, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9917418956756592, "step": 193 }, { "completion_length": 226.875, "epoch": 0.6208, "grad_norm": 3.0049514770507812, "kl": 0.065185546875, "learning_rate": 7.5875e-07, "loss": 0.0007, "reward": 3.9182554483413696, "reward_std": 0.028174775652587414, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9239371716976166, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 194 }, { "completion_length": 233.90625, "epoch": 0.624, "grad_norm": 3.6226987838745117, "kl": 0.14013671875, "learning_rate": 7.575e-07, "loss": 0.0014, "reward": 3.917691946029663, "reward_std": 0.015854593832045794, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9359965324401855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99558424949646, "step": 195 }, { "completion_length": 228.96875, "epoch": 0.6272, "grad_norm": 3.1564576625823975, "kl": 0.03131103515625, "learning_rate": 7.5625e-07, "loss": 0.0003, "reward": 3.8988983631134033, "reward_std": 0.04383570794016123, "rewards/answer_entity_reward": 0.980654776096344, "rewards/answer_wer_reward": 0.9372455775737762, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9809979796409607, "step": 196 }, { "completion_length": 235.875, "epoch": 0.6304, "grad_norm": 1.3267861604690552, "kl": 0.052978515625, "learning_rate": 7.55e-07, "loss": 0.0005, "reward": 3.9319225549697876, "reward_std": 0.02372880419716239, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9346356689929962, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999690592288971, "step": 197 }, { "completion_length": 162.34375, "epoch": 0.6336, "grad_norm": 1.4438445568084717, "kl": 0.065185546875, "learning_rate": 7.5375e-07, "loss": 0.0006, "reward": 3.8535887002944946, "reward_std": 0.041104525327682495, "rewards/answer_entity_reward": 0.9681412279605865, "rewards/answer_wer_reward": 0.9683326184749603, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9171150028705597, "step": 198 }, { "completion_length": 203.875, "epoch": 0.6368, "grad_norm": 4.674152374267578, "kl": 0.050048828125, "learning_rate": 7.524999999999999e-07, "loss": 0.0005, "reward": 3.938958764076233, "reward_std": 0.01455747289583087, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9664872884750366, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.974875271320343, "step": 199 }, { "completion_length": 230.625, "epoch": 0.64, "grad_norm": 1.899129867553711, "kl": 0.0535888671875, "learning_rate": 7.512499999999999e-07, "loss": 0.0005, "reward": 3.9438642263412476, "reward_std": 0.014077516738325357, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.952812910079956, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938922822475433, "step": 200 }, { "completion_length": 212.4375, "epoch": 0.6432, "grad_norm": 1.8970869779586792, "kl": 0.0460205078125, "learning_rate": 7.5e-07, "loss": 0.0005, "reward": 3.9026511907577515, "reward_std": 0.038714910857379436, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.911726325750351, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992582499980927, "step": 201 }, { "completion_length": 203.875, "epoch": 0.6464, "grad_norm": 2.5214030742645264, "kl": 0.083251953125, "learning_rate": 7.4875e-07, "loss": 0.0008, "reward": 3.9040462970733643, "reward_std": 0.016587836667895317, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9761527180671692, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9278934299945831, "step": 202 }, { "completion_length": 216.375, "epoch": 0.6496, "grad_norm": 4.072224140167236, "kl": 0.053955078125, "learning_rate": 7.475e-07, "loss": 0.0005, "reward": 3.9431036710739136, "reward_std": 0.020094456151127815, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.949131965637207, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996812641620636, "step": 203 }, { "completion_length": 221.0, "epoch": 0.6528, "grad_norm": 3.3709828853607178, "kl": 0.070556640625, "learning_rate": 7.4625e-07, "loss": 0.0007, "reward": 3.8844679594039917, "reward_std": 0.05386691028252244, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.934579610824585, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9498883783817291, "step": 204 }, { "completion_length": 195.9375, "epoch": 0.656, "grad_norm": 2.4978103637695312, "kl": 0.0775146484375, "learning_rate": 7.45e-07, "loss": 0.0008, "reward": 3.9303336143493652, "reward_std": 0.04689153959043324, "rewards/answer_entity_reward": 0.9804924428462982, "rewards/answer_wer_reward": 0.9526000618934631, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9972410500049591, "step": 205 }, { "completion_length": 256.875, "epoch": 0.6592, "grad_norm": 2.3422584533691406, "kl": 0.1229248046875, "learning_rate": 7.4375e-07, "loss": 0.0012, "reward": 3.9243087768554688, "reward_std": 0.019790570251643658, "rewards/answer_entity_reward": 0.9764957129955292, "rewards/answer_wer_reward": 0.9478131830692291, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 206 }, { "completion_length": 204.15625, "epoch": 0.6624, "grad_norm": 2.19623064994812, "kl": 0.0550537109375, "learning_rate": 7.425e-07, "loss": 0.0006, "reward": 3.936911940574646, "reward_std": 0.02031032182276249, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9463189840316772, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9905929565429688, "step": 207 }, { "completion_length": 225.21875, "epoch": 0.6656, "grad_norm": 5.279341220855713, "kl": 0.0498046875, "learning_rate": 7.412499999999999e-07, "loss": 0.0005, "reward": 3.915460228919983, "reward_std": 0.015285669825971127, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9175935089588165, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978667497634888, "step": 208 }, { "completion_length": 188.78125, "epoch": 0.6688, "grad_norm": 3.7716915607452393, "kl": 0.0576171875, "learning_rate": 7.4e-07, "loss": 0.0006, "reward": 3.8296241760253906, "reward_std": 0.017440371215343475, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9421272277832031, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8899007737636566, "step": 209 }, { "completion_length": 203.28125, "epoch": 0.672, "grad_norm": 1.2790639400482178, "kl": 0.0582275390625, "learning_rate": 7.3875e-07, "loss": 0.0006, "reward": 3.952346086502075, "reward_std": 0.007349871098995209, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.969746857881546, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9825991690158844, "step": 210 }, { "completion_length": 196.1875, "epoch": 0.6752, "grad_norm": 14.005128860473633, "kl": 0.0604248046875, "learning_rate": 7.375e-07, "loss": 0.0006, "reward": 3.8537105321884155, "reward_std": 0.012695960700511932, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9704558551311493, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8832548260688782, "step": 211 }, { "completion_length": 159.3125, "epoch": 0.6784, "grad_norm": 4.394070625305176, "kl": 0.068115234375, "learning_rate": 7.362499999999999e-07, "loss": 0.0007, "reward": 3.9123398065567017, "reward_std": 0.02882718201726675, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9466139674186707, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.965725839138031, "step": 212 }, { "completion_length": 238.75, "epoch": 0.6816, "grad_norm": 5.395397663116455, "kl": 0.041748046875, "learning_rate": 7.35e-07, "loss": 0.0004, "reward": 3.89706289768219, "reward_std": 0.0131816565990448, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9062366485595703, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993489682674408, "step": 213 }, { "completion_length": 255.65625, "epoch": 0.6848, "grad_norm": 1.9760891199111938, "kl": 0.03961181640625, "learning_rate": 7.3375e-07, "loss": 0.0004, "reward": 3.917116641998291, "reward_std": 0.04898790689185262, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9182944297790527, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988220930099487, "step": 214 }, { "completion_length": 165.75, "epoch": 0.688, "grad_norm": 2.763314723968506, "kl": 0.0577392578125, "learning_rate": 7.325e-07, "loss": 0.0006, "reward": 3.952502489089966, "reward_std": 0.016542275436222553, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9569029808044434, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990717768669128, "step": 215 }, { "completion_length": 215.625, "epoch": 0.6912, "grad_norm": 7.516313552856445, "kl": 0.0439453125, "learning_rate": 7.312499999999999e-07, "loss": 0.0004, "reward": 3.9650633335113525, "reward_std": 0.015061032958328724, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9679040908813477, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 216 }, { "completion_length": 227.84375, "epoch": 0.6944, "grad_norm": 1.8075324296951294, "kl": 0.0511474609375, "learning_rate": 7.3e-07, "loss": 0.0005, "reward": 3.9209293127059937, "reward_std": 0.01800437457859516, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9266109764575958, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 217 }, { "completion_length": 213.625, "epoch": 0.6976, "grad_norm": 5.917069911956787, "kl": 0.0426025390625, "learning_rate": 7.2875e-07, "loss": 0.0004, "reward": 3.9082109928131104, "reward_std": 0.07417950965464115, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9089923202991486, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999218761920929, "step": 218 }, { "completion_length": 228.6875, "epoch": 0.7008, "grad_norm": 1.1044409275054932, "kl": 0.0531005859375, "learning_rate": 7.275e-07, "loss": 0.0005, "reward": 3.908870220184326, "reward_std": 0.016815255396068096, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9117993116378784, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994747638702393, "step": 219 }, { "completion_length": 199.125, "epoch": 0.704, "grad_norm": 3.019407272338867, "kl": 0.058837890625, "learning_rate": 7.262499999999999e-07, "loss": 0.0006, "reward": 3.925763249397278, "reward_std": 0.01313594076782465, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9272693395614624, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984939694404602, "step": 220 }, { "completion_length": 210.65625, "epoch": 0.7072, "grad_norm": 2.7719058990478516, "kl": 0.0377197265625, "learning_rate": 7.249999999999999e-07, "loss": 0.0004, "reward": 3.8708763122558594, "reward_std": 0.028095172019675374, "rewards/answer_entity_reward": 0.9812500178813934, "rewards/answer_wer_reward": 0.9290285110473633, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.960597813129425, "step": 221 }, { "completion_length": 199.6875, "epoch": 0.7104, "grad_norm": 2.267350435256958, "kl": 0.0660400390625, "learning_rate": 7.2375e-07, "loss": 0.0006, "reward": 3.9580957889556885, "reward_std": 0.03087126836180687, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9705802798271179, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9899193644523621, "step": 222 }, { "completion_length": 181.8125, "epoch": 0.7136, "grad_norm": 8.685694694519043, "kl": 0.081787109375, "learning_rate": 7.225e-07, "loss": 0.0008, "reward": 3.8902955055236816, "reward_std": 0.011068197898566723, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9720200002193451, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9182755053043365, "step": 223 }, { "completion_length": 185.4375, "epoch": 0.7168, "grad_norm": 2.514770746231079, "kl": 0.0609130859375, "learning_rate": 7.212499999999999e-07, "loss": 0.0006, "reward": 3.9320486783981323, "reward_std": 0.033941914327442646, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9598598778247833, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.972188800573349, "step": 224 }, { "completion_length": 250.84375, "epoch": 0.72, "grad_norm": 1.7914812564849854, "kl": 0.03045654296875, "learning_rate": 7.2e-07, "loss": 0.0003, "reward": 3.8908780813217163, "reward_std": 0.03203156217932701, "rewards/answer_entity_reward": 0.9678819179534912, "rewards/answer_wer_reward": 0.9238358736038208, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991601407527924, "step": 225 }, { "completion_length": 249.125, "epoch": 0.7232, "grad_norm": 4.627202987670898, "kl": 0.0531005859375, "learning_rate": 7.1875e-07, "loss": 0.0005, "reward": 3.899629235267639, "reward_std": 0.06726673897355795, "rewards/answer_entity_reward": 0.9953208565711975, "rewards/answer_wer_reward": 0.9247469902038574, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9795613586902618, "step": 226 }, { "completion_length": 214.40625, "epoch": 0.7264, "grad_norm": 1.942586064338684, "kl": 0.0352783203125, "learning_rate": 7.175e-07, "loss": 0.0003, "reward": 3.959649443626404, "reward_std": 0.01394367078319192, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9649502038955688, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975401163101196, "step": 227 }, { "completion_length": 182.59375, "epoch": 0.7296, "grad_norm": 3.191298246383667, "kl": 0.055419921875, "learning_rate": 7.1625e-07, "loss": 0.0005, "reward": 3.9260960817337036, "reward_std": 0.021659906953573227, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9576999247074127, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9712370932102203, "step": 228 }, { "completion_length": 212.53125, "epoch": 0.7328, "grad_norm": 1.0323834419250488, "kl": 0.0533447265625, "learning_rate": 7.149999999999999e-07, "loss": 0.0005, "reward": 3.939168095588684, "reward_std": 0.009458722081035376, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9402457773685455, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989224076271057, "step": 229 }, { "completion_length": 187.8125, "epoch": 0.736, "grad_norm": 4.53863000869751, "kl": 0.050537109375, "learning_rate": 7.137499999999999e-07, "loss": 0.0005, "reward": 3.893386960029602, "reward_std": 0.03008814249187708, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.9532185792922974, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.946044385433197, "step": 230 }, { "completion_length": 235.5, "epoch": 0.7392, "grad_norm": 2.1737990379333496, "kl": 0.0477294921875, "learning_rate": 7.125e-07, "loss": 0.0005, "reward": 3.8995944261550903, "reward_std": 0.021292359568178654, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9127146005630493, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9868797659873962, "step": 231 }, { "completion_length": 230.625, "epoch": 0.7424, "grad_norm": 0.8920266628265381, "kl": 0.02874755859375, "learning_rate": 7.1125e-07, "loss": 0.0003, "reward": 3.9383678436279297, "reward_std": 0.008275180356577039, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9394271969795227, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989406764507294, "step": 232 }, { "completion_length": 196.125, "epoch": 0.7456, "grad_norm": 2.1836190223693848, "kl": 0.06640625, "learning_rate": 7.1e-07, "loss": 0.0007, "reward": 3.9469913244247437, "reward_std": 0.01094681373797357, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9498908519744873, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9971005320549011, "step": 233 }, { "completion_length": 200.6875, "epoch": 0.7488, "grad_norm": 1.5529507398605347, "kl": 0.041748046875, "learning_rate": 7.0875e-07, "loss": 0.0004, "reward": 3.8839221000671387, "reward_std": 0.02069476176984608, "rewards/answer_entity_reward": 0.9841346144676208, "rewards/answer_wer_reward": 0.9540095031261444, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.945777952671051, "step": 234 }, { "completion_length": 222.90625, "epoch": 0.752, "grad_norm": 17.55677604675293, "kl": 0.061767578125, "learning_rate": 7.075e-07, "loss": 0.0006, "reward": 3.92560076713562, "reward_std": 0.03323593852110207, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9402145445346832, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.989062488079071, "step": 235 }, { "completion_length": 195.0625, "epoch": 0.7552, "grad_norm": 1.7806612253189087, "kl": 0.056640625, "learning_rate": 7.0625e-07, "loss": 0.0006, "reward": 3.9366722106933594, "reward_std": 0.02212852332741022, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9515082538127899, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9886362552642822, "step": 236 }, { "completion_length": 224.34375, "epoch": 0.7584, "grad_norm": 3.0402088165283203, "kl": 0.0352783203125, "learning_rate": 7.049999999999999e-07, "loss": 0.0004, "reward": 3.947329044342041, "reward_std": 0.011976622510701418, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.961329847574234, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.988840252161026, "step": 237 }, { "completion_length": 223.53125, "epoch": 0.7616, "grad_norm": 2.889293670654297, "kl": 0.0616455078125, "learning_rate": 7.037499999999999e-07, "loss": 0.0006, "reward": 3.9246891736984253, "reward_std": 0.05990536604076624, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9530621469020844, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9750992059707642, "step": 238 }, { "completion_length": 184.78125, "epoch": 0.7648, "grad_norm": 1.2427425384521484, "kl": 0.0623779296875, "learning_rate": 7.024999999999999e-07, "loss": 0.0006, "reward": 3.957573890686035, "reward_std": 0.005278389900922775, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9649269282817841, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 239 }, { "completion_length": 236.28125, "epoch": 0.768, "grad_norm": 2.361463785171509, "kl": 0.0545654296875, "learning_rate": 7.0125e-07, "loss": 0.0005, "reward": 3.9197674989700317, "reward_std": 0.02553732506930828, "rewards/answer_entity_reward": 0.9834134578704834, "rewards/answer_wer_reward": 0.9363541007041931, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 240 }, { "completion_length": 174.0, "epoch": 0.7712, "grad_norm": 2.3930962085723877, "kl": 0.05926513671875, "learning_rate": 7e-07, "loss": 0.0006, "reward": 3.9211114645004272, "reward_std": 0.008784215082414448, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9724419414997101, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9486694633960724, "step": 241 }, { "completion_length": 254.59375, "epoch": 0.7744, "grad_norm": 1.6553773880004883, "kl": 0.0389404296875, "learning_rate": 6.9875e-07, "loss": 0.0004, "reward": 3.929746985435486, "reward_std": 0.012057055719196796, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9313917756080627, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983552694320679, "step": 242 }, { "completion_length": 235.375, "epoch": 0.7776, "grad_norm": 0.8029008507728577, "kl": 0.04083251953125, "learning_rate": 6.975e-07, "loss": 0.0004, "reward": 3.9153066873550415, "reward_std": 0.005760843865573406, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9309280216693878, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9927119016647339, "step": 243 }, { "completion_length": 186.78125, "epoch": 0.7808, "grad_norm": 3.1181294918060303, "kl": 0.0732421875, "learning_rate": 6.9625e-07, "loss": 0.0007, "reward": 3.9115726947784424, "reward_std": 0.007224578293971717, "rewards/answer_entity_reward": 0.9707792401313782, "rewards/answer_wer_reward": 0.940793514251709, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 244 }, { "completion_length": 223.6875, "epoch": 0.784, "grad_norm": 1.3839703798294067, "kl": 0.0380859375, "learning_rate": 6.949999999999999e-07, "loss": 0.0004, "reward": 3.9361883401870728, "reward_std": 0.012964933644980192, "rewards/answer_entity_reward": 0.9818618893623352, "rewards/answer_wer_reward": 0.9550732672214508, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999253123998642, "step": 245 }, { "completion_length": 222.53125, "epoch": 0.7872, "grad_norm": 3.1735548973083496, "kl": 0.072509765625, "learning_rate": 6.937499999999999e-07, "loss": 0.0007, "reward": 3.9446396827697754, "reward_std": 0.023095417767763138, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9603613913059235, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9946948885917664, "step": 246 }, { "completion_length": 217.4375, "epoch": 0.7904, "grad_norm": 1.185796856880188, "kl": 0.042236328125, "learning_rate": 6.924999999999999e-07, "loss": 0.0004, "reward": 3.9417611360549927, "reward_std": 0.013147154357284307, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9470057189464569, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9947555065155029, "step": 247 }, { "completion_length": 240.59375, "epoch": 0.7936, "grad_norm": 2.088177442550659, "kl": 0.0504150390625, "learning_rate": 6.9125e-07, "loss": 0.0005, "reward": 3.9391993284225464, "reward_std": 0.015122740995138884, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9413229823112488, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9978764653205872, "step": 248 }, { "completion_length": 251.8125, "epoch": 0.7968, "grad_norm": 1.0327165126800537, "kl": 0.0439453125, "learning_rate": 6.9e-07, "loss": 0.0004, "reward": 3.928339123725891, "reward_std": 0.014733773190528154, "rewards/answer_entity_reward": 0.9895104765892029, "rewards/answer_wer_reward": 0.9401907324790955, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986380338668823, "step": 249 }, { "completion_length": 202.125, "epoch": 0.8, "grad_norm": 1.0536175966262817, "kl": 0.0443115234375, "learning_rate": 6.8875e-07, "loss": 0.0004, "reward": 3.9324183464050293, "reward_std": 0.018241871614009142, "rewards/answer_entity_reward": 0.9873737692832947, "rewards/answer_wer_reward": 0.9567070603370667, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9883374869823456, "step": 250 }, { "completion_length": 231.59375, "epoch": 0.8032, "grad_norm": 1.8605543375015259, "kl": 0.0467529296875, "learning_rate": 6.875e-07, "loss": 0.0005, "reward": 3.9515386819839478, "reward_std": 0.014535096473991871, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9524115920066833, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991269707679749, "step": 251 }, { "completion_length": 202.8125, "epoch": 0.8064, "grad_norm": 1.7101868391036987, "kl": 0.0673828125, "learning_rate": 6.8625e-07, "loss": 0.0007, "reward": 3.947361946105957, "reward_std": 0.01079330500215292, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9485193192958832, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988425970077515, "step": 252 }, { "completion_length": 194.4375, "epoch": 0.8096, "grad_norm": 1.6060519218444824, "kl": 0.0518798828125, "learning_rate": 6.85e-07, "loss": 0.0005, "reward": 3.8238483667373657, "reward_std": 0.09831315139308572, "rewards/answer_entity_reward": 0.9366161823272705, "rewards/answer_wer_reward": 0.888142466545105, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990898072719574, "step": 253 }, { "completion_length": 231.71875, "epoch": 0.8128, "grad_norm": 1.4323464632034302, "kl": 0.04559326171875, "learning_rate": 6.837499999999999e-07, "loss": 0.0005, "reward": 3.9585113525390625, "reward_std": 0.009139138273894787, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9591011703014374, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994101524353027, "step": 254 }, { "completion_length": 242.15625, "epoch": 0.816, "grad_norm": 1.638405442237854, "kl": 0.0592041015625, "learning_rate": 6.824999999999999e-07, "loss": 0.0006, "reward": 3.938191056251526, "reward_std": 0.015181098598986864, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9465242922306061, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 255 }, { "completion_length": 178.96875, "epoch": 0.8192, "grad_norm": 2.906489133834839, "kl": 0.07958984375, "learning_rate": 6.8125e-07, "loss": 0.0008, "reward": 3.9418115615844727, "reward_std": 0.024727396899834275, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9549268186092377, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9925665557384491, "step": 256 }, { "completion_length": 191.59375, "epoch": 0.8224, "grad_norm": 4.772871494293213, "kl": 0.271484375, "learning_rate": 6.800000000000001e-07, "loss": 0.0027, "reward": 3.9085776805877686, "reward_std": 0.01904244115576148, "rewards/answer_entity_reward": 0.9866071343421936, "rewards/answer_wer_reward": 0.9542762637138367, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9676942825317383, "step": 257 }, { "completion_length": 192.78125, "epoch": 0.8256, "grad_norm": 2.3399181365966797, "kl": 0.081787109375, "learning_rate": 6.7875e-07, "loss": 0.0008, "reward": 3.930221199989319, "reward_std": 0.014671812066808343, "rewards/answer_entity_reward": 0.9867201447486877, "rewards/answer_wer_reward": 0.9438917338848114, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996093809604645, "step": 258 }, { "completion_length": 187.5, "epoch": 0.8288, "grad_norm": 9.805069923400879, "kl": 0.072265625, "learning_rate": 6.775e-07, "loss": 0.0007, "reward": 3.939017653465271, "reward_std": 0.016680479515343904, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9445324242115021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 259 }, { "completion_length": 234.5625, "epoch": 0.832, "grad_norm": 1.5217561721801758, "kl": 0.0516357421875, "learning_rate": 6.7625e-07, "loss": 0.0005, "reward": 3.922031283378601, "reward_std": 0.01609009224921465, "rewards/answer_entity_reward": 0.9681277275085449, "rewards/answer_wer_reward": 0.9539035856723785, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 260 }, { "completion_length": 159.0, "epoch": 0.8352, "grad_norm": 2.5927042961120605, "kl": 0.0557861328125, "learning_rate": 6.75e-07, "loss": 0.0006, "reward": 3.9503369331359863, "reward_std": 0.004757039016112685, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9792385697364807, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9710983335971832, "step": 261 }, { "completion_length": 222.15625, "epoch": 0.8384, "grad_norm": 1.9485008716583252, "kl": 0.0928955078125, "learning_rate": 6.737499999999999e-07, "loss": 0.0009, "reward": 3.9718098640441895, "reward_std": 0.01134553411975503, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9718098938465118, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 262 }, { "completion_length": 248.875, "epoch": 0.8416, "grad_norm": 5.045698165893555, "kl": 0.0552978515625, "learning_rate": 6.724999999999999e-07, "loss": 0.0006, "reward": 3.799831986427307, "reward_std": 0.03707320708781481, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9218086004257202, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.883705198764801, "step": 263 }, { "completion_length": 157.6875, "epoch": 0.8448, "grad_norm": 1.9603397846221924, "kl": 0.14111328125, "learning_rate": 6.7125e-07, "loss": 0.0014, "reward": 3.9334217309951782, "reward_std": 0.00959050771780312, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9538573622703552, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.987897664308548, "step": 264 }, { "completion_length": 249.21875, "epoch": 0.848, "grad_norm": 1.720057725906372, "kl": 0.102783203125, "learning_rate": 6.7e-07, "loss": 0.001, "reward": 3.9404491186141968, "reward_std": 0.023797483183443546, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9459458291530609, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997479915618896, "step": 265 }, { "completion_length": 200.65625, "epoch": 0.8512, "grad_norm": 1.7017474174499512, "kl": 0.06640625, "learning_rate": 6.6875e-07, "loss": 0.0007, "reward": 3.897473454475403, "reward_std": 0.017802401445806026, "rewards/answer_entity_reward": 0.9892628192901611, "rewards/answer_wer_reward": 0.9560422301292419, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.952168345451355, "step": 266 }, { "completion_length": 206.9375, "epoch": 0.8544, "grad_norm": 1.7645119428634644, "kl": 0.107177734375, "learning_rate": 6.675e-07, "loss": 0.0011, "reward": 3.919585347175598, "reward_std": 0.017358362209051847, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9206817746162415, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989035129547119, "step": 267 }, { "completion_length": 234.125, "epoch": 0.8576, "grad_norm": 2.324972629547119, "kl": 0.07275390625, "learning_rate": 6.6625e-07, "loss": 0.0007, "reward": 3.8366565704345703, "reward_std": 0.03994511067867279, "rewards/answer_entity_reward": 0.9375, "rewards/answer_wer_reward": 0.9288243353366852, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9703322649002075, "step": 268 }, { "completion_length": 163.28125, "epoch": 0.8608, "grad_norm": 3.44211483001709, "kl": 0.07080078125, "learning_rate": 6.65e-07, "loss": 0.0007, "reward": 3.8973175287246704, "reward_std": 0.051633019000291824, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9550660252571106, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9457237124443054, "step": 269 }, { "completion_length": 198.0625, "epoch": 0.864, "grad_norm": 5.092156887054443, "kl": 0.072998046875, "learning_rate": 6.637499999999999e-07, "loss": 0.0007, "reward": 3.940290689468384, "reward_std": 0.009564612759277225, "rewards/answer_entity_reward": 0.9821428656578064, "rewards/answer_wer_reward": 0.958147794008255, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 270 }, { "completion_length": 138.875, "epoch": 0.8672, "grad_norm": 3.998215913772583, "kl": 0.05889892578125, "learning_rate": 6.624999999999999e-07, "loss": 0.0006, "reward": 3.9329700469970703, "reward_std": 0.05405183229595423, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9581792652606964, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9782631099224091, "step": 271 }, { "completion_length": 208.53125, "epoch": 0.8704, "grad_norm": 2.191901206970215, "kl": 0.06884765625, "learning_rate": 6.6125e-07, "loss": 0.0007, "reward": 3.956714630126953, "reward_std": 0.01909107668325305, "rewards/answer_entity_reward": 0.993686854839325, "rewards/answer_wer_reward": 0.9632268249988556, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9998009502887726, "step": 272 }, { "completion_length": 196.71875, "epoch": 0.8736, "grad_norm": 3.2068357467651367, "kl": 0.0513916015625, "learning_rate": 6.6e-07, "loss": 0.0005, "reward": 3.9089767932891846, "reward_std": 0.035889009945094585, "rewards/answer_entity_reward": 0.9902777671813965, "rewards/answer_wer_reward": 0.934887707233429, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9838112890720367, "step": 273 }, { "completion_length": 238.03125, "epoch": 0.8768, "grad_norm": 12.858990669250488, "kl": 0.0513916015625, "learning_rate": 6.587499999999999e-07, "loss": 0.0005, "reward": 3.9507744312286377, "reward_std": 0.012679634615778923, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9518805146217346, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988937973976135, "step": 274 }, { "completion_length": 215.03125, "epoch": 0.88, "grad_norm": 6.914164066314697, "kl": 0.053466796875, "learning_rate": 6.575e-07, "loss": 0.0005, "reward": 3.920554757118225, "reward_std": 0.01066223531961441, "rewards/answer_entity_reward": 0.9821428656578064, "rewards/answer_wer_reward": 0.9384119212627411, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 275 }, { "completion_length": 170.3125, "epoch": 0.8832, "grad_norm": 1.4424182176589966, "kl": 0.0533447265625, "learning_rate": 6.5625e-07, "loss": 0.0005, "reward": 3.8676129579544067, "reward_std": 0.015859364066272974, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9279236793518066, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9396892189979553, "step": 276 }, { "completion_length": 203.0, "epoch": 0.8864, "grad_norm": 1.4304486513137817, "kl": 0.040771484375, "learning_rate": 6.55e-07, "loss": 0.0004, "reward": 3.9131808280944824, "reward_std": 0.020121398381888866, "rewards/answer_entity_reward": 0.9930555820465088, "rewards/answer_wer_reward": 0.9201253056526184, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 277 }, { "completion_length": 199.9375, "epoch": 0.8896, "grad_norm": 4.607363700866699, "kl": 0.0810546875, "learning_rate": 6.5375e-07, "loss": 0.0008, "reward": 3.9438611268997192, "reward_std": 0.014630983117967844, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9560317695140839, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.989912748336792, "step": 278 }, { "completion_length": 215.75, "epoch": 0.8928, "grad_norm": 0.9500401020050049, "kl": 0.0498046875, "learning_rate": 6.524999999999999e-07, "loss": 0.0005, "reward": 3.9393136501312256, "reward_std": 0.010870016179978848, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9396113157272339, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997023940086365, "step": 279 }, { "completion_length": 211.4375, "epoch": 0.896, "grad_norm": 2.4634454250335693, "kl": 0.08154296875, "learning_rate": 6.5125e-07, "loss": 0.0008, "reward": 3.8559117317199707, "reward_std": 0.020915272179991007, "rewards/answer_entity_reward": 0.9944444298744202, "rewards/answer_wer_reward": 0.9251176416873932, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9363496899604797, "step": 280 }, { "completion_length": 172.8125, "epoch": 0.8992, "grad_norm": 5.569718360900879, "kl": 0.1357421875, "learning_rate": 6.5e-07, "loss": 0.0014, "reward": 3.87375545501709, "reward_std": 0.04026831593364477, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9294662475585938, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9442892372608185, "step": 281 }, { "completion_length": 114.875, "epoch": 0.9024, "grad_norm": 4.26852560043335, "kl": 0.053955078125, "learning_rate": 6.4875e-07, "loss": 0.0005, "reward": 3.909887909889221, "reward_std": 0.015241059940308332, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9791332483291626, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9335956573486328, "step": 282 }, { "completion_length": 245.0, "epoch": 0.9056, "grad_norm": 1.3898316621780396, "kl": 0.0450439453125, "learning_rate": 6.474999999999999e-07, "loss": 0.0005, "reward": 3.9195964336395264, "reward_std": 0.018749097362160683, "rewards/answer_entity_reward": 0.9911437332630157, "rewards/answer_wer_reward": 0.9284527003765106, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 283 }, { "completion_length": 218.75, "epoch": 0.9088, "grad_norm": 4.705906391143799, "kl": 0.0338134765625, "learning_rate": 6.4625e-07, "loss": 0.0003, "reward": 3.9526829719543457, "reward_std": 0.012810520827770233, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9526830613613129, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 284 }, { "completion_length": 175.15625, "epoch": 0.912, "grad_norm": 1.7440683841705322, "kl": 0.0616455078125, "learning_rate": 6.45e-07, "loss": 0.0006, "reward": 3.9307706356048584, "reward_std": 0.014890296617522836, "rewards/answer_entity_reward": 0.9845238327980042, "rewards/answer_wer_reward": 0.9668512642383575, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9793955981731415, "step": 285 }, { "completion_length": 154.3125, "epoch": 0.9152, "grad_norm": 2.3717188835144043, "kl": 0.0599365234375, "learning_rate": 6.4375e-07, "loss": 0.0006, "reward": 3.9156084060668945, "reward_std": 0.013419507071375847, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.951806515455246, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9638019800186157, "step": 286 }, { "completion_length": 226.40625, "epoch": 0.9184, "grad_norm": 2.069488525390625, "kl": 0.058349609375, "learning_rate": 6.424999999999999e-07, "loss": 0.0006, "reward": 3.8257880210876465, "reward_std": 0.023342549800872803, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9156993925571442, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9186112284660339, "step": 287 }, { "completion_length": 203.21875, "epoch": 0.9216, "grad_norm": 1.8522766828536987, "kl": 0.0611572265625, "learning_rate": 6.4125e-07, "loss": 0.0006, "reward": 3.9413124322891235, "reward_std": 0.014133658958598971, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9447846114635468, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 288 }, { "completion_length": 182.90625, "epoch": 0.9248, "grad_norm": 3.1601576805114746, "kl": 0.0626220703125, "learning_rate": 6.4e-07, "loss": 0.0006, "reward": 3.934013605117798, "reward_std": 0.020497526740655303, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9598910510540009, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.988011360168457, "step": 289 }, { "completion_length": 235.71875, "epoch": 0.928, "grad_norm": 1.5299009084701538, "kl": 0.062744140625, "learning_rate": 6.3875e-07, "loss": 0.0006, "reward": 3.900187373161316, "reward_std": 0.027182841673493385, "rewards/answer_entity_reward": 0.9859217405319214, "rewards/answer_wer_reward": 0.9156533181667328, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986123144626617, "step": 290 }, { "completion_length": 181.59375, "epoch": 0.9312, "grad_norm": 2.8708431720733643, "kl": 0.09375, "learning_rate": 6.374999999999999e-07, "loss": 0.0009, "reward": 3.878863215446472, "reward_std": 0.016461022198200226, "rewards/answer_entity_reward": 0.9607954621315002, "rewards/answer_wer_reward": 0.9469051957130432, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9711625277996063, "step": 291 }, { "completion_length": 252.71875, "epoch": 0.9344, "grad_norm": 1.3821316957473755, "kl": 0.143798828125, "learning_rate": 6.362499999999999e-07, "loss": 0.0014, "reward": 3.9444687366485596, "reward_std": 0.015690275467932224, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9486355781555176, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 292 }, { "completion_length": 191.5, "epoch": 0.9376, "grad_norm": 3.0700418949127197, "kl": 0.08984375, "learning_rate": 6.35e-07, "loss": 0.0009, "reward": 3.9288469552993774, "reward_std": 0.025998966302722692, "rewards/answer_entity_reward": 0.9910714626312256, "rewards/answer_wer_reward": 0.9580896496772766, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.97968590259552, "step": 293 }, { "completion_length": 236.40625, "epoch": 0.9408, "grad_norm": 0.9392086863517761, "kl": 0.0728759765625, "learning_rate": 6.3375e-07, "loss": 0.0007, "reward": 3.9576098918914795, "reward_std": 0.004891619086265564, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9576099216938019, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 294 }, { "completion_length": 204.125, "epoch": 0.944, "grad_norm": 1.4554882049560547, "kl": 0.044677734375, "learning_rate": 6.324999999999999e-07, "loss": 0.0004, "reward": 3.9175373315811157, "reward_std": 0.008688606787472963, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9530804753303528, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9644568264484406, "step": 295 }, { "completion_length": 230.8125, "epoch": 0.9472, "grad_norm": 0.7801051139831543, "kl": 0.0537109375, "learning_rate": 6.3125e-07, "loss": 0.0005, "reward": 3.941986918449402, "reward_std": 0.011714181862771511, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9448631405830383, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987684786319733, "step": 296 }, { "completion_length": 201.6875, "epoch": 0.9504, "grad_norm": 3.2697925567626953, "kl": 0.0723876953125, "learning_rate": 6.3e-07, "loss": 0.0007, "reward": 3.9148101806640625, "reward_std": 0.02096148394048214, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9371316432952881, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9821428656578064, "step": 297 }, { "completion_length": 174.71875, "epoch": 0.9536, "grad_norm": 1.3895010948181152, "kl": 0.072509765625, "learning_rate": 6.2875e-07, "loss": 0.0007, "reward": 3.9413623809814453, "reward_std": 0.012068473850376904, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.96162348985672, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9821428656578064, "step": 298 }, { "completion_length": 226.53125, "epoch": 0.9568, "grad_norm": 0.9915501475334167, "kl": 0.0574951171875, "learning_rate": 6.274999999999999e-07, "loss": 0.0006, "reward": 3.9342339038848877, "reward_std": 0.017138528637588024, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9342339336872101, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 299 }, { "completion_length": 185.96875, "epoch": 0.96, "grad_norm": 2.181473970413208, "kl": 0.0693359375, "learning_rate": 6.262499999999999e-07, "loss": 0.0007, "reward": 3.8075177669525146, "reward_std": 0.008563205134123564, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.974321037530899, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8331968486309052, "step": 300 }, { "completion_length": 259.4375, "epoch": 0.9632, "grad_norm": 0.8825593590736389, "kl": 0.053955078125, "learning_rate": 6.249999999999999e-07, "loss": 0.0005, "reward": 3.9282361268997192, "reward_std": 0.01493215560913086, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9290694296360016, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991666674613953, "step": 301 }, { "completion_length": 233.4375, "epoch": 0.9664, "grad_norm": 2.377093553543091, "kl": 0.08251953125, "learning_rate": 6.2375e-07, "loss": 0.0008, "reward": 3.8652896881103516, "reward_std": 0.04854640178382397, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.931235283613205, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9392991065979004, "step": 302 }, { "completion_length": 214.9375, "epoch": 0.9696, "grad_norm": 2.7887818813323975, "kl": 0.0765380859375, "learning_rate": 6.225000000000001e-07, "loss": 0.0008, "reward": 3.916442394256592, "reward_std": 0.014312040992081165, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9577742516994476, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9586680829524994, "step": 303 }, { "completion_length": 195.53125, "epoch": 0.9728, "grad_norm": 1.3930556774139404, "kl": 0.0662841796875, "learning_rate": 6.2125e-07, "loss": 0.0007, "reward": 3.8324824571609497, "reward_std": 0.013787610223516822, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9709192514419556, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8672450482845306, "step": 304 }, { "completion_length": 221.65625, "epoch": 0.976, "grad_norm": 1.6060283184051514, "kl": 0.046875, "learning_rate": 6.2e-07, "loss": 0.0005, "reward": 3.9341059923171997, "reward_std": 0.016552825924009085, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9438435733318329, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9902624487876892, "step": 305 }, { "completion_length": 274.4375, "epoch": 0.9792, "grad_norm": 2.2774875164031982, "kl": 0.0582275390625, "learning_rate": 6.1875e-07, "loss": 0.0006, "reward": 3.8809224367141724, "reward_std": 0.03468186687678099, "rewards/answer_entity_reward": 0.9755851626396179, "rewards/answer_wer_reward": 0.9063642621040344, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989729523658752, "step": 306 }, { "completion_length": 243.875, "epoch": 0.9824, "grad_norm": 1.4776897430419922, "kl": 0.0865478515625, "learning_rate": 6.175e-07, "loss": 0.0009, "reward": 3.921198606491089, "reward_std": 0.029711266048252583, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9262239336967468, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984469413757324, "step": 307 }, { "completion_length": 230.6875, "epoch": 0.9856, "grad_norm": 0.8870422840118408, "kl": 0.0528564453125, "learning_rate": 6.162499999999999e-07, "loss": 0.0005, "reward": 3.9468624591827393, "reward_std": 0.010126703884452581, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9468623399734497, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 308 }, { "completion_length": 193.53125, "epoch": 0.9888, "grad_norm": 1.2648320198059082, "kl": 0.0474853515625, "learning_rate": 6.149999999999999e-07, "loss": 0.0005, "reward": 3.9692437648773193, "reward_std": 0.010907594813033938, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9716475903987885, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 309 }, { "completion_length": 226.84375, "epoch": 0.992, "grad_norm": 2.5334410667419434, "kl": 0.099609375, "learning_rate": 6.1375e-07, "loss": 0.001, "reward": 3.932776689529419, "reward_std": 0.025886752177029848, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9474222362041473, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9916044771671295, "step": 310 }, { "completion_length": 202.40625, "epoch": 0.9952, "grad_norm": 1.6191986799240112, "kl": 0.059326171875, "learning_rate": 6.125000000000001e-07, "loss": 0.0006, "reward": 3.923641085624695, "reward_std": 0.016786989755928516, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9264820218086243, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 311 }, { "completion_length": 226.125, "epoch": 0.9984, "grad_norm": 2.3516252040863037, "kl": 0.0587158203125, "learning_rate": 6.1125e-07, "loss": 0.0006, "reward": 3.822533130645752, "reward_std": 0.19381592608988285, "rewards/answer_entity_reward": 0.9630681872367859, "rewards/answer_wer_reward": 0.8977905511856079, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9929245114326477, "step": 312 }, { "completion_length": 164.4375, "epoch": 1.0, "grad_norm": 9.48376178741455, "kl": 0.04345703125, "learning_rate": 6.1e-07, "loss": 0.0002, "reward": 3.9722466468811035, "reward_std": 0.021218769252300262, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9880585074424744, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980769157409668, "step": 313 }, { "completion_length": 194.0625, "epoch": 1.0032, "grad_norm": 1.5969237089157104, "kl": 0.0419921875, "learning_rate": 6.0875e-07, "loss": 0.0004, "reward": 3.9741499423980713, "reward_std": 0.00955872773192823, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9776757061481476, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985576868057251, "step": 314 }, { "completion_length": 174.25, "epoch": 1.0064, "grad_norm": 5.0026326179504395, "kl": 0.07470703125, "learning_rate": 6.075e-07, "loss": 0.0007, "reward": 3.9532389640808105, "reward_std": 0.01782281370833516, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9582388997077942, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9950000047683716, "step": 315 }, { "completion_length": 218.3125, "epoch": 1.0096, "grad_norm": 1.521260142326355, "kl": 0.072509765625, "learning_rate": 6.062499999999999e-07, "loss": 0.0007, "reward": 3.891371011734009, "reward_std": 0.037183830980211496, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9465020596981049, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9496767222881317, "step": 316 }, { "completion_length": 181.21875, "epoch": 1.0128, "grad_norm": 2.444070339202881, "kl": 0.1011962890625, "learning_rate": 6.049999999999999e-07, "loss": 0.001, "reward": 3.957024097442627, "reward_std": 0.015732225496321917, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9627059102058411, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 317 }, { "completion_length": 214.8125, "epoch": 1.016, "grad_norm": 5.038032054901123, "kl": 0.081298828125, "learning_rate": 6.037499999999999e-07, "loss": 0.0008, "reward": 3.905093193054199, "reward_std": 0.02073481073603034, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9350383579730988, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.97005495429039, "step": 318 }, { "completion_length": 209.8125, "epoch": 1.0192, "grad_norm": 3.9700140953063965, "kl": 0.07373046875, "learning_rate": 6.025000000000001e-07, "loss": 0.0007, "reward": 3.8465429544448853, "reward_std": 0.044920976273715496, "rewards/answer_entity_reward": 0.953125, "rewards/answer_wer_reward": 0.935539960861206, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9578781127929688, "step": 319 }, { "completion_length": 242.8125, "epoch": 1.0224, "grad_norm": 1.1018257141113281, "kl": 0.0404052734375, "learning_rate": 6.0125e-07, "loss": 0.0004, "reward": 3.9351298809051514, "reward_std": 0.00889231264591217, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9503234028816223, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986952543258667, "step": 320 }, { "completion_length": 178.65625, "epoch": 1.0256, "grad_norm": 1.2945948839187622, "kl": 0.059326171875, "learning_rate": 6e-07, "loss": 0.0006, "reward": 3.9444717168807983, "reward_std": 0.010739851742982864, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9468754827976227, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 321 }, { "completion_length": 158.75, "epoch": 1.0288, "grad_norm": 1.9997080564498901, "kl": 0.10498046875, "learning_rate": 5.9875e-07, "loss": 0.001, "reward": 3.8997615575790405, "reward_std": 0.0878201499581337, "rewards/answer_entity_reward": 0.9768981039524078, "rewards/answer_wer_reward": 0.9317395091056824, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9911239445209503, "step": 322 }, { "completion_length": 202.78125, "epoch": 1.032, "grad_norm": 2.5343425273895264, "kl": 0.047119140625, "learning_rate": 5.975e-07, "loss": 0.0005, "reward": 3.9625836610794067, "reward_std": 0.0073791013564914465, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9652430713176727, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973404407501221, "step": 323 }, { "completion_length": 181.9375, "epoch": 1.0352, "grad_norm": 7.240401744842529, "kl": 0.067138671875, "learning_rate": 5.962499999999999e-07, "loss": 0.0007, "reward": 3.828685760498047, "reward_std": 0.04627671558409929, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.951274037361145, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.882219523191452, "step": 324 }, { "completion_length": 209.75, "epoch": 1.0384, "grad_norm": 2.1784214973449707, "kl": 0.0810546875, "learning_rate": 5.949999999999999e-07, "loss": 0.0008, "reward": 3.9578659534454346, "reward_std": 0.015447806101292372, "rewards/answer_entity_reward": 0.9947552382946014, "rewards/answer_wer_reward": 0.9634187519550323, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996921122074127, "step": 325 }, { "completion_length": 200.78125, "epoch": 1.0416, "grad_norm": 1.8993250131607056, "kl": 0.086669921875, "learning_rate": 5.937499999999999e-07, "loss": 0.0009, "reward": 3.9622350931167603, "reward_std": 0.011172362137585878, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9622350335121155, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 326 }, { "completion_length": 188.0625, "epoch": 1.0448, "grad_norm": 2.999244213104248, "kl": 0.04931640625, "learning_rate": 5.925e-07, "loss": 0.0005, "reward": 3.8658429384231567, "reward_std": 0.027352653443813324, "rewards/answer_entity_reward": 0.9859203398227692, "rewards/answer_wer_reward": 0.9490468800067902, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9308757185935974, "step": 327 }, { "completion_length": 211.6875, "epoch": 1.048, "grad_norm": 1.4307529926300049, "kl": 0.06982421875, "learning_rate": 5.912500000000001e-07, "loss": 0.0007, "reward": 3.8813902139663696, "reward_std": 0.015089725144207478, "rewards/answer_entity_reward": 0.9800595343112946, "rewards/answer_wer_reward": 0.9558005034923553, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9455301761627197, "step": 328 }, { "completion_length": 184.1875, "epoch": 1.0512, "grad_norm": 1.9804878234863281, "kl": 0.03851318359375, "learning_rate": 5.9e-07, "loss": 0.0004, "reward": 3.9403220415115356, "reward_std": 0.025673750409623608, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.94679394364357, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994041919708252, "step": 329 }, { "completion_length": 200.71875, "epoch": 1.0544, "grad_norm": 1.5184144973754883, "kl": 0.06689453125, "learning_rate": 5.8875e-07, "loss": 0.0007, "reward": 3.945325493812561, "reward_std": 0.021944692358374596, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.951007217168808, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 330 }, { "completion_length": 211.875, "epoch": 1.0576, "grad_norm": 1.228079915046692, "kl": 0.052978515625, "learning_rate": 5.875e-07, "loss": 0.0005, "reward": 3.9120590686798096, "reward_std": 0.015080507844686508, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.912059098482132, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 331 }, { "completion_length": 240.5625, "epoch": 1.0608, "grad_norm": 1.7073534727096558, "kl": 0.1005859375, "learning_rate": 5.8625e-07, "loss": 0.001, "reward": 3.943448066711426, "reward_std": 0.010788221377879381, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9437373280525208, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997106492519379, "step": 332 }, { "completion_length": 217.78125, "epoch": 1.064, "grad_norm": 1.9268385171890259, "kl": 0.0440673828125, "learning_rate": 5.849999999999999e-07, "loss": 0.0004, "reward": 3.9603058099746704, "reward_std": 0.009590512840077281, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9625644087791443, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977414906024933, "step": 333 }, { "completion_length": 188.125, "epoch": 1.0672, "grad_norm": 0.780636727809906, "kl": 0.04638671875, "learning_rate": 5.837499999999999e-07, "loss": 0.0005, "reward": 3.949649691581726, "reward_std": 0.0076717507326975465, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9496497213840485, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 334 }, { "completion_length": 240.71875, "epoch": 1.0704, "grad_norm": 21.118270874023438, "kl": 0.04296875, "learning_rate": 5.825e-07, "loss": 0.0004, "reward": 3.968227982521057, "reward_std": 0.01375247398391366, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9715853631496429, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996642529964447, "step": 335 }, { "completion_length": 251.21875, "epoch": 1.0735999999999999, "grad_norm": 1.0980618000030518, "kl": 0.0467529296875, "learning_rate": 5.8125e-07, "loss": 0.0005, "reward": 3.9321502447128296, "reward_std": 0.02487938292324543, "rewards/answer_entity_reward": 0.987500011920929, "rewards/answer_wer_reward": 0.945962131023407, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986882209777832, "step": 336 }, { "completion_length": 191.0, "epoch": 1.0768, "grad_norm": 1.9901342391967773, "kl": 0.1015625, "learning_rate": 5.8e-07, "loss": 0.001, "reward": 3.860186219215393, "reward_std": 0.008080802159383893, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9668596386909485, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8933265209197998, "step": 337 }, { "completion_length": 222.40625, "epoch": 1.08, "grad_norm": 1.9760770797729492, "kl": 0.0791015625, "learning_rate": 5.7875e-07, "loss": 0.0008, "reward": 3.943527340888977, "reward_std": 0.013376505114138126, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.950738936662674, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 338 }, { "completion_length": 242.75, "epoch": 1.0832, "grad_norm": 1.4690314531326294, "kl": 0.0699462890625, "learning_rate": 5.775e-07, "loss": 0.0007, "reward": 3.946296215057373, "reward_std": 0.010936432983726263, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.946296215057373, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 339 }, { "completion_length": 213.75, "epoch": 1.0864, "grad_norm": 1.3006911277770996, "kl": 0.068603515625, "learning_rate": 5.7625e-07, "loss": 0.0007, "reward": 3.929935932159424, "reward_std": 0.012226814404129982, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9303079545497894, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996279776096344, "step": 340 }, { "completion_length": 203.875, "epoch": 1.0896, "grad_norm": 20.699094772338867, "kl": 0.0606689453125, "learning_rate": 5.749999999999999e-07, "loss": 0.0006, "reward": 3.839663863182068, "reward_std": 0.2153539047576487, "rewards/answer_entity_reward": 0.9632352888584137, "rewards/answer_wer_reward": 0.9303349256515503, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.977343738079071, "step": 341 }, { "completion_length": 229.9375, "epoch": 1.0928, "grad_norm": 10.713321685791016, "kl": 0.062255859375, "learning_rate": 5.737499999999999e-07, "loss": 0.0006, "reward": 3.952810525894165, "reward_std": 0.013096342328935862, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9535458087921143, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992647171020508, "step": 342 }, { "completion_length": 226.0625, "epoch": 1.096, "grad_norm": 5.412719249725342, "kl": 0.068115234375, "learning_rate": 5.725e-07, "loss": 0.0007, "reward": 3.9290108680725098, "reward_std": 0.014630899764597416, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9352608323097229, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9937500059604645, "step": 343 }, { "completion_length": 180.875, "epoch": 1.0992, "grad_norm": 1.5433329343795776, "kl": 0.046875, "learning_rate": 5.7125e-07, "loss": 0.0005, "reward": 3.9217172861099243, "reward_std": 0.007004068233072758, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9350151419639587, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9867021441459656, "step": 344 }, { "completion_length": 228.5625, "epoch": 1.1024, "grad_norm": 1.6970151662826538, "kl": 0.058837890625, "learning_rate": 5.699999999999999e-07, "loss": 0.0006, "reward": 3.9185184240341187, "reward_std": 0.013168168719857931, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9197319746017456, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987863898277283, "step": 345 }, { "completion_length": 155.34375, "epoch": 1.1056, "grad_norm": 1.7489057779312134, "kl": 0.0869140625, "learning_rate": 5.6875e-07, "loss": 0.0009, "reward": 3.9059561491012573, "reward_std": 0.00622332957573235, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9627758860588074, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9431802928447723, "step": 346 }, { "completion_length": 173.40625, "epoch": 1.1088, "grad_norm": 1.3873649835586548, "kl": 0.09033203125, "learning_rate": 5.675e-07, "loss": 0.0009, "reward": 3.9297943115234375, "reward_std": 0.039116960018873215, "rewards/answer_entity_reward": 0.9826389253139496, "rewards/answer_wer_reward": 0.9575237333774567, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9896316528320312, "step": 347 }, { "completion_length": 210.3125, "epoch": 1.112, "grad_norm": 3.549527645111084, "kl": 0.0986328125, "learning_rate": 5.6625e-07, "loss": 0.001, "reward": 3.9249199628829956, "reward_std": 0.019829558208584785, "rewards/answer_entity_reward": 0.9842728972434998, "rewards/answer_wer_reward": 0.9483617842197418, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9922854006290436, "step": 348 }, { "completion_length": 210.21875, "epoch": 1.1152, "grad_norm": 1.7917331457138062, "kl": 0.0712890625, "learning_rate": 5.649999999999999e-07, "loss": 0.0007, "reward": 3.9333280324935913, "reward_std": 0.011767172254621983, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9333280622959137, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 349 }, { "completion_length": 220.1875, "epoch": 1.1184, "grad_norm": 0.8690351247787476, "kl": 0.069580078125, "learning_rate": 5.637499999999999e-07, "loss": 0.0007, "reward": 3.9331865310668945, "reward_std": 0.008595036342740059, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9419363439083099, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9912500977516174, "step": 350 }, { "completion_length": 192.65625, "epoch": 1.1216, "grad_norm": 1.7662582397460938, "kl": 0.076171875, "learning_rate": 5.625e-07, "loss": 0.0008, "reward": 3.950869083404541, "reward_std": 0.020245986990630627, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.951172411441803, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996966123580933, "step": 351 }, { "completion_length": 264.25, "epoch": 1.1248, "grad_norm": 6.877583026885986, "kl": 0.0867919921875, "learning_rate": 5.6125e-07, "loss": 0.0009, "reward": 3.9451229572296143, "reward_std": 0.017284557223320007, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.946128636598587, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989943504333496, "step": 352 }, { "completion_length": 218.4375, "epoch": 1.1280000000000001, "grad_norm": 1.853745460510254, "kl": 0.058837890625, "learning_rate": 5.6e-07, "loss": 0.0006, "reward": 3.9474722146987915, "reward_std": 0.01703261397778988, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9519364535808563, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 353 }, { "completion_length": 229.9375, "epoch": 1.1312, "grad_norm": 7.013837814331055, "kl": 0.079345703125, "learning_rate": 5.587499999999999e-07, "loss": 0.0008, "reward": 3.928715705871582, "reward_std": 0.024107711389660835, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9372670352458954, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9938524663448334, "step": 354 }, { "completion_length": 238.09375, "epoch": 1.1344, "grad_norm": 1.8181698322296143, "kl": 0.0587158203125, "learning_rate": 5.575e-07, "loss": 0.0006, "reward": 3.9445427656173706, "reward_std": 0.028678019531071186, "rewards/answer_entity_reward": 0.9851190447807312, "rewards/answer_wer_reward": 0.9630020260810852, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996421754360199, "step": 355 }, { "completion_length": 199.46875, "epoch": 1.1376, "grad_norm": 17.45456314086914, "kl": 0.44140625, "learning_rate": 5.5625e-07, "loss": 0.0044, "reward": 3.793405294418335, "reward_std": 0.09584336914122105, "rewards/answer_entity_reward": 0.9953208565711975, "rewards/answer_wer_reward": 0.9546021223068237, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.843482255935669, "step": 356 }, { "completion_length": 234.9375, "epoch": 1.1408, "grad_norm": 1.5193853378295898, "kl": 0.056396484375, "learning_rate": 5.55e-07, "loss": 0.0006, "reward": 3.9331583976745605, "reward_std": 0.01793505996465683, "rewards/answer_entity_reward": 0.9901185929775238, "rewards/answer_wer_reward": 0.9450170993804932, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980226159095764, "step": 357 }, { "completion_length": 225.21875, "epoch": 1.144, "grad_norm": 0.7461761236190796, "kl": 0.050048828125, "learning_rate": 5.5375e-07, "loss": 0.0005, "reward": 3.9532158374786377, "reward_std": 0.013632898684591055, "rewards/answer_entity_reward": 0.9930555522441864, "rewards/answer_wer_reward": 0.9601602554321289, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 358 }, { "completion_length": 196.21875, "epoch": 1.1472, "grad_norm": 1.688063621520996, "kl": 0.0589599609375, "learning_rate": 5.525e-07, "loss": 0.0006, "reward": 3.957648277282715, "reward_std": 0.009953869972378016, "rewards/answer_entity_reward": 0.9892857074737549, "rewards/answer_wer_reward": 0.9689917266368866, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993708431720734, "step": 359 }, { "completion_length": 230.875, "epoch": 1.1504, "grad_norm": 1.0592241287231445, "kl": 0.057861328125, "learning_rate": 5.5125e-07, "loss": 0.0006, "reward": 3.9605822563171387, "reward_std": 0.00902467966079712, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.961335301399231, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992469847202301, "step": 360 }, { "completion_length": 177.25, "epoch": 1.1536, "grad_norm": 0.887911856174469, "kl": 0.0631103515625, "learning_rate": 5.5e-07, "loss": 0.0006, "reward": 3.9682934284210205, "reward_std": 0.004935940261930227, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9682934284210205, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 361 }, { "completion_length": 204.09375, "epoch": 1.1568, "grad_norm": 1.4796991348266602, "kl": 0.0721435546875, "learning_rate": 5.487499999999999e-07, "loss": 0.0007, "reward": 3.967429041862488, "reward_std": 0.004718436859548092, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.967721164226532, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997079372406006, "step": 362 }, { "completion_length": 201.90625, "epoch": 1.16, "grad_norm": 1.349228858947754, "kl": 0.0635986328125, "learning_rate": 5.474999999999999e-07, "loss": 0.0006, "reward": 3.968218684196472, "reward_std": 0.004579245578497648, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9686298072338104, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999588817358017, "step": 363 }, { "completion_length": 222.25, "epoch": 1.1632, "grad_norm": 8.183592796325684, "kl": 0.7177734375, "learning_rate": 5.4625e-07, "loss": 0.0072, "reward": 3.8565011024475098, "reward_std": 0.14647854026407003, "rewards/answer_entity_reward": 0.9628739356994629, "rewards/answer_wer_reward": 0.897028774023056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9965982735157013, "step": 364 }, { "completion_length": 203.875, "epoch": 1.1663999999999999, "grad_norm": 2.1804592609405518, "kl": 0.07666015625, "learning_rate": 5.45e-07, "loss": 0.0008, "reward": 3.9330880641937256, "reward_std": 0.023633791133761406, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9594465494155884, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9808530211448669, "step": 365 }, { "completion_length": 187.53125, "epoch": 1.1696, "grad_norm": 0.952870786190033, "kl": 0.068603515625, "learning_rate": 5.4375e-07, "loss": 0.0007, "reward": 3.906123399734497, "reward_std": 0.02216299483552575, "rewards/answer_entity_reward": 0.9882478415966034, "rewards/answer_wer_reward": 0.9373133480548859, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9805622696876526, "step": 366 }, { "completion_length": 180.28125, "epoch": 1.1728, "grad_norm": 1.6601589918136597, "kl": 0.069091796875, "learning_rate": 5.425e-07, "loss": 0.0007, "reward": 3.9451587200164795, "reward_std": 0.01368240499868989, "rewards/answer_entity_reward": 0.9923513829708099, "rewards/answer_wer_reward": 0.9530614018440247, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997459352016449, "step": 367 }, { "completion_length": 207.5625, "epoch": 1.176, "grad_norm": 2.0661466121673584, "kl": 0.142578125, "learning_rate": 5.4125e-07, "loss": 0.0014, "reward": 3.9405598640441895, "reward_std": 0.009340570773929358, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9443033933639526, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962564706802368, "step": 368 }, { "completion_length": 193.4375, "epoch": 1.1792, "grad_norm": 2.3376078605651855, "kl": 0.0548095703125, "learning_rate": 5.4e-07, "loss": 0.0005, "reward": 3.9724533557891846, "reward_std": 0.007678399793803692, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9739435911178589, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985099136829376, "step": 369 }, { "completion_length": 244.9375, "epoch": 1.1824, "grad_norm": 8.994063377380371, "kl": 0.067138671875, "learning_rate": 5.387499999999999e-07, "loss": 0.0007, "reward": 3.8642784357070923, "reward_std": 0.015206838492304087, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9453278481960297, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9217914342880249, "step": 370 }, { "completion_length": 223.5, "epoch": 1.1856, "grad_norm": 0.7140876054763794, "kl": 0.0628662109375, "learning_rate": 5.374999999999999e-07, "loss": 0.0006, "reward": 3.9566755294799805, "reward_std": 0.008438330609351397, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9571858644485474, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994895756244659, "step": 371 }, { "completion_length": 236.09375, "epoch": 1.1888, "grad_norm": 5.422008514404297, "kl": 0.072021484375, "learning_rate": 5.3625e-07, "loss": 0.0007, "reward": 3.9092832803726196, "reward_std": 0.02735153865069151, "rewards/answer_entity_reward": 0.9869465231895447, "rewards/answer_wer_reward": 0.9258767068386078, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9964599907398224, "step": 372 }, { "completion_length": 215.90625, "epoch": 1.192, "grad_norm": 2.5449435710906982, "kl": 0.0655517578125, "learning_rate": 5.35e-07, "loss": 0.0007, "reward": 3.8726375102996826, "reward_std": 0.15768051333725452, "rewards/answer_entity_reward": 0.991346150636673, "rewards/answer_wer_reward": 0.9473030865192413, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9652382135391235, "step": 373 }, { "completion_length": 221.09375, "epoch": 1.1952, "grad_norm": 1.3450181484222412, "kl": 0.0499267578125, "learning_rate": 5.3375e-07, "loss": 0.0005, "reward": 3.945889711380005, "reward_std": 0.021359253441914916, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9733871817588806, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9725023210048676, "step": 374 }, { "completion_length": 208.03125, "epoch": 1.1984, "grad_norm": 1.1699227094650269, "kl": 0.067626953125, "learning_rate": 5.325e-07, "loss": 0.0007, "reward": 3.951171040534973, "reward_std": 0.008666176348924637, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9543131291866302, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992617964744568, "step": 375 }, { "completion_length": 253.28125, "epoch": 1.2016, "grad_norm": 2.287163496017456, "kl": 0.0572509765625, "learning_rate": 5.3125e-07, "loss": 0.0006, "reward": 3.9154282808303833, "reward_std": 0.04354940680786967, "rewards/answer_entity_reward": 0.9888257682323456, "rewards/answer_wer_reward": 0.9271413683891296, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994612038135529, "step": 376 }, { "completion_length": 187.21875, "epoch": 1.2048, "grad_norm": 1.3305357694625854, "kl": 0.046142578125, "learning_rate": 5.3e-07, "loss": 0.0005, "reward": 3.9359636306762695, "reward_std": 0.00542741478420794, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9541498124599457, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.981813907623291, "step": 377 }, { "completion_length": 224.125, "epoch": 1.208, "grad_norm": 10.12941837310791, "kl": 0.06201171875, "learning_rate": 5.2875e-07, "loss": 0.0006, "reward": 3.9541337490081787, "reward_std": 0.013694523833692074, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9624313712120056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9917024075984955, "step": 378 }, { "completion_length": 158.96875, "epoch": 1.2112, "grad_norm": 1.3805967569351196, "kl": 0.05859375, "learning_rate": 5.274999999999999e-07, "loss": 0.0006, "reward": 3.947017788887024, "reward_std": 0.02097574481740594, "rewards/answer_entity_reward": 0.9902146458625793, "rewards/answer_wer_reward": 0.961486428976059, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9953167736530304, "step": 379 }, { "completion_length": 250.40625, "epoch": 1.2144, "grad_norm": 1.2120996713638306, "kl": 0.044921875, "learning_rate": 5.262499999999999e-07, "loss": 0.0004, "reward": 3.918868899345398, "reward_std": 0.021801823284476995, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9251189529895782, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 380 }, { "completion_length": 211.34375, "epoch": 1.2176, "grad_norm": 2.19063138961792, "kl": 0.078369140625, "learning_rate": 5.25e-07, "loss": 0.0008, "reward": 3.8982889652252197, "reward_std": 0.02524574287235737, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9512019455432892, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.947086900472641, "step": 381 }, { "completion_length": 241.28125, "epoch": 1.2208, "grad_norm": 1.619989275932312, "kl": 0.05615234375, "learning_rate": 5.237500000000001e-07, "loss": 0.0006, "reward": 3.9471057653427124, "reward_std": 0.013869246933609247, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9526203572750092, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 382 }, { "completion_length": 244.875, "epoch": 1.224, "grad_norm": 0.8697032928466797, "kl": 0.061279296875, "learning_rate": 5.225e-07, "loss": 0.0006, "reward": 3.9235615730285645, "reward_std": 0.015196615364402533, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9275480508804321, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998417317867279, "step": 383 }, { "completion_length": 191.875, "epoch": 1.2272, "grad_norm": 5.2052154541015625, "kl": 0.06884765625, "learning_rate": 5.2125e-07, "loss": 0.0007, "reward": 3.934178948402405, "reward_std": 0.024661258328706026, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9814408719539642, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9527381658554077, "step": 384 }, { "completion_length": 218.15625, "epoch": 1.2304, "grad_norm": 1.1718415021896362, "kl": 0.105224609375, "learning_rate": 5.2e-07, "loss": 0.0011, "reward": 3.8538546562194824, "reward_std": 0.013242242857813835, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9431050419807434, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9135904610157013, "step": 385 }, { "completion_length": 167.59375, "epoch": 1.2336, "grad_norm": 1.8933672904968262, "kl": 0.0555419921875, "learning_rate": 5.1875e-07, "loss": 0.0006, "reward": 3.942023754119873, "reward_std": 0.04039308475330472, "rewards/answer_entity_reward": 0.9895833432674408, "rewards/answer_wer_reward": 0.9561411142349243, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9962993562221527, "step": 386 }, { "completion_length": 181.1875, "epoch": 1.2368000000000001, "grad_norm": 1.132387399673462, "kl": 0.134033203125, "learning_rate": 5.174999999999999e-07, "loss": 0.0013, "reward": 3.883729100227356, "reward_std": 0.006107622524723411, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9661928117275238, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9175363183021545, "step": 387 }, { "completion_length": 245.78125, "epoch": 1.24, "grad_norm": 1.5286246538162231, "kl": 0.0439453125, "learning_rate": 5.162499999999999e-07, "loss": 0.0004, "reward": 3.9444308280944824, "reward_std": 0.017588268965482712, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.951177716255188, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.993253082036972, "step": 388 }, { "completion_length": 214.5, "epoch": 1.2432, "grad_norm": 4.535660266876221, "kl": 0.4443359375, "learning_rate": 5.149999999999999e-07, "loss": 0.0045, "reward": 3.9712672233581543, "reward_std": 0.017703328281641006, "rewards/answer_entity_reward": 0.9923513829708099, "rewards/answer_wer_reward": 0.9789157509803772, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 389 }, { "completion_length": 237.71875, "epoch": 1.2464, "grad_norm": 1.100642204284668, "kl": 0.0443115234375, "learning_rate": 5.137500000000001e-07, "loss": 0.0004, "reward": 3.9504618644714355, "reward_std": 0.01717091863974929, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9553267061710358, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995993673801422, "step": 390 }, { "completion_length": 220.8125, "epoch": 1.2496, "grad_norm": 1.8153222799301147, "kl": 0.050537109375, "learning_rate": 5.125e-07, "loss": 0.0005, "reward": 3.954966902732849, "reward_std": 0.023467861115932465, "rewards/answer_entity_reward": 0.9909090995788574, "rewards/answer_wer_reward": 0.9640579223632812, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 391 }, { "completion_length": 215.75, "epoch": 1.2528000000000001, "grad_norm": 1.3607189655303955, "kl": 0.0562744140625, "learning_rate": 5.1125e-07, "loss": 0.0006, "reward": 3.947434425354004, "reward_std": 0.01746128685772419, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9514667093753815, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9959677457809448, "step": 392 }, { "completion_length": 140.75, "epoch": 1.256, "grad_norm": 3.343885898590088, "kl": 0.064208984375, "learning_rate": 5.1e-07, "loss": 0.0006, "reward": 3.9535528421401978, "reward_std": 0.016743881278671324, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9615642726421356, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9948294758796692, "step": 393 }, { "completion_length": 225.09375, "epoch": 1.2591999999999999, "grad_norm": 7.593709468841553, "kl": 0.0628662109375, "learning_rate": 5.0875e-07, "loss": 0.0006, "reward": 3.9337310791015625, "reward_std": 0.01689326297491789, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9342745840549469, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999456524848938, "step": 394 }, { "completion_length": 195.15625, "epoch": 1.2624, "grad_norm": 1.6891230344772339, "kl": 0.085693359375, "learning_rate": 5.074999999999999e-07, "loss": 0.0009, "reward": 3.836549401283264, "reward_std": 0.005918985931202769, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8378467857837677, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987025856971741, "step": 395 }, { "completion_length": 218.71875, "epoch": 1.2656, "grad_norm": 2.0911483764648438, "kl": 0.057373046875, "learning_rate": 5.062499999999999e-07, "loss": 0.0006, "reward": 3.930617570877075, "reward_std": 0.014833949506282806, "rewards/answer_entity_reward": 0.9881944358348846, "rewards/answer_wer_reward": 0.9436545968055725, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987684786319733, "step": 396 }, { "completion_length": 244.4375, "epoch": 1.2688, "grad_norm": 0.6879564523696899, "kl": 0.05810546875, "learning_rate": 5.049999999999999e-07, "loss": 0.0006, "reward": 3.9541516304016113, "reward_std": 0.014136601239442825, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9578942954540253, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983407258987427, "step": 397 }, { "completion_length": 171.875, "epoch": 1.272, "grad_norm": 1.0838266611099243, "kl": 0.063232421875, "learning_rate": 5.0375e-07, "loss": 0.0006, "reward": 3.961939811706543, "reward_std": 0.007458951906301081, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9619399607181549, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 398 }, { "completion_length": 224.53125, "epoch": 1.2752, "grad_norm": 2.0163495540618896, "kl": 0.072265625, "learning_rate": 5.025e-07, "loss": 0.0007, "reward": 3.964465856552124, "reward_std": 0.014243231620639563, "rewards/answer_entity_reward": 0.9957579076290131, "rewards/answer_wer_reward": 0.9695450067520142, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991629421710968, "step": 399 }, { "completion_length": 181.15625, "epoch": 1.2784, "grad_norm": 0.38955262303352356, "kl": 0.0517578125, "learning_rate": 5.0125e-07, "loss": 0.0005, "reward": 3.9557042121887207, "reward_std": 0.005372793646529317, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9557042419910431, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 400 }, { "completion_length": 208.3125, "epoch": 1.2816, "grad_norm": 3.9781861305236816, "kl": 0.0716552734375, "learning_rate": 5e-07, "loss": 0.0007, "reward": 3.8667571544647217, "reward_std": 0.015388892497867346, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9693593382835388, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8998015820980072, "step": 401 }, { "completion_length": 204.375, "epoch": 1.2848, "grad_norm": 1.1456544399261475, "kl": 0.103515625, "learning_rate": 4.9875e-07, "loss": 0.001, "reward": 3.956982374191284, "reward_std": 0.007417811662890017, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9575175940990448, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994648098945618, "step": 402 }, { "completion_length": 216.1875, "epoch": 1.288, "grad_norm": 1.1664754152297974, "kl": 0.06396484375, "learning_rate": 4.975e-07, "loss": 0.0006, "reward": 3.8699432611465454, "reward_std": 0.02020346373319626, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9359997510910034, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.936026930809021, "step": 403 }, { "completion_length": 253.09375, "epoch": 1.2912, "grad_norm": 0.8103052377700806, "kl": 0.0635986328125, "learning_rate": 4.9625e-07, "loss": 0.0006, "reward": 3.937591075897217, "reward_std": 0.018769525457173586, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9415221214294434, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989098608493805, "step": 404 }, { "completion_length": 215.0625, "epoch": 1.2944, "grad_norm": 1.4777588844299316, "kl": 0.068603515625, "learning_rate": 4.95e-07, "loss": 0.0007, "reward": 3.949555516242981, "reward_std": 0.009917980059981346, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.949555516242981, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 405 }, { "completion_length": 202.65625, "epoch": 1.2976, "grad_norm": 0.7443984150886536, "kl": 0.106689453125, "learning_rate": 4.9375e-07, "loss": 0.0011, "reward": 3.7686209678649902, "reward_std": 0.011178261134773493, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9451543390750885, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8234666287899017, "step": 406 }, { "completion_length": 189.21875, "epoch": 1.3008, "grad_norm": 0.9547207951545715, "kl": 0.077392578125, "learning_rate": 4.924999999999999e-07, "loss": 0.0008, "reward": 3.9593130350112915, "reward_std": 0.006907296134158969, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9597530961036682, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995598495006561, "step": 407 }, { "completion_length": 208.09375, "epoch": 1.304, "grad_norm": 0.8897162079811096, "kl": 0.0604248046875, "learning_rate": 4.9125e-07, "loss": 0.0006, "reward": 3.9529693126678467, "reward_std": 0.0038969104643911123, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9714880287647247, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9814814925193787, "step": 408 }, { "completion_length": 199.78125, "epoch": 1.3072, "grad_norm": 1.1945850849151611, "kl": 0.056640625, "learning_rate": 4.9e-07, "loss": 0.0006, "reward": 3.951330065727234, "reward_std": 0.0060545760206878185, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9513299763202667, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 409 }, { "completion_length": 176.5, "epoch": 1.3104, "grad_norm": 1.5717577934265137, "kl": 0.085205078125, "learning_rate": 4.8875e-07, "loss": 0.0009, "reward": 3.9731186628341675, "reward_std": 0.009643410099670291, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9749214053153992, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9981971085071564, "step": 410 }, { "completion_length": 209.25, "epoch": 1.3136, "grad_norm": 1.7357205152511597, "kl": 0.05517578125, "learning_rate": 4.875e-07, "loss": 0.0006, "reward": 3.9563956260681152, "reward_std": 0.013218061067163944, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9563955068588257, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 411 }, { "completion_length": 233.28125, "epoch": 1.3168, "grad_norm": 3.6717629432678223, "kl": 0.070068359375, "learning_rate": 4.8625e-07, "loss": 0.0007, "reward": 3.955284357070923, "reward_std": 0.02536593284457922, "rewards/answer_entity_reward": 0.9871794581413269, "rewards/answer_wer_reward": 0.968104898929596, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 412 }, { "completion_length": 205.125, "epoch": 1.32, "grad_norm": 1.0453362464904785, "kl": 0.04473876953125, "learning_rate": 4.85e-07, "loss": 0.0005, "reward": 3.9507482051849365, "reward_std": 0.005348393111489713, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9646830558776855, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9860649704933167, "step": 413 }, { "completion_length": 197.71875, "epoch": 1.3232, "grad_norm": 10.967116355895996, "kl": 0.4443359375, "learning_rate": 4.8375e-07, "loss": 0.0044, "reward": 3.958775758743286, "reward_std": 0.01469768793322146, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9608590006828308, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 414 }, { "completion_length": 240.75, "epoch": 1.3264, "grad_norm": 1.771857738494873, "kl": 0.056884765625, "learning_rate": 4.824999999999999e-07, "loss": 0.0006, "reward": 3.9307100772857666, "reward_std": 0.01262786379083991, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9445989429950714, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 415 }, { "completion_length": 184.9375, "epoch": 1.3296000000000001, "grad_norm": 0.5742409825325012, "kl": 0.081787109375, "learning_rate": 4.812499999999999e-07, "loss": 0.0008, "reward": 3.965754270553589, "reward_std": 0.003614649409428239, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9657542705535889, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 416 }, { "completion_length": 173.90625, "epoch": 1.3328, "grad_norm": 1.4033151865005493, "kl": 0.074462890625, "learning_rate": 4.8e-07, "loss": 0.0007, "reward": 3.9543731212615967, "reward_std": 0.006403392762877047, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9728915691375732, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9814814925193787, "step": 417 }, { "completion_length": 224.0625, "epoch": 1.336, "grad_norm": 1.0427494049072266, "kl": 0.0576171875, "learning_rate": 4.7875e-07, "loss": 0.0006, "reward": 3.965309262275696, "reward_std": 0.011804148089140654, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9667502641677856, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985590577125549, "step": 418 }, { "completion_length": 228.53125, "epoch": 1.3392, "grad_norm": 1.1613246202468872, "kl": 0.06591796875, "learning_rate": 4.775e-07, "loss": 0.0007, "reward": 3.948023200035095, "reward_std": 0.012544674333184958, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9482711553573608, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997519850730896, "step": 419 }, { "completion_length": 197.34375, "epoch": 1.3424, "grad_norm": 0.8760451674461365, "kl": 0.072265625, "learning_rate": 4.7625e-07, "loss": 0.0007, "reward": 3.938261866569519, "reward_std": 0.004269103752449155, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9496253132820129, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 420 }, { "completion_length": 225.5, "epoch": 1.3456000000000001, "grad_norm": 2.4799275398254395, "kl": 0.1290283203125, "learning_rate": 4.7499999999999995e-07, "loss": 0.0013, "reward": 3.9379055500030518, "reward_std": 0.008256121072918177, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9677021205425262, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9702034592628479, "step": 421 }, { "completion_length": 209.3125, "epoch": 1.3488, "grad_norm": 0.6864319443702698, "kl": 0.0604248046875, "learning_rate": 4.7374999999999996e-07, "loss": 0.0006, "reward": 3.9712308645248413, "reward_std": 0.0032088530133478343, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9722216725349426, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990091919898987, "step": 422 }, { "completion_length": 187.5625, "epoch": 1.3519999999999999, "grad_norm": 1.9412598609924316, "kl": 0.06787109375, "learning_rate": 4.725e-07, "loss": 0.0007, "reward": 3.947052240371704, "reward_std": 0.014190569054335356, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9569187760353088, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9925373196601868, "step": 423 }, { "completion_length": 225.59375, "epoch": 1.3552, "grad_norm": 1.4452259540557861, "kl": 0.09619140625, "learning_rate": 4.7125e-07, "loss": 0.001, "reward": 3.939266562461853, "reward_std": 0.012853712774813175, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9556125402450562, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9860578179359436, "step": 424 }, { "completion_length": 261.0, "epoch": 1.3584, "grad_norm": 0.9420474171638489, "kl": 0.054931640625, "learning_rate": 4.6999999999999995e-07, "loss": 0.0006, "reward": 3.939144253730774, "reward_std": 0.00785708031617105, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9474774897098541, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 425 }, { "completion_length": 243.1875, "epoch": 1.3616, "grad_norm": 1.1776657104492188, "kl": 0.078369140625, "learning_rate": 4.6874999999999996e-07, "loss": 0.0008, "reward": 3.928247570991516, "reward_std": 0.02044426929205656, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9401307106018066, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9929245114326477, "step": 426 }, { "completion_length": 204.4375, "epoch": 1.3648, "grad_norm": 1.6268881559371948, "kl": 0.073974609375, "learning_rate": 4.675e-07, "loss": 0.0007, "reward": 3.9266600608825684, "reward_std": 0.006853222264908254, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9440751671791077, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9825847446918488, "step": 427 }, { "completion_length": 232.0625, "epoch": 1.3679999999999999, "grad_norm": 34.5067138671875, "kl": 0.755859375, "learning_rate": 4.6625e-07, "loss": 0.0076, "reward": 3.844196319580078, "reward_std": 0.04641831433400512, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9399954378604889, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9042008221149445, "step": 428 }, { "completion_length": 253.21875, "epoch": 1.3712, "grad_norm": 1.4444057941436768, "kl": 0.0673828125, "learning_rate": 4.65e-07, "loss": 0.0007, "reward": 3.963658928871155, "reward_std": 0.009957378264516592, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9636587798595428, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 429 }, { "completion_length": 241.875, "epoch": 1.3744, "grad_norm": 0.9258720278739929, "kl": 0.0687255859375, "learning_rate": 4.6374999999999995e-07, "loss": 0.0007, "reward": 3.9617748260498047, "reward_std": 0.013449362479150295, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9652469456195831, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 430 }, { "completion_length": 204.96875, "epoch": 1.3776, "grad_norm": 1.6328847408294678, "kl": 0.0863037109375, "learning_rate": 4.625e-07, "loss": 0.0009, "reward": 3.8922348022460938, "reward_std": 0.007920752046629786, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9477903544902802, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9444444477558136, "step": 431 }, { "completion_length": 222.375, "epoch": 1.3808, "grad_norm": 2.479295492172241, "kl": 0.0732421875, "learning_rate": 4.6125e-07, "loss": 0.0007, "reward": 3.9312403202056885, "reward_std": 0.02260798867791891, "rewards/answer_entity_reward": 0.9941239356994629, "rewards/answer_wer_reward": 0.937116414308548, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 432 }, { "completion_length": 203.28125, "epoch": 1.384, "grad_norm": 2.6669020652770996, "kl": 0.0631103515625, "learning_rate": 4.6e-07, "loss": 0.0006, "reward": 3.938199043273926, "reward_std": 0.014480275101959705, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9408722817897797, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997305870056152, "step": 433 }, { "completion_length": 255.1875, "epoch": 1.3872, "grad_norm": 1.4742846488952637, "kl": 0.057373046875, "learning_rate": 4.5874999999999995e-07, "loss": 0.0006, "reward": 3.9382212162017822, "reward_std": 0.01696724910289049, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.94236820936203, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9982567131519318, "step": 434 }, { "completion_length": 211.15625, "epoch": 1.3904, "grad_norm": 1.795336365699768, "kl": 0.0667724609375, "learning_rate": 4.575e-07, "loss": 0.0007, "reward": 3.919999361038208, "reward_std": 0.028288409113883972, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9725300371646881, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9474693238735199, "step": 435 }, { "completion_length": 208.65625, "epoch": 1.3936, "grad_norm": 2.1704065799713135, "kl": 0.095947265625, "learning_rate": 4.5624999999999997e-07, "loss": 0.001, "reward": 3.857280731201172, "reward_std": 0.2144411588087678, "rewards/answer_entity_reward": 0.9618055820465088, "rewards/answer_wer_reward": 0.949828714132309, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9768964946269989, "step": 436 }, { "completion_length": 194.9375, "epoch": 1.3968, "grad_norm": 3.8814220428466797, "kl": 0.082275390625, "learning_rate": 4.55e-07, "loss": 0.0008, "reward": 3.941987633705139, "reward_std": 0.015088737476617098, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.94545978307724, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 437 }, { "completion_length": 217.5, "epoch": 1.4, "grad_norm": 1.3024876117706299, "kl": 0.0389404296875, "learning_rate": 4.5374999999999994e-07, "loss": 0.0004, "reward": 3.950901508331299, "reward_std": 0.008365771966055036, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9589883685112, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9919130802154541, "step": 438 }, { "completion_length": 159.03125, "epoch": 1.4032, "grad_norm": 0.272270530462265, "kl": 0.0396728515625, "learning_rate": 4.525e-07, "loss": 0.0004, "reward": 3.9221452474594116, "reward_std": 0.0014547138416673988, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9875754117965698, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9429032206535339, "step": 439 }, { "completion_length": 200.28125, "epoch": 1.4064, "grad_norm": 5.4578399658203125, "kl": 0.0828857421875, "learning_rate": 4.5124999999999997e-07, "loss": 0.0008, "reward": 3.9259976148605347, "reward_std": 0.014895747415721416, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9536634683609009, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9758064448833466, "step": 440 }, { "completion_length": 229.1875, "epoch": 1.4096, "grad_norm": 0.6568198800086975, "kl": 0.067138671875, "learning_rate": 4.5e-07, "loss": 0.0007, "reward": 3.9455034732818604, "reward_std": 0.011267438880167902, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9479073286056519, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 441 }, { "completion_length": 199.3125, "epoch": 1.4128, "grad_norm": 1.0056089162826538, "kl": 0.0567626953125, "learning_rate": 4.4874999999999994e-07, "loss": 0.0006, "reward": 3.9622955322265625, "reward_std": 0.008431105175986886, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9622955024242401, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 442 }, { "completion_length": 212.375, "epoch": 1.416, "grad_norm": 0.7950085997581482, "kl": 0.051025390625, "learning_rate": 4.475e-07, "loss": 0.0005, "reward": 3.9517738819122314, "reward_std": 0.03710572328418493, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9708344638347626, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9809393286705017, "step": 443 }, { "completion_length": 227.71875, "epoch": 1.4192, "grad_norm": 0.8971355557441711, "kl": 0.0460205078125, "learning_rate": 4.4624999999999996e-07, "loss": 0.0005, "reward": 3.980188012123108, "reward_std": 0.00624943315051496, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9801879525184631, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 444 }, { "completion_length": 226.78125, "epoch": 1.4224, "grad_norm": 2.114032745361328, "kl": 0.0791015625, "learning_rate": 4.45e-07, "loss": 0.0008, "reward": 3.879195213317871, "reward_std": 0.03936337144114077, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9502902626991272, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9307432472705841, "step": 445 }, { "completion_length": 227.875, "epoch": 1.4256, "grad_norm": 1.0065126419067383, "kl": 0.083984375, "learning_rate": 4.4374999999999993e-07, "loss": 0.0009, "reward": 3.939829707145691, "reward_std": 0.013783617876470089, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9398296475410461, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 446 }, { "completion_length": 202.6875, "epoch": 1.4288, "grad_norm": 1.7568168640136719, "kl": 0.0418701171875, "learning_rate": 4.425e-07, "loss": 0.0004, "reward": 3.943518042564392, "reward_std": 0.016201740596443415, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9520406126976013, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 447 }, { "completion_length": 172.8125, "epoch": 1.432, "grad_norm": 1.0688170194625854, "kl": 0.0494384765625, "learning_rate": 4.4124999999999996e-07, "loss": 0.0005, "reward": 3.7196162939071655, "reward_std": 0.006592530757188797, "rewards/answer_entity_reward": 0.8677884340286255, "rewards/answer_wer_reward": 0.8742637634277344, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9775640964508057, "step": 448 }, { "completion_length": 168.59375, "epoch": 1.4352, "grad_norm": 1.7712996006011963, "kl": 0.0435791015625, "learning_rate": 4.3999999999999997e-07, "loss": 0.0004, "reward": 3.8386131525039673, "reward_std": 0.011066187638789415, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8386130630970001, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 449 }, { "completion_length": 197.40625, "epoch": 1.4384000000000001, "grad_norm": 0.8872710466384888, "kl": 0.058349609375, "learning_rate": 4.3874999999999993e-07, "loss": 0.0006, "reward": 3.7988067865371704, "reward_std": 0.03104257071390748, "rewards/answer_entity_reward": 0.9734432399272919, "rewards/answer_wer_reward": 0.8270545899868011, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983089566230774, "step": 450 }, { "completion_length": 178.40625, "epoch": 1.4416, "grad_norm": 6.044506072998047, "kl": 0.0657958984375, "learning_rate": 4.375e-07, "loss": 0.0007, "reward": 3.9419833421707153, "reward_std": 0.021156481467187405, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9676234424114227, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9778319895267487, "step": 451 }, { "completion_length": 201.8125, "epoch": 1.4447999999999999, "grad_norm": 0.7943681478500366, "kl": 0.0511474609375, "learning_rate": 4.3625e-07, "loss": 0.0005, "reward": 3.956661581993103, "reward_std": 0.007463611662387848, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9675310552120209, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.989130437374115, "step": 452 }, { "completion_length": 219.03125, "epoch": 1.448, "grad_norm": 1.069403052330017, "kl": 0.0570068359375, "learning_rate": 4.3499999999999996e-07, "loss": 0.0006, "reward": 3.9562065601348877, "reward_std": 0.011006501503288746, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9564736187458038, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997329115867615, "step": 453 }, { "completion_length": 206.8125, "epoch": 1.4512, "grad_norm": 1.0987451076507568, "kl": 0.0611572265625, "learning_rate": 4.3375000000000003e-07, "loss": 0.0006, "reward": 3.9423000812530518, "reward_std": 0.01284673297777772, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9693345129489899, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9758064448833466, "step": 454 }, { "completion_length": 211.375, "epoch": 1.4544000000000001, "grad_norm": 3.5896220207214355, "kl": 0.065673828125, "learning_rate": 4.325e-07, "loss": 0.0007, "reward": 3.961179494857788, "reward_std": 0.012267218437045813, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9640858769416809, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99709352850914, "step": 455 }, { "completion_length": 238.8125, "epoch": 1.4576, "grad_norm": 0.625076174736023, "kl": 0.0399169921875, "learning_rate": 4.3125e-07, "loss": 0.0004, "reward": 3.9661307334899902, "reward_std": 0.013454007916152477, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9702973961830139, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 456 }, { "completion_length": 206.9375, "epoch": 1.4607999999999999, "grad_norm": 0.6369054317474365, "kl": 0.059814453125, "learning_rate": 4.2999999999999996e-07, "loss": 0.0006, "reward": 3.9704521894454956, "reward_std": 0.006653362594079226, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9733729660511017, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970792829990387, "step": 457 }, { "completion_length": 199.625, "epoch": 1.464, "grad_norm": 1.2201271057128906, "kl": 0.083251953125, "learning_rate": 4.2875e-07, "loss": 0.0008, "reward": 3.967539429664612, "reward_std": 0.012669337913393974, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9747509360313416, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 458 }, { "completion_length": 220.0, "epoch": 1.4672, "grad_norm": 11.574130058288574, "kl": 0.2125244140625, "learning_rate": 4.275e-07, "loss": 0.0021, "reward": 3.9735381603240967, "reward_std": 0.0033322512172162533, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9737901091575623, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997479915618896, "step": 459 }, { "completion_length": 181.0625, "epoch": 1.4704, "grad_norm": 1.050900936126709, "kl": 0.0736083984375, "learning_rate": 4.2625e-07, "loss": 0.0007, "reward": 3.9467893838882446, "reward_std": 0.00827464903704822, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9717220067977905, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9750673770904541, "step": 460 }, { "completion_length": 207.4375, "epoch": 1.4736, "grad_norm": 1.25560462474823, "kl": 0.07861328125, "learning_rate": 4.2499999999999995e-07, "loss": 0.0008, "reward": 3.885838508605957, "reward_std": 0.012714273296296597, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9541967213153839, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9316417276859283, "step": 461 }, { "completion_length": 205.125, "epoch": 1.4768, "grad_norm": 2.1235697269439697, "kl": 0.064208984375, "learning_rate": 4.2375e-07, "loss": 0.0006, "reward": 3.952380895614624, "reward_std": 0.013835938647389412, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9538231492042542, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985576868057251, "step": 462 }, { "completion_length": 229.25, "epoch": 1.48, "grad_norm": 3.838672399520874, "kl": 0.09619140625, "learning_rate": 4.225e-07, "loss": 0.001, "reward": 3.9537363052368164, "reward_std": 0.014287983998656273, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9542993903160095, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994369447231293, "step": 463 }, { "completion_length": 224.71875, "epoch": 1.4832, "grad_norm": 0.7103460431098938, "kl": 0.058837890625, "learning_rate": 4.2125e-07, "loss": 0.0006, "reward": 3.9675354957580566, "reward_std": 0.013558031525462866, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9719286262989044, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997735619544983, "step": 464 }, { "completion_length": 147.625, "epoch": 1.4864, "grad_norm": 2.865051031112671, "kl": 0.099853515625, "learning_rate": 4.1999999999999995e-07, "loss": 0.001, "reward": 3.958040475845337, "reward_std": 0.00422883324790746, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9780724942684174, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9799679517745972, "step": 465 }, { "completion_length": 250.625, "epoch": 1.4896, "grad_norm": 1.115330696105957, "kl": 0.062744140625, "learning_rate": 4.1875e-07, "loss": 0.0006, "reward": 3.925747871398926, "reward_std": 0.01510471198707819, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9361644089221954, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 466 }, { "completion_length": 181.28125, "epoch": 1.4928, "grad_norm": 0.8615334033966064, "kl": 0.095703125, "learning_rate": 4.1749999999999997e-07, "loss": 0.001, "reward": 3.9389272928237915, "reward_std": 0.009215079713612795, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.947648286819458, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9912790656089783, "step": 467 }, { "completion_length": 201.1875, "epoch": 1.496, "grad_norm": 0.8399393558502197, "kl": 0.067138671875, "learning_rate": 4.1625e-07, "loss": 0.0007, "reward": 3.9645369052886963, "reward_std": 0.005296911578625441, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9660760462284088, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984607994556427, "step": 468 }, { "completion_length": 181.53125, "epoch": 1.4992, "grad_norm": 1.692581057548523, "kl": 0.116455078125, "learning_rate": 4.1499999999999994e-07, "loss": 0.0012, "reward": 3.91774320602417, "reward_std": 0.007862454745918512, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9589084982872009, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.958834707736969, "step": 469 }, { "completion_length": 208.375, "epoch": 1.5024, "grad_norm": 1.0280638933181763, "kl": 0.0733642578125, "learning_rate": 4.1375e-07, "loss": 0.0007, "reward": 3.963421940803528, "reward_std": 0.010574808926321566, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9634219110012054, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 470 }, { "completion_length": 194.375, "epoch": 1.5056, "grad_norm": 0.9556618332862854, "kl": 0.04541015625, "learning_rate": 4.1249999999999997e-07, "loss": 0.0005, "reward": 3.9483964443206787, "reward_std": 0.0071337176486849785, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9483965635299683, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 471 }, { "completion_length": 219.90625, "epoch": 1.5088, "grad_norm": 8.583925247192383, "kl": 0.057373046875, "learning_rate": 4.1125e-07, "loss": 0.0006, "reward": 3.9298593997955322, "reward_std": 0.010127428220584989, "rewards/answer_entity_reward": 0.9764957129955292, "rewards/answer_wer_reward": 0.9549680352210999, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983957409858704, "step": 472 }, { "completion_length": 169.71875, "epoch": 1.512, "grad_norm": 1.0506740808486938, "kl": 0.0703125, "learning_rate": 4.0999999999999994e-07, "loss": 0.0007, "reward": 3.9712518453598022, "reward_std": 0.004299861378967762, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9712517857551575, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 473 }, { "completion_length": 254.0, "epoch": 1.5152, "grad_norm": 1.2391588687896729, "kl": 0.055419921875, "learning_rate": 4.0875e-07, "loss": 0.0006, "reward": 3.9443717002868652, "reward_std": 0.007719833869487047, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9459867179393768, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983848929405212, "step": 474 }, { "completion_length": 173.15625, "epoch": 1.5184, "grad_norm": 21.967166900634766, "kl": 0.0810546875, "learning_rate": 4.0749999999999996e-07, "loss": 0.0008, "reward": 3.892626404762268, "reward_std": 0.03193977475166321, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9627694487571716, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9372097849845886, "step": 475 }, { "completion_length": 177.96875, "epoch": 1.5215999999999998, "grad_norm": 2.125126838684082, "kl": 0.0814208984375, "learning_rate": 4.0625e-07, "loss": 0.0008, "reward": 3.957445502281189, "reward_std": 0.016827338375151157, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9618943929672241, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990234375, "step": 476 }, { "completion_length": 259.34375, "epoch": 1.5248, "grad_norm": 1.144234538078308, "kl": 0.0545654296875, "learning_rate": 4.05e-07, "loss": 0.0005, "reward": 3.9333302974700928, "reward_std": 0.015490441583096981, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9336776435375214, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999652773141861, "step": 477 }, { "completion_length": 223.84375, "epoch": 1.528, "grad_norm": 0.8379483222961426, "kl": 0.0653076171875, "learning_rate": 4.0375e-07, "loss": 0.0007, "reward": 3.9397594928741455, "reward_std": 0.006189712788909674, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9652985334396362, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.981813907623291, "step": 478 }, { "completion_length": 195.15625, "epoch": 1.5312000000000001, "grad_norm": 1.9627622365951538, "kl": 0.0709228515625, "learning_rate": 4.025e-07, "loss": 0.0007, "reward": 3.90268337726593, "reward_std": 0.022933244705200195, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9422430694103241, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9632812440395355, "step": 479 }, { "completion_length": 212.03125, "epoch": 1.5344, "grad_norm": 1.4353668689727783, "kl": 0.0572509765625, "learning_rate": 4.0124999999999997e-07, "loss": 0.0006, "reward": 3.955712080001831, "reward_std": 0.004905138397589326, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9653275012969971, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 480 }, { "completion_length": 238.125, "epoch": 1.5375999999999999, "grad_norm": 0.9400500059127808, "kl": 0.0516357421875, "learning_rate": 4e-07, "loss": 0.0005, "reward": 3.9561740159988403, "reward_std": 0.004761199816130102, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9657893478870392, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 481 }, { "completion_length": 197.125, "epoch": 1.5408, "grad_norm": 1.7909142971038818, "kl": 0.044677734375, "learning_rate": 3.9875e-07, "loss": 0.0004, "reward": 3.9649877548217773, "reward_std": 0.008824507240206003, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9712709188461304, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.993716835975647, "step": 482 }, { "completion_length": 247.28125, "epoch": 1.544, "grad_norm": 1.305432915687561, "kl": 0.0885009765625, "learning_rate": 3.975e-07, "loss": 0.0009, "reward": 3.9271016120910645, "reward_std": 0.010741112288087606, "rewards/answer_entity_reward": 0.9867424368858337, "rewards/answer_wer_reward": 0.9422920942306519, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980670213699341, "step": 483 }, { "completion_length": 183.71875, "epoch": 1.5472000000000001, "grad_norm": 1.2143511772155762, "kl": 0.083251953125, "learning_rate": 3.9624999999999996e-07, "loss": 0.0008, "reward": 3.961517810821533, "reward_std": 0.015109732514247298, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9659819006919861, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 484 }, { "completion_length": 190.96875, "epoch": 1.5504, "grad_norm": 1.3901034593582153, "kl": 0.0478515625, "learning_rate": 3.95e-07, "loss": 0.0005, "reward": 3.9620405435562134, "reward_std": 0.007438812637701631, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.962040513753891, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 485 }, { "completion_length": 236.71875, "epoch": 1.5535999999999999, "grad_norm": 1.005139946937561, "kl": 0.064697265625, "learning_rate": 3.9375e-07, "loss": 0.0007, "reward": 3.9681735038757324, "reward_std": 0.007598390802741051, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9703975021839142, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996141791343689, "step": 486 }, { "completion_length": 167.71875, "epoch": 1.5568, "grad_norm": 14.769695281982422, "kl": 0.088623046875, "learning_rate": 3.925e-07, "loss": 0.0009, "reward": 3.9402579069137573, "reward_std": 0.01711948262527585, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9504852592945099, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9897727370262146, "step": 487 }, { "completion_length": 245.59375, "epoch": 1.56, "grad_norm": 2.1311302185058594, "kl": 0.0643310546875, "learning_rate": 3.9124999999999996e-07, "loss": 0.0006, "reward": 3.965644121170044, "reward_std": 0.006802293471992016, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9664610624313354, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991829991340637, "step": 488 }, { "completion_length": 228.90625, "epoch": 1.5632000000000001, "grad_norm": 2.194638967514038, "kl": 0.07861328125, "learning_rate": 3.8999999999999997e-07, "loss": 0.0008, "reward": 3.940732479095459, "reward_std": 0.00845141801983118, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9496362805366516, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994294047355652, "step": 489 }, { "completion_length": 229.09375, "epoch": 1.5664, "grad_norm": 1.4338947534561157, "kl": 0.067138671875, "learning_rate": 3.8875e-07, "loss": 0.0007, "reward": 3.974826216697693, "reward_std": 0.008368036011233926, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9759277105331421, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988985061645508, "step": 490 }, { "completion_length": 147.1875, "epoch": 1.5695999999999999, "grad_norm": 0.9500789046287537, "kl": 0.055908203125, "learning_rate": 3.875e-07, "loss": 0.0006, "reward": 3.900749683380127, "reward_std": 0.004976645112037659, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.981389045715332, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9307242631912231, "step": 491 }, { "completion_length": 207.1875, "epoch": 1.5728, "grad_norm": 18.29888916015625, "kl": 0.0787353515625, "learning_rate": 3.8624999999999995e-07, "loss": 0.0008, "reward": 3.9231996536254883, "reward_std": 0.01712162047624588, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9278469979763031, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990289807319641, "step": 492 }, { "completion_length": 215.3125, "epoch": 1.576, "grad_norm": 2.524644613265991, "kl": 0.0682373046875, "learning_rate": 3.8499999999999997e-07, "loss": 0.0007, "reward": 3.9182220697402954, "reward_std": 0.028343133628368378, "rewards/answer_entity_reward": 0.9899839758872986, "rewards/answer_wer_reward": 0.9533904790878296, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9748475551605225, "step": 493 }, { "completion_length": 205.21875, "epoch": 1.5792000000000002, "grad_norm": 0.8041574954986572, "kl": 0.0572509765625, "learning_rate": 3.8375e-07, "loss": 0.0006, "reward": 3.9712276458740234, "reward_std": 0.006993145681917667, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9721719622612, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990555644035339, "step": 494 }, { "completion_length": 245.84375, "epoch": 1.5824, "grad_norm": 1.4723294973373413, "kl": 0.0518798828125, "learning_rate": 3.825e-07, "loss": 0.0005, "reward": 3.9171528816223145, "reward_std": 0.007540189428254962, "rewards/answer_entity_reward": 0.9707792401313782, "rewards/answer_wer_reward": 0.9463737607002258, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 495 }, { "completion_length": 191.1875, "epoch": 1.5856, "grad_norm": 5.778710842132568, "kl": 0.095703125, "learning_rate": 3.8124999999999995e-07, "loss": 0.001, "reward": 3.7989085912704468, "reward_std": 0.02309321239590645, "rewards/answer_entity_reward": 0.9837072491645813, "rewards/answer_wer_reward": 0.9482426345348358, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.866958737373352, "step": 496 }, { "completion_length": 164.375, "epoch": 1.5888, "grad_norm": 3.773331880569458, "kl": 0.0452880859375, "learning_rate": 3.7999999999999996e-07, "loss": 0.0005, "reward": 3.957179307937622, "reward_std": 0.03012340608984232, "rewards/answer_entity_reward": 0.995192289352417, "rewards/answer_wer_reward": 0.9724558889865875, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9895310997962952, "step": 497 }, { "completion_length": 190.34375, "epoch": 1.592, "grad_norm": 1.7698373794555664, "kl": 0.0579833984375, "learning_rate": 3.7875e-07, "loss": 0.0006, "reward": 3.9473685026168823, "reward_std": 0.009419793263077736, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9480363428592682, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993322789669037, "step": 498 }, { "completion_length": 223.03125, "epoch": 1.5952, "grad_norm": 1.197536587715149, "kl": 0.074462890625, "learning_rate": 3.775e-07, "loss": 0.0007, "reward": 3.9201695919036865, "reward_std": 0.012398123741149902, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9409077167510986, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9931506812572479, "step": 499 }, { "completion_length": 204.46875, "epoch": 1.5984, "grad_norm": 1.5246530771255493, "kl": 0.0849609375, "learning_rate": 3.7624999999999994e-07, "loss": 0.0008, "reward": 3.9556870460510254, "reward_std": 0.010473677422851324, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9580392241477966, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9976478517055511, "step": 500 }, { "completion_length": 230.0625, "epoch": 1.6016, "grad_norm": 1.1340093612670898, "kl": 0.10595703125, "learning_rate": 3.75e-07, "loss": 0.0011, "reward": 3.9659206867218018, "reward_std": 0.008191006258130074, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9659207165241241, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 501 }, { "completion_length": 185.15625, "epoch": 1.6048, "grad_norm": 1.2874914407730103, "kl": 0.045654296875, "learning_rate": 3.7375e-07, "loss": 0.0005, "reward": 3.9568817615509033, "reward_std": 0.011238863109610975, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9603540003299713, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 502 }, { "completion_length": 241.78125, "epoch": 1.608, "grad_norm": 0.9499295353889465, "kl": 0.0531005859375, "learning_rate": 3.725e-07, "loss": 0.0005, "reward": 3.9388747215270996, "reward_std": 0.008348907809704542, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.9510295391082764, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9992088675498962, "step": 503 }, { "completion_length": 233.25, "epoch": 1.6112, "grad_norm": 1.0857101678848267, "kl": 0.062744140625, "learning_rate": 3.7125e-07, "loss": 0.0006, "reward": 3.958517551422119, "reward_std": 0.0058578201569616795, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.958990752696991, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995267689228058, "step": 504 }, { "completion_length": 251.78125, "epoch": 1.6143999999999998, "grad_norm": 28.171039581298828, "kl": 0.114013671875, "learning_rate": 3.7e-07, "loss": 0.0011, "reward": 3.866329312324524, "reward_std": 0.01942992489784956, "rewards/answer_entity_reward": 0.9720904231071472, "rewards/answer_wer_reward": 0.8955735862255096, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986652433872223, "step": 505 }, { "completion_length": 186.46875, "epoch": 1.6176, "grad_norm": 6.638906955718994, "kl": 0.06884765625, "learning_rate": 3.6875e-07, "loss": 0.0007, "reward": 3.7806142568588257, "reward_std": 0.013823950197547674, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.945627748966217, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8378273248672485, "step": 506 }, { "completion_length": 225.375, "epoch": 1.6208, "grad_norm": 2.12021803855896, "kl": 0.07177734375, "learning_rate": 3.675e-07, "loss": 0.0007, "reward": 3.9451769590377808, "reward_std": 0.013169697020202875, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9672558605670929, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.977921187877655, "step": 507 }, { "completion_length": 219.125, "epoch": 1.624, "grad_norm": 1.5153933763504028, "kl": 0.053955078125, "learning_rate": 3.6625e-07, "loss": 0.0005, "reward": 3.959490180015564, "reward_std": 0.010949777672067285, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9636567533016205, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 508 }, { "completion_length": 228.4375, "epoch": 1.6272, "grad_norm": 3.832310676574707, "kl": 0.0521240234375, "learning_rate": 3.65e-07, "loss": 0.0005, "reward": 3.953840732574463, "reward_std": 0.017153040505945683, "rewards/answer_entity_reward": 0.9936868846416473, "rewards/answer_wer_reward": 0.9603707194328308, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997829794883728, "step": 509 }, { "completion_length": 243.46875, "epoch": 1.6303999999999998, "grad_norm": 1.285962462425232, "kl": 0.0673828125, "learning_rate": 3.6375e-07, "loss": 0.0007, "reward": 3.960462808609009, "reward_std": 0.0062334975227713585, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9608500599861145, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996127486228943, "step": 510 }, { "completion_length": 262.65625, "epoch": 1.6336, "grad_norm": 1.124130368232727, "kl": 0.0596923828125, "learning_rate": 3.6249999999999997e-07, "loss": 0.0006, "reward": 3.941042900085449, "reward_std": 0.01204587472602725, "rewards/answer_entity_reward": 0.9970238208770752, "rewards/answer_wer_reward": 0.9446144104003906, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994047582149506, "step": 511 }, { "completion_length": 182.28125, "epoch": 1.6368, "grad_norm": 1.9966425895690918, "kl": 0.061279296875, "learning_rate": 3.6125e-07, "loss": 0.0006, "reward": 3.9531023502349854, "reward_std": 0.02773769712075591, "rewards/answer_entity_reward": 0.9917200803756714, "rewards/answer_wer_reward": 0.9697157144546509, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9916666746139526, "step": 512 }, { "completion_length": 218.125, "epoch": 1.6400000000000001, "grad_norm": 3.2862062454223633, "kl": 0.04736328125, "learning_rate": 3.6e-07, "loss": 0.0005, "reward": 3.858319878578186, "reward_std": 0.07778534758836031, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9565341770648956, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.90625, "step": 513 }, { "completion_length": 235.03125, "epoch": 1.6432, "grad_norm": 1.14111328125, "kl": 0.054443359375, "learning_rate": 3.5875e-07, "loss": 0.0005, "reward": 3.967674970626831, "reward_std": 0.0044005257077515125, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9691169261932373, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985580444335938, "step": 514 }, { "completion_length": 233.90625, "epoch": 1.6463999999999999, "grad_norm": 1.2006644010543823, "kl": 0.06103515625, "learning_rate": 3.5749999999999997e-07, "loss": 0.0006, "reward": 3.959411859512329, "reward_std": 0.005820953520014882, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9596619009971619, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999750018119812, "step": 515 }, { "completion_length": 252.5625, "epoch": 1.6496, "grad_norm": 0.7272346615791321, "kl": 0.0428466796875, "learning_rate": 3.5625e-07, "loss": 0.0004, "reward": 3.963356375694275, "reward_std": 0.0036240214249119163, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.964261919260025, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990943968296051, "step": 516 }, { "completion_length": 240.6875, "epoch": 1.6528, "grad_norm": 1.0241456031799316, "kl": 0.0665283203125, "learning_rate": 3.55e-07, "loss": 0.0007, "reward": 3.953768730163574, "reward_std": 0.012724505737423897, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9555812776088715, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9981874525547028, "step": 517 }, { "completion_length": 221.5625, "epoch": 1.6560000000000001, "grad_norm": 0.9653159379959106, "kl": 0.0732421875, "learning_rate": 3.5375e-07, "loss": 0.0007, "reward": 3.928879141807556, "reward_std": 0.03069964610040188, "rewards/answer_entity_reward": 0.9769324958324432, "rewards/answer_wer_reward": 0.9525844156742096, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993622303009033, "step": 518 }, { "completion_length": 186.53125, "epoch": 1.6592, "grad_norm": 1.616326928138733, "kl": 0.0673828125, "learning_rate": 3.5249999999999996e-07, "loss": 0.0007, "reward": 3.963484525680542, "reward_std": 0.0024420777335762978, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9634844958782196, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 519 }, { "completion_length": 197.53125, "epoch": 1.6623999999999999, "grad_norm": 1.1605949401855469, "kl": 0.066162109375, "learning_rate": 3.5124999999999997e-07, "loss": 0.0007, "reward": 3.871947407722473, "reward_std": 0.008121895836666226, "rewards/answer_entity_reward": 0.9832701981067657, "rewards/answer_wer_reward": 0.9628296792507172, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9258474707603455, "step": 520 }, { "completion_length": 199.9375, "epoch": 1.6656, "grad_norm": 2.1799464225769043, "kl": 0.098876953125, "learning_rate": 3.5e-07, "loss": 0.001, "reward": 3.914597272872925, "reward_std": 0.046278308145701885, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9440673291683197, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9705299139022827, "step": 521 }, { "completion_length": 213.78125, "epoch": 1.6688, "grad_norm": 1.8315109014511108, "kl": 0.0609130859375, "learning_rate": 3.4875e-07, "loss": 0.0006, "reward": 3.934143304824829, "reward_std": 0.005300799617543817, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9627971351146698, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9713463485240936, "step": 522 }, { "completion_length": 233.21875, "epoch": 1.6720000000000002, "grad_norm": 2.7353854179382324, "kl": 0.0634765625, "learning_rate": 3.4749999999999996e-07, "loss": 0.0006, "reward": 3.940351963043213, "reward_std": 0.012048345990478992, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9584531188011169, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9818988144397736, "step": 523 }, { "completion_length": 226.8125, "epoch": 1.6752, "grad_norm": 1.2798601388931274, "kl": 0.0517578125, "learning_rate": 3.4624999999999997e-07, "loss": 0.0005, "reward": 3.94057559967041, "reward_std": 0.016422050073742867, "rewards/answer_entity_reward": 0.9859203100204468, "rewards/answer_wer_reward": 0.9546553492546082, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 524 }, { "completion_length": 225.375, "epoch": 1.6784, "grad_norm": 2.434398651123047, "kl": 0.0570068359375, "learning_rate": 3.45e-07, "loss": 0.0006, "reward": 3.9358779191970825, "reward_std": 0.02181497309356928, "rewards/answer_entity_reward": 0.9961080551147461, "rewards/answer_wer_reward": 0.9410910904407501, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986788034439087, "step": 525 }, { "completion_length": 181.21875, "epoch": 1.6816, "grad_norm": 1.322139859199524, "kl": 0.116943359375, "learning_rate": 3.4375e-07, "loss": 0.0012, "reward": 3.946447730064392, "reward_std": 0.007033249130472541, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9464477598667145, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 526 }, { "completion_length": 195.8125, "epoch": 1.6848, "grad_norm": 1.412061333656311, "kl": 0.06640625, "learning_rate": 3.425e-07, "loss": 0.0007, "reward": 3.936468005180359, "reward_std": 0.00922114565037191, "rewards/answer_entity_reward": 0.9841346144676208, "rewards/answer_wer_reward": 0.952333390712738, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 527 }, { "completion_length": 210.84375, "epoch": 1.688, "grad_norm": 3.695819139480591, "kl": 0.056640625, "learning_rate": 3.4124999999999996e-07, "loss": 0.0006, "reward": 3.894517421722412, "reward_std": 0.015210594050586224, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9634661674499512, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9367331266403198, "step": 528 }, { "completion_length": 220.09375, "epoch": 1.6912, "grad_norm": 1.6299357414245605, "kl": 0.0711669921875, "learning_rate": 3.4000000000000003e-07, "loss": 0.0007, "reward": 3.9391125440597534, "reward_std": 0.014290765568148345, "rewards/answer_entity_reward": 0.9847222566604614, "rewards/answer_wer_reward": 0.954390287399292, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 529 }, { "completion_length": 195.65625, "epoch": 1.6944, "grad_norm": 4.491413116455078, "kl": 0.064453125, "learning_rate": 3.3875e-07, "loss": 0.0007, "reward": 3.971281409263611, "reward_std": 0.017785906326025724, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9796920418739319, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9944303929805756, "step": 530 }, { "completion_length": 208.34375, "epoch": 1.6976, "grad_norm": 4.832588195800781, "kl": 0.0972900390625, "learning_rate": 3.375e-07, "loss": 0.001, "reward": 3.9011433124542236, "reward_std": 0.010198547039180994, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9640267491340637, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9371165633201599, "step": 531 }, { "completion_length": 203.40625, "epoch": 1.7008, "grad_norm": 3.4038021564483643, "kl": 0.071044921875, "learning_rate": 3.3624999999999996e-07, "loss": 0.0007, "reward": 3.9605783224105835, "reward_std": 0.0076046837493777275, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9607688188552856, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9998094439506531, "step": 532 }, { "completion_length": 241.8125, "epoch": 1.704, "grad_norm": 1.0362496376037598, "kl": 0.063232421875, "learning_rate": 3.35e-07, "loss": 0.0006, "reward": 3.9339258670806885, "reward_std": 0.018858356634154916, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9387494027614594, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996408224105835, "step": 533 }, { "completion_length": 235.90625, "epoch": 1.7072, "grad_norm": 3.604599714279175, "kl": 0.0853271484375, "learning_rate": 3.3375e-07, "loss": 0.0009, "reward": 3.861118197441101, "reward_std": 0.011326078558340669, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9576848149299622, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9034333229064941, "step": 534 }, { "completion_length": 229.0, "epoch": 1.7104, "grad_norm": 2.319185256958008, "kl": 0.052001953125, "learning_rate": 3.325e-07, "loss": 0.0005, "reward": 3.9228227138519287, "reward_std": 0.03856424614787102, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9560109972953796, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9753345847129822, "step": 535 }, { "completion_length": 224.71875, "epoch": 1.7136, "grad_norm": 2.444124460220337, "kl": 0.080810546875, "learning_rate": 3.3124999999999995e-07, "loss": 0.0008, "reward": 3.9688942432403564, "reward_std": 0.003912239335477352, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9688942730426788, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 536 }, { "completion_length": 224.3125, "epoch": 1.7168, "grad_norm": 6.20790958404541, "kl": 0.064697265625, "learning_rate": 3.3e-07, "loss": 0.0006, "reward": 3.8677161931991577, "reward_std": 0.02981195878237486, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9471929371356964, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9233641624450684, "step": 537 }, { "completion_length": 150.34375, "epoch": 1.72, "grad_norm": 1.6208490133285522, "kl": 0.03924560546875, "learning_rate": 3.2875e-07, "loss": 0.0004, "reward": 3.9733328819274902, "reward_std": 0.002679725643247366, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9733329117298126, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 538 }, { "completion_length": 183.0, "epoch": 1.7231999999999998, "grad_norm": 1.2286797761917114, "kl": 0.057861328125, "learning_rate": 3.275e-07, "loss": 0.0006, "reward": 3.935777187347412, "reward_std": 0.003249647794291377, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9795266687870026, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9562505781650543, "step": 539 }, { "completion_length": 234.625, "epoch": 1.7264, "grad_norm": 1.304764747619629, "kl": 0.054931640625, "learning_rate": 3.2624999999999995e-07, "loss": 0.0005, "reward": 3.950987696647644, "reward_std": 0.00898568145930767, "rewards/answer_entity_reward": 0.9958333373069763, "rewards/answer_wer_reward": 0.9557509124279022, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994034171104431, "step": 540 }, { "completion_length": 183.96875, "epoch": 1.7296, "grad_norm": 1.3975461721420288, "kl": 0.07421875, "learning_rate": 3.25e-07, "loss": 0.0007, "reward": 3.918307065963745, "reward_std": 0.01607332704588771, "rewards/answer_entity_reward": 0.9720314145088196, "rewards/answer_wer_reward": 0.9547825455665588, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9914930462837219, "step": 541 }, { "completion_length": 204.0625, "epoch": 1.7328000000000001, "grad_norm": 2.0030770301818848, "kl": 0.070068359375, "learning_rate": 3.2374999999999997e-07, "loss": 0.0007, "reward": 3.9624624252319336, "reward_std": 0.011391833890229464, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9645456969738007, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 542 }, { "completion_length": 236.8125, "epoch": 1.736, "grad_norm": 1.0529872179031372, "kl": 0.06396484375, "learning_rate": 3.225e-07, "loss": 0.0006, "reward": 3.9355998039245605, "reward_std": 0.011712775565683842, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.946576714515686, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9890230894088745, "step": 543 }, { "completion_length": 171.03125, "epoch": 1.7391999999999999, "grad_norm": 1.4777579307556152, "kl": 0.07861328125, "learning_rate": 3.2124999999999994e-07, "loss": 0.0008, "reward": 3.959132194519043, "reward_std": 0.007866068510338664, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9591321349143982, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 544 }, { "completion_length": 199.03125, "epoch": 1.7424, "grad_norm": 1.5819900035858154, "kl": 0.07666015625, "learning_rate": 3.2e-07, "loss": 0.0008, "reward": 3.9456801414489746, "reward_std": 0.01446144049987197, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9492515921592712, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9985119104385376, "step": 545 }, { "completion_length": 243.53125, "epoch": 1.7456, "grad_norm": 6.461181640625, "kl": 0.1029052734375, "learning_rate": 3.1874999999999997e-07, "loss": 0.001, "reward": 3.9253257513046265, "reward_std": 0.013943355064839125, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9411455988883972, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9980691075325012, "step": 546 }, { "completion_length": 190.21875, "epoch": 1.7488000000000001, "grad_norm": 1.5046278238296509, "kl": 0.0430908203125, "learning_rate": 3.175e-07, "loss": 0.0004, "reward": 3.946847081184387, "reward_std": 0.006090850802138448, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9579125344753265, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9889345765113831, "step": 547 }, { "completion_length": 199.5625, "epoch": 1.752, "grad_norm": 2.7514781951904297, "kl": 0.054931640625, "learning_rate": 3.1624999999999994e-07, "loss": 0.0006, "reward": 3.9198288917541504, "reward_std": 0.008053636411204934, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9198288321495056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 548 }, { "completion_length": 244.625, "epoch": 1.7551999999999999, "grad_norm": 1.0448155403137207, "kl": 0.0426025390625, "learning_rate": 3.15e-07, "loss": 0.0004, "reward": 3.958520531654358, "reward_std": 0.008235724177211523, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9585205316543579, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 549 }, { "completion_length": 249.0, "epoch": 1.7584, "grad_norm": 128.38499450683594, "kl": 17.28076171875, "learning_rate": 3.1374999999999996e-07, "loss": 0.172, "reward": 3.932722330093384, "reward_std": 0.012139817699790001, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9340447783470154, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998677521944046, "step": 550 }, { "completion_length": 202.25, "epoch": 1.7616, "grad_norm": 1.6289058923721313, "kl": 0.0709228515625, "learning_rate": 3.1249999999999997e-07, "loss": 0.0007, "reward": 3.931633234024048, "reward_std": 0.015017563942819834, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9620243012905121, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9752906560897827, "step": 551 }, { "completion_length": 223.65625, "epoch": 1.7648000000000001, "grad_norm": 0.650069534778595, "kl": 0.0467529296875, "learning_rate": 3.1125000000000004e-07, "loss": 0.0005, "reward": 3.9622879028320312, "reward_std": 0.004962240578606725, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9622879028320312, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 552 }, { "completion_length": 238.65625, "epoch": 1.768, "grad_norm": 9.516084671020508, "kl": 0.0474853515625, "learning_rate": 3.1e-07, "loss": 0.0005, "reward": 3.9525749683380127, "reward_std": 0.012759724631905556, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9610438644886017, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977810680866241, "step": 553 }, { "completion_length": 224.65625, "epoch": 1.7711999999999999, "grad_norm": 1.8886899948120117, "kl": 0.044189453125, "learning_rate": 3.0875e-07, "loss": 0.0004, "reward": 3.9586617946624756, "reward_std": 0.01200480293482542, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9675752222537994, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996093809604645, "step": 554 }, { "completion_length": 220.9375, "epoch": 1.7744, "grad_norm": 5.122376918792725, "kl": 0.048828125, "learning_rate": 3.0749999999999997e-07, "loss": 0.0005, "reward": 3.9466060400009155, "reward_std": 0.016119306907057762, "rewards/answer_entity_reward": 0.9965170323848724, "rewards/answer_wer_reward": 0.9567474722862244, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.993341475725174, "step": 555 }, { "completion_length": 198.8125, "epoch": 1.7776, "grad_norm": 4.916889667510986, "kl": 0.068115234375, "learning_rate": 3.0625000000000003e-07, "loss": 0.0007, "reward": 3.949711561203003, "reward_std": 0.0163404387421906, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9576182961463928, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9920931458473206, "step": 556 }, { "completion_length": 180.875, "epoch": 1.7808000000000002, "grad_norm": 10.021855354309082, "kl": 0.072021484375, "learning_rate": 3.05e-07, "loss": 0.0007, "reward": 3.867478370666504, "reward_std": 0.047242360189557076, "rewards/answer_entity_reward": 0.9821428656578064, "rewards/answer_wer_reward": 0.9576010704040527, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.927734375, "step": 557 }, { "completion_length": 227.8125, "epoch": 1.784, "grad_norm": 1.7502044439315796, "kl": 0.04443359375, "learning_rate": 3.0375e-07, "loss": 0.0004, "reward": 3.9525381326675415, "reward_std": 0.013325697276741266, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9532942175865173, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999243974685669, "step": 558 }, { "completion_length": 204.15625, "epoch": 1.7872, "grad_norm": 5.304961681365967, "kl": 0.0496826171875, "learning_rate": 3.0249999999999996e-07, "loss": 0.0005, "reward": 3.957284450531006, "reward_std": 0.005683758878149092, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9572845101356506, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 559 }, { "completion_length": 228.34375, "epoch": 1.7904, "grad_norm": 1.2513984441757202, "kl": 0.0577392578125, "learning_rate": 3.0125000000000003e-07, "loss": 0.0006, "reward": 3.94599187374115, "reward_std": 0.00800859834998846, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.957431435585022, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9885604083538055, "step": 560 }, { "completion_length": 211.03125, "epoch": 1.7936, "grad_norm": 5.97805118560791, "kl": 0.1036376953125, "learning_rate": 3e-07, "loss": 0.001, "reward": 3.9404828548431396, "reward_std": 0.01265423372387886, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9433237612247467, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 561 }, { "completion_length": 205.4375, "epoch": 1.7968, "grad_norm": 3.833575487136841, "kl": 0.22998046875, "learning_rate": 2.9875e-07, "loss": 0.0023, "reward": 3.909332752227783, "reward_std": 0.007294894196093082, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9648370146751404, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9444957971572876, "step": 562 }, { "completion_length": 207.4375, "epoch": 1.8, "grad_norm": 0.8627040982246399, "kl": 0.0611572265625, "learning_rate": 2.9749999999999996e-07, "loss": 0.0006, "reward": 3.9548414945602417, "reward_std": 0.006908831186592579, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9550975561141968, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997438490390778, "step": 563 }, { "completion_length": 198.15625, "epoch": 1.8032, "grad_norm": 0.9193502068519592, "kl": 0.0518798828125, "learning_rate": 2.9625e-07, "loss": 0.0005, "reward": 3.9462149143218994, "reward_std": 0.007913234177976847, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9465437531471252, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996710419654846, "step": 564 }, { "completion_length": 198.15625, "epoch": 1.8064, "grad_norm": 1.9635776281356812, "kl": 0.059814453125, "learning_rate": 2.95e-07, "loss": 0.0006, "reward": 3.896806240081787, "reward_std": 0.012922112364321947, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9503778219223022, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9464285671710968, "step": 565 }, { "completion_length": 164.90625, "epoch": 1.8096, "grad_norm": 1.2068322896957397, "kl": 0.09375, "learning_rate": 2.9375e-07, "loss": 0.0009, "reward": 3.8490008115768433, "reward_std": 0.1467541428282857, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9502907395362854, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9328009486198425, "step": 566 }, { "completion_length": 206.34375, "epoch": 1.8128, "grad_norm": 2.1644375324249268, "kl": 0.08251953125, "learning_rate": 2.9249999999999995e-07, "loss": 0.0008, "reward": 3.970282793045044, "reward_std": 0.0077400594018399715, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9728601574897766, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974226951599121, "step": 567 }, { "completion_length": 233.09375, "epoch": 1.8159999999999998, "grad_norm": 1.106130599975586, "kl": 0.0552978515625, "learning_rate": 2.9125e-07, "loss": 0.0005, "reward": 3.9414994716644287, "reward_std": 0.011295767035335302, "rewards/answer_entity_reward": 0.9848698973655701, "rewards/answer_wer_reward": 0.9577165246009827, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998913049697876, "step": 568 }, { "completion_length": 206.46875, "epoch": 1.8192, "grad_norm": 1.2371478080749512, "kl": 0.0599365234375, "learning_rate": 2.9e-07, "loss": 0.0006, "reward": 3.9829952716827393, "reward_std": 0.007155058206990361, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9829952716827393, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 569 }, { "completion_length": 227.875, "epoch": 1.8224, "grad_norm": 0.9648468494415283, "kl": 0.0587158203125, "learning_rate": 2.8875e-07, "loss": 0.0006, "reward": 3.875002384185791, "reward_std": 0.007613388821482658, "rewards/answer_entity_reward": 0.9604166746139526, "rewards/answer_wer_reward": 0.9299702048301697, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9846153855323792, "step": 570 }, { "completion_length": 242.1875, "epoch": 1.8256000000000001, "grad_norm": 3.7682442665100098, "kl": 0.0732421875, "learning_rate": 2.8749999999999995e-07, "loss": 0.0007, "reward": 3.790624737739563, "reward_std": 0.14343099505640566, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9464230239391327, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.8754517436027527, "step": 571 }, { "completion_length": 248.28125, "epoch": 1.8288, "grad_norm": 0.7550325393676758, "kl": 0.039794921875, "learning_rate": 2.8625e-07, "loss": 0.0004, "reward": 3.9295032024383545, "reward_std": 0.004920503590255976, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9295033514499664, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 572 }, { "completion_length": 222.4375, "epoch": 1.8319999999999999, "grad_norm": 1.055333137512207, "kl": 0.0567626953125, "learning_rate": 2.8499999999999997e-07, "loss": 0.0006, "reward": 3.929059386253357, "reward_std": 0.014613255392760038, "rewards/answer_entity_reward": 0.9819711446762085, "rewards/answer_wer_reward": 0.9496394395828247, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997448742389679, "step": 573 }, { "completion_length": 217.40625, "epoch": 1.8352, "grad_norm": 1.640468716621399, "kl": 0.0443115234375, "learning_rate": 2.8375e-07, "loss": 0.0004, "reward": 3.9705777168273926, "reward_std": 0.013166352873668075, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9736025929450989, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9998161792755127, "step": 574 }, { "completion_length": 229.1875, "epoch": 1.8384, "grad_norm": 3.271684169769287, "kl": 0.0567626953125, "learning_rate": 2.8249999999999994e-07, "loss": 0.0006, "reward": 3.9389246702194214, "reward_std": 0.007664299104362726, "rewards/answer_entity_reward": 0.9833333492279053, "rewards/answer_wer_reward": 0.9555914402008057, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 575 }, { "completion_length": 203.28125, "epoch": 1.8416000000000001, "grad_norm": 1.6847234964370728, "kl": 0.063232421875, "learning_rate": 2.8125e-07, "loss": 0.0006, "reward": 3.9692747592926025, "reward_std": 0.006263851770199835, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9701676964759827, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999107152223587, "step": 576 }, { "completion_length": 251.0625, "epoch": 1.8448, "grad_norm": 4.737148761749268, "kl": 0.128173828125, "learning_rate": 2.8e-07, "loss": 0.0013, "reward": 3.935584545135498, "reward_std": 0.016471964307129383, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9418345093727112, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 577 }, { "completion_length": 199.5, "epoch": 1.8479999999999999, "grad_norm": 1.7424699068069458, "kl": 0.0618896484375, "learning_rate": 2.7875e-07, "loss": 0.0006, "reward": 3.966155171394348, "reward_std": 0.012047166470438242, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9755966663360596, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9905584752559662, "step": 578 }, { "completion_length": 192.96875, "epoch": 1.8512, "grad_norm": 0.8571773171424866, "kl": 0.0526123046875, "learning_rate": 2.775e-07, "loss": 0.0005, "reward": 3.977761387825012, "reward_std": 0.0047087406273931265, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9777614176273346, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 579 }, { "completion_length": 223.6875, "epoch": 1.8544, "grad_norm": 1.3312608003616333, "kl": 0.050537109375, "learning_rate": 2.7625e-07, "loss": 0.0005, "reward": 3.9508321285247803, "reward_std": 0.00891483761370182, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9508320689201355, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 580 }, { "completion_length": 241.96875, "epoch": 1.8576000000000001, "grad_norm": 4.553063869476318, "kl": 0.19140625, "learning_rate": 2.75e-07, "loss": 0.0019, "reward": 3.925418257713318, "reward_std": 0.016543671488761902, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9290946125984192, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 581 }, { "completion_length": 241.90625, "epoch": 1.8608, "grad_norm": 0.8970361948013306, "kl": 0.065185546875, "learning_rate": 2.7374999999999997e-07, "loss": 0.0007, "reward": 3.9467151165008545, "reward_std": 0.007796656806021929, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9470826387405396, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996323585510254, "step": 582 }, { "completion_length": 246.96875, "epoch": 1.8639999999999999, "grad_norm": 1.9463343620300293, "kl": 0.04547119140625, "learning_rate": 2.725e-07, "loss": 0.0005, "reward": 3.940864324569702, "reward_std": 0.011073273373767734, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9416800141334534, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991843402385712, "step": 583 }, { "completion_length": 206.625, "epoch": 1.8672, "grad_norm": 4.5208892822265625, "kl": 0.092529296875, "learning_rate": 2.7125e-07, "loss": 0.0009, "reward": 3.8930487632751465, "reward_std": 0.032747200690209866, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9660382270812988, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9326923191547394, "step": 584 }, { "completion_length": 255.25, "epoch": 1.8704, "grad_norm": 2.1606805324554443, "kl": 0.04736328125, "learning_rate": 2.7e-07, "loss": 0.0005, "reward": 3.936957836151123, "reward_std": 0.013339729979634285, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9393823444843292, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.997575432062149, "step": 585 }, { "completion_length": 226.3125, "epoch": 1.8736000000000002, "grad_norm": 0.7422674298286438, "kl": 0.048095703125, "learning_rate": 2.6874999999999997e-07, "loss": 0.0005, "reward": 3.9866139888763428, "reward_std": 0.0038484669639728963, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.987176924943924, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994369745254517, "step": 586 }, { "completion_length": 214.59375, "epoch": 1.8768, "grad_norm": 1.313864827156067, "kl": 0.0684814453125, "learning_rate": 2.675e-07, "loss": 0.0007, "reward": 3.9567151069641113, "reward_std": 0.012406408437527716, "rewards/answer_entity_reward": 0.9832702279090881, "rewards/answer_wer_reward": 0.9734448790550232, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 587 }, { "completion_length": 256.46875, "epoch": 1.88, "grad_norm": 1.4952497482299805, "kl": 0.1278076171875, "learning_rate": 2.6625e-07, "loss": 0.0013, "reward": 3.8717525005340576, "reward_std": 0.13869436737149954, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9397719204425812, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9660714268684387, "step": 588 }, { "completion_length": 217.09375, "epoch": 1.8832, "grad_norm": 1.3716284036636353, "kl": 0.054931640625, "learning_rate": 2.65e-07, "loss": 0.0006, "reward": 3.962627410888672, "reward_std": 0.006240109680220485, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9626273214817047, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 589 }, { "completion_length": 253.0, "epoch": 1.8864, "grad_norm": 1.4284135103225708, "kl": 0.07080078125, "learning_rate": 2.6374999999999996e-07, "loss": 0.0007, "reward": 3.9501919746398926, "reward_std": 0.012296234723180532, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9531300067901611, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970619678497314, "step": 590 }, { "completion_length": 204.5, "epoch": 1.8896, "grad_norm": 3.8569161891937256, "kl": 0.07421875, "learning_rate": 2.625e-07, "loss": 0.0007, "reward": 3.9426995515823364, "reward_std": 0.027584614232182503, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9779268503189087, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9647727012634277, "step": 591 }, { "completion_length": 229.0625, "epoch": 1.8928, "grad_norm": 2.589956760406494, "kl": 0.08203125, "learning_rate": 2.6125e-07, "loss": 0.0008, "reward": 3.9178069829940796, "reward_std": 0.007971604820340872, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.95549076795578, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9623160660266876, "step": 592 }, { "completion_length": 170.65625, "epoch": 1.896, "grad_norm": 3.586792469024658, "kl": 0.0423583984375, "learning_rate": 2.6e-07, "loss": 0.0004, "reward": 3.9206513166427612, "reward_std": 0.023992381058633327, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9824000000953674, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9382513165473938, "step": 593 }, { "completion_length": 229.34375, "epoch": 1.8992, "grad_norm": 4.520889759063721, "kl": 0.07421875, "learning_rate": 2.5874999999999996e-07, "loss": 0.0007, "reward": 3.942514419555664, "reward_std": 0.038696477888152, "rewards/answer_entity_reward": 0.984275609254837, "rewards/answer_wer_reward": 0.9582389295101166, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 594 }, { "completion_length": 223.4375, "epoch": 1.9024, "grad_norm": 1.3104579448699951, "kl": 0.0565185546875, "learning_rate": 2.5749999999999997e-07, "loss": 0.0006, "reward": 3.976773500442505, "reward_std": 0.0044562743860296905, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9767734706401825, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 595 }, { "completion_length": 254.09375, "epoch": 1.9056, "grad_norm": 1.03975510597229, "kl": 0.05322265625, "learning_rate": 2.5625e-07, "loss": 0.0005, "reward": 3.943529725074768, "reward_std": 0.009816794656217098, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9451378583908081, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9983919560909271, "step": 596 }, { "completion_length": 243.03125, "epoch": 1.9088, "grad_norm": 1.0213077068328857, "kl": 0.0506591796875, "learning_rate": 2.55e-07, "loss": 0.0005, "reward": 3.9278059005737305, "reward_std": 0.00602961634285748, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9420903027057648, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996044337749481, "step": 597 }, { "completion_length": 182.46875, "epoch": 1.912, "grad_norm": 1.8683794736862183, "kl": 0.065185546875, "learning_rate": 2.5374999999999995e-07, "loss": 0.0007, "reward": 3.9624691009521484, "reward_std": 0.012565109878778458, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9729967415332794, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9939365684986115, "step": 598 }, { "completion_length": 166.25, "epoch": 1.9152, "grad_norm": 1.716305136680603, "kl": 0.0968017578125, "learning_rate": 2.5249999999999996e-07, "loss": 0.001, "reward": 3.896498918533325, "reward_std": 0.11676233587786555, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9749563038349152, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9527925550937653, "step": 599 }, { "completion_length": 199.59375, "epoch": 1.9184, "grad_norm": 1.2319942712783813, "kl": 0.0775146484375, "learning_rate": 2.5125e-07, "loss": 0.0008, "reward": 3.9489831924438477, "reward_std": 0.010235858615487814, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9580873548984528, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9908958971500397, "step": 600 }, { "completion_length": 210.53125, "epoch": 1.9216, "grad_norm": 1.0385370254516602, "kl": 0.0650634765625, "learning_rate": 2.5e-07, "loss": 0.0007, "reward": 3.966851830482483, "reward_std": 0.005628936691209674, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9668518006801605, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 601 }, { "completion_length": 186.9375, "epoch": 1.9247999999999998, "grad_norm": 2.1772327423095703, "kl": 0.11279296875, "learning_rate": 2.4875e-07, "loss": 0.0011, "reward": 3.9322038888931274, "reward_std": 0.01743672974407673, "rewards/answer_entity_reward": 0.9880681931972504, "rewards/answer_wer_reward": 0.9574334919452667, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9867021441459656, "step": 602 }, { "completion_length": 209.65625, "epoch": 1.928, "grad_norm": 0.9661850929260254, "kl": 0.072998046875, "learning_rate": 2.475e-07, "loss": 0.0007, "reward": 3.9598844051361084, "reward_std": 0.009228286100551486, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.966718465089798, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994158744812012, "step": 603 }, { "completion_length": 193.65625, "epoch": 1.9312, "grad_norm": 2.6254851818084717, "kl": 0.102294921875, "learning_rate": 2.4624999999999997e-07, "loss": 0.001, "reward": 3.957027792930603, "reward_std": 0.008546661585569382, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9570277333259583, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 604 }, { "completion_length": 219.34375, "epoch": 1.9344000000000001, "grad_norm": 1.0413298606872559, "kl": 0.104736328125, "learning_rate": 2.45e-07, "loss": 0.0011, "reward": 3.9702824354171753, "reward_std": 0.007483657216653228, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9702823162078857, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 605 }, { "completion_length": 157.46875, "epoch": 1.9376, "grad_norm": 2.432849645614624, "kl": 0.14453125, "learning_rate": 2.4375e-07, "loss": 0.0014, "reward": 3.957343101501465, "reward_std": 0.005332180997356772, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.957624614238739, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997184872627258, "step": 606 }, { "completion_length": 248.40625, "epoch": 1.9407999999999999, "grad_norm": 0.8216654062271118, "kl": 0.071044921875, "learning_rate": 2.425e-07, "loss": 0.0007, "reward": 3.9644582271575928, "reward_std": 0.01216787239536643, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9688305556774139, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998031497001648, "step": 607 }, { "completion_length": 218.625, "epoch": 1.944, "grad_norm": 0.9195014834403992, "kl": 0.0545654296875, "learning_rate": 2.4124999999999997e-07, "loss": 0.0005, "reward": 3.972040057182312, "reward_std": 0.004315207479521632, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9726911783218384, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993489682674408, "step": 608 }, { "completion_length": 231.84375, "epoch": 1.9472, "grad_norm": 1.3564932346343994, "kl": 0.06103515625, "learning_rate": 2.4e-07, "loss": 0.0006, "reward": 3.951057553291321, "reward_std": 0.013061597011983395, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.9553851187229156, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993489682674408, "step": 609 }, { "completion_length": 241.4375, "epoch": 1.9504000000000001, "grad_norm": 0.9419238567352295, "kl": 0.051513671875, "learning_rate": 2.3875e-07, "loss": 0.0005, "reward": 3.971252202987671, "reward_std": 0.006067809648811817, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9715149104595184, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997373819351196, "step": 610 }, { "completion_length": 222.09375, "epoch": 1.9536, "grad_norm": 1.4854899644851685, "kl": 0.166748046875, "learning_rate": 2.3749999999999998e-07, "loss": 0.0017, "reward": 3.9489357471466064, "reward_std": 0.012118924409151077, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.948935866355896, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 611 }, { "completion_length": 259.8125, "epoch": 1.9567999999999999, "grad_norm": 2.2286458015441895, "kl": 0.0426025390625, "learning_rate": 2.3625e-07, "loss": 0.0004, "reward": 3.96254563331604, "reward_std": 0.005056597990915179, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9625457525253296, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 612 }, { "completion_length": 209.4375, "epoch": 1.96, "grad_norm": 4.077661514282227, "kl": 0.05615234375, "learning_rate": 2.3499999999999997e-07, "loss": 0.0006, "reward": 3.941632628440857, "reward_std": 0.01233140891417861, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9416325688362122, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 613 }, { "completion_length": 221.71875, "epoch": 1.9632, "grad_norm": 0.7665371298789978, "kl": 0.0555419921875, "learning_rate": 2.3375e-07, "loss": 0.0005, "reward": 3.9698644876480103, "reward_std": 0.009979546128306538, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.973064661026001, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996408224105835, "step": 614 }, { "completion_length": 219.875, "epoch": 1.9664000000000001, "grad_norm": 2.4666738510131836, "kl": 0.0546875, "learning_rate": 2.325e-07, "loss": 0.0005, "reward": 3.9548712968826294, "reward_std": 0.011192699894309044, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9553521871566772, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995192289352417, "step": 615 }, { "completion_length": 235.0, "epoch": 1.9696, "grad_norm": 1.5382620096206665, "kl": 0.044921875, "learning_rate": 2.3125e-07, "loss": 0.0005, "reward": 3.9565550088882446, "reward_std": 0.008881408954039216, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9740456640720367, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9825093150138855, "step": 616 }, { "completion_length": 141.09375, "epoch": 1.9727999999999999, "grad_norm": 2.0756258964538574, "kl": 0.0631103515625, "learning_rate": 2.3e-07, "loss": 0.0006, "reward": 3.9571491479873657, "reward_std": 0.005044124089181423, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.980070561170578, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9770786762237549, "step": 617 }, { "completion_length": 222.46875, "epoch": 1.976, "grad_norm": 5.071360111236572, "kl": 0.075927734375, "learning_rate": 2.2875e-07, "loss": 0.0008, "reward": 3.8557703495025635, "reward_std": 0.06493359804153442, "rewards/answer_entity_reward": 0.9847027957439423, "rewards/answer_wer_reward": 0.9706770181655884, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.900390625, "step": 618 }, { "completion_length": 231.125, "epoch": 1.9792, "grad_norm": 1.0749843120574951, "kl": 0.050537109375, "learning_rate": 2.275e-07, "loss": 0.0005, "reward": 3.9660208225250244, "reward_std": 0.0037171735893934965, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9660208523273468, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 619 }, { "completion_length": 252.625, "epoch": 1.9824000000000002, "grad_norm": 1.5367364883422852, "kl": 0.070068359375, "learning_rate": 2.2625e-07, "loss": 0.0007, "reward": 3.946213126182556, "reward_std": 0.01816728012636304, "rewards/answer_entity_reward": 0.9867424070835114, "rewards/answer_wer_reward": 0.9616928696632385, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977777898311615, "step": 620 }, { "completion_length": 239.34375, "epoch": 1.9856, "grad_norm": 2.541694164276123, "kl": 0.142578125, "learning_rate": 2.25e-07, "loss": 0.0014, "reward": 3.947938561439514, "reward_std": 0.009988004341721535, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9479385614395142, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 621 }, { "completion_length": 224.65625, "epoch": 1.9888, "grad_norm": 1.3821133375167847, "kl": 0.075927734375, "learning_rate": 2.2375e-07, "loss": 0.0007, "reward": 3.953581690788269, "reward_std": 0.006479294504970312, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.953581839799881, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 622 }, { "completion_length": 206.3125, "epoch": 1.992, "grad_norm": 1.0023412704467773, "kl": 0.13232421875, "learning_rate": 2.225e-07, "loss": 0.0013, "reward": 3.8949310779571533, "reward_std": 0.006026371265761554, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9634793996810913, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9314516186714172, "step": 623 }, { "completion_length": 179.96875, "epoch": 1.9952, "grad_norm": 1.534476637840271, "kl": 0.078125, "learning_rate": 2.2125e-07, "loss": 0.0008, "reward": 3.966533660888672, "reward_std": 0.008991609327495098, "rewards/answer_entity_reward": 0.9950658082962036, "rewards/answer_wer_reward": 0.9756669104099274, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9958009123802185, "step": 624 }, { "completion_length": 232.75, "epoch": 1.9984, "grad_norm": 0.7324752807617188, "kl": 0.0499267578125, "learning_rate": 2.1999999999999998e-07, "loss": 0.0005, "reward": 3.946596384048462, "reward_std": 0.011123172473162413, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9492979049682617, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997023940086365, "step": 625 }, { "completion_length": 176.0625, "epoch": 2.0, "grad_norm": 0.33141908049583435, "kl": 0.06005859375, "learning_rate": 2.1875e-07, "loss": 0.0003, "reward": 3.9717535972595215, "reward_std": 0.012056672014296055, "rewards/answer_entity_reward": 0.9963235259056091, "rewards/answer_wer_reward": 0.975429892539978, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 626 }, { "completion_length": 232.21875, "epoch": 2.0032, "grad_norm": 0.8334391117095947, "kl": 0.0457763671875, "learning_rate": 2.1749999999999998e-07, "loss": 0.0004, "reward": 3.970544457435608, "reward_std": 0.003736199578270316, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9705445766448975, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 627 }, { "completion_length": 173.375, "epoch": 2.0064, "grad_norm": 0.965114951133728, "kl": 0.067626953125, "learning_rate": 2.1625e-07, "loss": 0.0007, "reward": 3.974756956100464, "reward_std": 0.004756669281050563, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9788074791431427, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9959495067596436, "step": 628 }, { "completion_length": 222.15625, "epoch": 2.0096, "grad_norm": 2.102520227432251, "kl": 0.0474853515625, "learning_rate": 2.1499999999999998e-07, "loss": 0.0005, "reward": 3.938779830932617, "reward_std": 0.01813220279291272, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9791045486927032, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9596752524375916, "step": 629 }, { "completion_length": 206.40625, "epoch": 2.0128, "grad_norm": 1.3867822885513306, "kl": 0.095458984375, "learning_rate": 2.1375e-07, "loss": 0.001, "reward": 3.977003812789917, "reward_std": 0.003467106493189931, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9772301912307739, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997735619544983, "step": 630 }, { "completion_length": 237.625, "epoch": 2.016, "grad_norm": 1.2721437215805054, "kl": 0.0576171875, "learning_rate": 2.1249999999999998e-07, "loss": 0.0006, "reward": 3.96044921875, "reward_std": 0.007887857500463724, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9609974026679993, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999451756477356, "step": 631 }, { "completion_length": 190.65625, "epoch": 2.0192, "grad_norm": 1.6940927505493164, "kl": 0.170166015625, "learning_rate": 2.1125e-07, "loss": 0.0017, "reward": 3.92085862159729, "reward_std": 0.012093114666640759, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9635953307151794, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9572633504867554, "step": 632 }, { "completion_length": 213.75, "epoch": 2.0224, "grad_norm": 1.3798060417175293, "kl": 0.0552978515625, "learning_rate": 2.0999999999999997e-07, "loss": 0.0006, "reward": 3.9467806816101074, "reward_std": 0.00452708825469017, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9470699727535248, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997106492519379, "step": 633 }, { "completion_length": 193.5625, "epoch": 2.0256, "grad_norm": 1.5375889539718628, "kl": 0.046875, "learning_rate": 2.0874999999999999e-07, "loss": 0.0005, "reward": 3.9730241298675537, "reward_std": 0.006102013634517789, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9743154048919678, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987086653709412, "step": 634 }, { "completion_length": 204.09375, "epoch": 2.0288, "grad_norm": 1.0933163166046143, "kl": 0.09228515625, "learning_rate": 2.0749999999999997e-07, "loss": 0.0009, "reward": 3.9593019485473633, "reward_std": 0.008372287498787045, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9602685272693634, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999033510684967, "step": 635 }, { "completion_length": 186.875, "epoch": 2.032, "grad_norm": 3.5551085472106934, "kl": 0.085205078125, "learning_rate": 2.0624999999999998e-07, "loss": 0.0008, "reward": 3.937085270881653, "reward_std": 0.028064538724720478, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9683353006839752, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9687500298023224, "step": 636 }, { "completion_length": 228.875, "epoch": 2.0352, "grad_norm": 0.9865986108779907, "kl": 0.0728759765625, "learning_rate": 2.0499999999999997e-07, "loss": 0.0007, "reward": 3.9492111206054688, "reward_std": 0.007756081875413656, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9575444757938385, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 637 }, { "completion_length": 212.28125, "epoch": 2.0384, "grad_norm": 3.542672872543335, "kl": 0.110107421875, "learning_rate": 2.0374999999999998e-07, "loss": 0.0011, "reward": 3.9374581575393677, "reward_std": 0.009235690347850323, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9742424190044403, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9652990996837616, "step": 638 }, { "completion_length": 232.0, "epoch": 2.0416, "grad_norm": 1.4940472841262817, "kl": 0.0565185546875, "learning_rate": 2.025e-07, "loss": 0.0006, "reward": 3.947740077972412, "reward_std": 0.006069941911846399, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9616289734840393, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 639 }, { "completion_length": 214.46875, "epoch": 2.0448, "grad_norm": 1.0322229862213135, "kl": 0.0865478515625, "learning_rate": 2.0125e-07, "loss": 0.0009, "reward": 3.973870038986206, "reward_std": 0.005974382860586047, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9738699197769165, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 640 }, { "completion_length": 175.71875, "epoch": 2.048, "grad_norm": 2.1991164684295654, "kl": 0.0986328125, "learning_rate": 2e-07, "loss": 0.001, "reward": 3.9478849172592163, "reward_std": 0.012253349646925926, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9485794901847839, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993055462837219, "step": 641 }, { "completion_length": 202.3125, "epoch": 2.0512, "grad_norm": 2.254936456680298, "kl": 0.0758056640625, "learning_rate": 1.9875e-07, "loss": 0.0008, "reward": 3.9462071657180786, "reward_std": 0.007457165978848934, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9462071061134338, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 642 }, { "completion_length": 205.03125, "epoch": 2.0544, "grad_norm": 2.473928928375244, "kl": 0.079345703125, "learning_rate": 1.975e-07, "loss": 0.0008, "reward": 3.92992103099823, "reward_std": 0.014722079504281282, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9436539113521576, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9919487535953522, "step": 643 }, { "completion_length": 202.3125, "epoch": 2.0576, "grad_norm": 1.5329126119613647, "kl": 0.03643798828125, "learning_rate": 1.9625e-07, "loss": 0.0004, "reward": 3.944863796234131, "reward_std": 0.006489667110145092, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9667904078960419, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9780733287334442, "step": 644 }, { "completion_length": 202.53125, "epoch": 2.0608, "grad_norm": 0.6484522223472595, "kl": 0.04443359375, "learning_rate": 1.9499999999999999e-07, "loss": 0.0004, "reward": 3.975989580154419, "reward_std": 0.0032934267073869705, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9759896695613861, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 645 }, { "completion_length": 248.65625, "epoch": 2.064, "grad_norm": 3.43375301361084, "kl": 0.0609130859375, "learning_rate": 1.9375e-07, "loss": 0.0006, "reward": 3.952019691467285, "reward_std": 0.010596145410090685, "rewards/answer_entity_reward": 0.9983552694320679, "rewards/answer_wer_reward": 0.9558849632740021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9977796375751495, "step": 646 }, { "completion_length": 209.40625, "epoch": 2.0672, "grad_norm": 1.1015528440475464, "kl": 0.057373046875, "learning_rate": 1.9249999999999998e-07, "loss": 0.0006, "reward": 3.9535114765167236, "reward_std": 0.0073295624461025, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9535112977027893, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 647 }, { "completion_length": 247.15625, "epoch": 2.0704, "grad_norm": 5.493063449859619, "kl": 0.052490234375, "learning_rate": 1.9125e-07, "loss": 0.0005, "reward": 3.959768056869507, "reward_std": 0.009880491998046637, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9597680270671844, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 648 }, { "completion_length": 190.3125, "epoch": 2.0736, "grad_norm": 3.042928457260132, "kl": 0.070556640625, "learning_rate": 1.8999999999999998e-07, "loss": 0.0007, "reward": 3.935302972793579, "reward_std": 0.008418679004535079, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9707636535167694, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9784283638000488, "step": 649 }, { "completion_length": 240.1875, "epoch": 2.0768, "grad_norm": 1.1801666021347046, "kl": 0.068359375, "learning_rate": 1.8875e-07, "loss": 0.0007, "reward": 3.944392442703247, "reward_std": 0.008859490510076284, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9443924725055695, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 650 }, { "completion_length": 212.0625, "epoch": 2.08, "grad_norm": 1.1967086791992188, "kl": 0.072021484375, "learning_rate": 1.875e-07, "loss": 0.0007, "reward": 3.96494197845459, "reward_std": 0.011900570709258318, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9674758613109589, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9974662065505981, "step": 651 }, { "completion_length": 179.90625, "epoch": 2.0832, "grad_norm": 2.0556278228759766, "kl": 0.056640625, "learning_rate": 1.8625e-07, "loss": 0.0006, "reward": 3.925339102745056, "reward_std": 0.005963671952486038, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9453259110450745, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9800131618976593, "step": 652 }, { "completion_length": 232.1875, "epoch": 2.0864, "grad_norm": 1.1875349283218384, "kl": 0.076171875, "learning_rate": 1.85e-07, "loss": 0.0008, "reward": 3.9718481302261353, "reward_std": 0.01158686971757561, "rewards/answer_entity_reward": 0.9955128133296967, "rewards/answer_wer_reward": 0.9763352572917938, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 653 }, { "completion_length": 222.65625, "epoch": 2.0896, "grad_norm": 2.1682872772216797, "kl": 0.09423828125, "learning_rate": 1.8375e-07, "loss": 0.0009, "reward": 3.94124174118042, "reward_std": 0.008590340381488204, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9508572518825531, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 654 }, { "completion_length": 173.03125, "epoch": 2.0928, "grad_norm": 2.1240601539611816, "kl": 0.066162109375, "learning_rate": 1.825e-07, "loss": 0.0007, "reward": 3.9930202960968018, "reward_std": 0.0026576630771160126, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9934512376785278, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9995689690113068, "step": 655 }, { "completion_length": 177.09375, "epoch": 2.096, "grad_norm": 4.589439868927002, "kl": 0.083984375, "learning_rate": 1.8124999999999999e-07, "loss": 0.0008, "reward": 3.7905973196029663, "reward_std": 0.05029802396893501, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9605589509010315, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8300382792949677, "step": 656 }, { "completion_length": 182.5, "epoch": 2.0992, "grad_norm": 2.9955060482025146, "kl": 0.0601806640625, "learning_rate": 1.8e-07, "loss": 0.0006, "reward": 3.959343194961548, "reward_std": 0.010165283223614097, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9634606242179871, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.995882511138916, "step": 657 }, { "completion_length": 247.53125, "epoch": 2.1024, "grad_norm": 6.366602897644043, "kl": 0.2166748046875, "learning_rate": 1.7874999999999998e-07, "loss": 0.0022, "reward": 3.95376193523407, "reward_std": 0.007726241368800402, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9633772671222687, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 658 }, { "completion_length": 212.8125, "epoch": 2.1056, "grad_norm": 1.1973211765289307, "kl": 0.0445556640625, "learning_rate": 1.775e-07, "loss": 0.0004, "reward": 3.979708194732666, "reward_std": 0.007615833543241024, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9800336956977844, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996744692325592, "step": 659 }, { "completion_length": 244.65625, "epoch": 2.1088, "grad_norm": 1.237342357635498, "kl": 0.063232421875, "learning_rate": 1.7624999999999998e-07, "loss": 0.0006, "reward": 3.9267531633377075, "reward_std": 0.01262162160128355, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.937911719083786, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984567761421204, "step": 660 }, { "completion_length": 211.46875, "epoch": 2.112, "grad_norm": 1.6842882633209229, "kl": 0.0623779296875, "learning_rate": 1.75e-07, "loss": 0.0006, "reward": 3.9610049724578857, "reward_std": 0.008832846768200397, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9619665145874023, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990384578704834, "step": 661 }, { "completion_length": 208.6875, "epoch": 2.1152, "grad_norm": 1.8498320579528809, "kl": 0.0687255859375, "learning_rate": 1.7374999999999998e-07, "loss": 0.0007, "reward": 3.908181667327881, "reward_std": 0.05270358338020742, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9462520182132721, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9654017686843872, "step": 662 }, { "completion_length": 220.15625, "epoch": 2.1184, "grad_norm": 1.3248109817504883, "kl": 0.0576171875, "learning_rate": 1.725e-07, "loss": 0.0006, "reward": 3.977890729904175, "reward_std": 0.0048680840991437435, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9778908789157867, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 663 }, { "completion_length": 203.125, "epoch": 2.1216, "grad_norm": 1.2837951183319092, "kl": 0.0660400390625, "learning_rate": 1.7125e-07, "loss": 0.0007, "reward": 3.951757311820984, "reward_std": 0.01306973909959197, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9517573118209839, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 664 }, { "completion_length": 234.71875, "epoch": 2.1248, "grad_norm": 1.2517513036727905, "kl": 0.072265625, "learning_rate": 1.7000000000000001e-07, "loss": 0.0007, "reward": 3.932037830352783, "reward_std": 0.018653371836990118, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9320378601551056, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 665 }, { "completion_length": 154.4375, "epoch": 2.128, "grad_norm": 1.6812143325805664, "kl": 0.057373046875, "learning_rate": 1.6875e-07, "loss": 0.0006, "reward": 3.933722972869873, "reward_std": 0.004374760144855827, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9603091180324554, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9817472994327545, "step": 666 }, { "completion_length": 194.375, "epoch": 2.1312, "grad_norm": 1.1369833946228027, "kl": 0.10205078125, "learning_rate": 1.675e-07, "loss": 0.001, "reward": 3.948467254638672, "reward_std": 0.013669541105628014, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9588838517665863, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 667 }, { "completion_length": 222.40625, "epoch": 2.1344, "grad_norm": 1.289441466331482, "kl": 0.09716796875, "learning_rate": 1.6625e-07, "loss": 0.001, "reward": 3.938557267189026, "reward_std": 0.005478785838931799, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9577881693840027, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9807692170143127, "step": 668 }, { "completion_length": 185.71875, "epoch": 2.1376, "grad_norm": 1.9890272617340088, "kl": 0.084716796875, "learning_rate": 1.65e-07, "loss": 0.0008, "reward": 3.967849016189575, "reward_std": 0.008760316297411919, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.967848926782608, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 669 }, { "completion_length": 248.46875, "epoch": 2.1408, "grad_norm": 1.1813039779663086, "kl": 0.074462890625, "learning_rate": 1.6375e-07, "loss": 0.0007, "reward": 3.8907772302627563, "reward_std": 0.07307082694023848, "rewards/answer_entity_reward": 0.9749999940395355, "rewards/answer_wer_reward": 0.915777176618576, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 670 }, { "completion_length": 204.09375, "epoch": 2.144, "grad_norm": 1.4091624021530151, "kl": 0.079833984375, "learning_rate": 1.625e-07, "loss": 0.0008, "reward": 3.9357553720474243, "reward_std": 0.018585966899991035, "rewards/answer_entity_reward": 0.9924799501895905, "rewards/answer_wer_reward": 0.9553823173046112, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9878930449485779, "step": 671 }, { "completion_length": 204.15625, "epoch": 2.1471999999999998, "grad_norm": 1.9349714517593384, "kl": 0.0614013671875, "learning_rate": 1.6125e-07, "loss": 0.0006, "reward": 3.963050127029419, "reward_std": 0.011341096367686987, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9657188355922699, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997351765632629, "step": 672 }, { "completion_length": 183.1875, "epoch": 2.1504, "grad_norm": 3.866070508956909, "kl": 0.1171875, "learning_rate": 1.6e-07, "loss": 0.0012, "reward": 3.778456449508667, "reward_std": 0.1051805429160595, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9563734233379364, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8244869709014893, "step": 673 }, { "completion_length": 237.875, "epoch": 2.1536, "grad_norm": 1.3984158039093018, "kl": 0.0478515625, "learning_rate": 1.5875e-07, "loss": 0.0005, "reward": 3.9681609869003296, "reward_std": 0.007229159120470285, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9706325232982635, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993667006492615, "step": 674 }, { "completion_length": 201.9375, "epoch": 2.1568, "grad_norm": 4.475615501403809, "kl": 0.06640625, "learning_rate": 1.575e-07, "loss": 0.0007, "reward": 3.8558905124664307, "reward_std": 0.0662167351692915, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9515935778617859, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.904296875, "step": 675 }, { "completion_length": 199.59375, "epoch": 2.16, "grad_norm": 1.3850592374801636, "kl": 0.042236328125, "learning_rate": 1.5624999999999999e-07, "loss": 0.0004, "reward": 3.9729303121566772, "reward_std": 0.01144796540029347, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9750137031078339, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 676 }, { "completion_length": 198.8125, "epoch": 2.1632, "grad_norm": 0.8988875150680542, "kl": 0.0848388671875, "learning_rate": 1.55e-07, "loss": 0.0008, "reward": 3.9634130001068115, "reward_std": 0.016308533609844744, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.969995379447937, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996675550937653, "step": 677 }, { "completion_length": 242.0, "epoch": 2.1664, "grad_norm": 0.886544406414032, "kl": 0.057861328125, "learning_rate": 1.5374999999999998e-07, "loss": 0.0006, "reward": 3.9666435718536377, "reward_std": 0.009206962306052446, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9666436016559601, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 678 }, { "completion_length": 208.09375, "epoch": 2.1696, "grad_norm": 1.2104874849319458, "kl": 0.0665283203125, "learning_rate": 1.525e-07, "loss": 0.0007, "reward": 3.956413745880127, "reward_std": 0.008385751629248261, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9564136564731598, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 679 }, { "completion_length": 205.65625, "epoch": 2.1728, "grad_norm": 1.4340012073516846, "kl": 0.0653076171875, "learning_rate": 1.5124999999999998e-07, "loss": 0.0007, "reward": 3.9660589694976807, "reward_std": 0.007518206490203738, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9666839838027954, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993749856948853, "step": 680 }, { "completion_length": 243.875, "epoch": 2.176, "grad_norm": 2.6693804264068604, "kl": 0.0611572265625, "learning_rate": 1.5e-07, "loss": 0.0006, "reward": 3.9342352151870728, "reward_std": 0.0278960638679564, "rewards/answer_entity_reward": 0.9851190745830536, "rewards/answer_wer_reward": 0.9509375989437103, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9981784820556641, "step": 681 }, { "completion_length": 247.875, "epoch": 2.1792, "grad_norm": 0.978139340877533, "kl": 0.050537109375, "learning_rate": 1.4874999999999998e-07, "loss": 0.0005, "reward": 3.9769967794418335, "reward_std": 0.006702936254441738, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9769968390464783, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 682 }, { "completion_length": 222.5625, "epoch": 2.1824, "grad_norm": 1.382318139076233, "kl": 0.065185546875, "learning_rate": 1.475e-07, "loss": 0.0007, "reward": 3.9492597579956055, "reward_std": 0.008544785436242819, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9507622122764587, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984976053237915, "step": 683 }, { "completion_length": 219.71875, "epoch": 2.1856, "grad_norm": 2.196531057357788, "kl": 0.0595703125, "learning_rate": 1.4624999999999998e-07, "loss": 0.0006, "reward": 3.9446985721588135, "reward_std": 0.014558171853423119, "rewards/answer_entity_reward": 0.9813033938407898, "rewards/answer_wer_reward": 0.9633950591087341, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 684 }, { "completion_length": 219.5, "epoch": 2.1888, "grad_norm": 1.4868621826171875, "kl": 0.07177734375, "learning_rate": 1.45e-07, "loss": 0.0007, "reward": 3.9446860551834106, "reward_std": 0.010166772175580263, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9451901018619537, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994959831237793, "step": 685 }, { "completion_length": 261.59375, "epoch": 2.192, "grad_norm": 0.8591821789741516, "kl": 0.0595703125, "learning_rate": 1.4374999999999997e-07, "loss": 0.0006, "reward": 3.9277877807617188, "reward_std": 0.010211648885160685, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.930150032043457, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9976378083229065, "step": 686 }, { "completion_length": 205.0625, "epoch": 2.1952, "grad_norm": 0.924826443195343, "kl": 0.0703125, "learning_rate": 1.4249999999999999e-07, "loss": 0.0007, "reward": 3.9727468490600586, "reward_std": 0.006501165917143226, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.972746878862381, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 687 }, { "completion_length": 197.625, "epoch": 2.1984, "grad_norm": 1.508520483970642, "kl": 0.092041015625, "learning_rate": 1.4124999999999997e-07, "loss": 0.0009, "reward": 3.9627835750579834, "reward_std": 0.010947544127702713, "rewards/answer_entity_reward": 0.9930555522441864, "rewards/answer_wer_reward": 0.9707047045230865, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9990234375, "step": 688 }, { "completion_length": 205.09375, "epoch": 2.2016, "grad_norm": 2.3478713035583496, "kl": 0.0712890625, "learning_rate": 1.4e-07, "loss": 0.0007, "reward": 3.933359384536743, "reward_std": 0.008363787084817886, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9626152515411377, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9846329689025879, "step": 689 }, { "completion_length": 225.03125, "epoch": 2.2048, "grad_norm": 1.3916107416152954, "kl": 0.058837890625, "learning_rate": 1.3875e-07, "loss": 0.0006, "reward": 3.9732636213302612, "reward_std": 0.009609260130673647, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9732636511325836, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 690 }, { "completion_length": 152.59375, "epoch": 2.208, "grad_norm": 1.322786808013916, "kl": 0.0557861328125, "learning_rate": 1.375e-07, "loss": 0.0006, "reward": 3.8575568199157715, "reward_std": 0.011282142717391253, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9596264958381653, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9003343284130096, "step": 691 }, { "completion_length": 162.71875, "epoch": 2.2112, "grad_norm": 0.7846171855926514, "kl": 0.0657958984375, "learning_rate": 1.3625e-07, "loss": 0.0007, "reward": 3.9684951305389404, "reward_std": 0.013251218944787979, "rewards/answer_entity_reward": 0.9910714328289032, "rewards/answer_wer_reward": 0.9774238169193268, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 692 }, { "completion_length": 207.65625, "epoch": 2.2144, "grad_norm": 1.7230638265609741, "kl": 0.1243896484375, "learning_rate": 1.35e-07, "loss": 0.0012, "reward": 3.9475139379501343, "reward_std": 0.00949817756190896, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9486435055732727, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998870462179184, "step": 693 }, { "completion_length": 245.96875, "epoch": 2.2176, "grad_norm": 1.5247471332550049, "kl": 0.061767578125, "learning_rate": 1.3375e-07, "loss": 0.0006, "reward": 3.947926878929138, "reward_std": 0.014066703617572784, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9513991177082062, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 694 }, { "completion_length": 222.5625, "epoch": 2.2208, "grad_norm": 1.5721601247787476, "kl": 0.0782470703125, "learning_rate": 1.325e-07, "loss": 0.0008, "reward": 3.903387188911438, "reward_std": 0.005873196758329868, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9640650153160095, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9393221139907837, "step": 695 }, { "completion_length": 187.03125, "epoch": 2.224, "grad_norm": 1.1470870971679688, "kl": 0.0457763671875, "learning_rate": 1.3125e-07, "loss": 0.0005, "reward": 3.9857735633850098, "reward_std": 0.003898413386195898, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9857736229896545, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 696 }, { "completion_length": 202.84375, "epoch": 2.2272, "grad_norm": 2.00569486618042, "kl": 0.077392578125, "learning_rate": 1.3e-07, "loss": 0.0008, "reward": 3.9439765214920044, "reward_std": 0.00677294097840786, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9666953980922699, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9772810935974121, "step": 697 }, { "completion_length": 200.875, "epoch": 2.2304, "grad_norm": 0.5203324556350708, "kl": 0.0533447265625, "learning_rate": 1.2874999999999998e-07, "loss": 0.0005, "reward": 3.981989622116089, "reward_std": 0.003249130444601178, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9819895327091217, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 698 }, { "completion_length": 229.5, "epoch": 2.2336, "grad_norm": 1.028457760810852, "kl": 0.0615234375, "learning_rate": 1.275e-07, "loss": 0.0006, "reward": 3.9699747562408447, "reward_std": 0.007223621942102909, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9699748456478119, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 699 }, { "completion_length": 183.375, "epoch": 2.2368, "grad_norm": 1.1010169982910156, "kl": 0.09619140625, "learning_rate": 1.2624999999999998e-07, "loss": 0.001, "reward": 3.9709969758987427, "reward_std": 0.013876417418941855, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9763848185539246, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.994612067937851, "step": 700 }, { "completion_length": 192.6875, "epoch": 2.24, "grad_norm": 1.9254510402679443, "kl": 0.126708984375, "learning_rate": 1.25e-07, "loss": 0.0013, "reward": 3.9508676528930664, "reward_std": 0.007698251400142908, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9661648571491241, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9943181872367859, "step": 701 }, { "completion_length": 206.6875, "epoch": 2.2432, "grad_norm": 4.035684108734131, "kl": 0.04833984375, "learning_rate": 1.2375e-07, "loss": 0.0005, "reward": 3.9621732234954834, "reward_std": 0.007325239945203066, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.977934330701828, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9842387735843658, "step": 702 }, { "completion_length": 240.78125, "epoch": 2.2464, "grad_norm": 1.4605140686035156, "kl": 0.0582275390625, "learning_rate": 1.225e-07, "loss": 0.0006, "reward": 3.951379179954529, "reward_std": 0.005893495166674256, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9543100893497467, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970689713954926, "step": 703 }, { "completion_length": 190.84375, "epoch": 2.2496, "grad_norm": 0.8877372741699219, "kl": 0.064453125, "learning_rate": 1.2125e-07, "loss": 0.0007, "reward": 3.9827821254730225, "reward_std": 0.003501511411741376, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.983114629983902, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996675550937653, "step": 704 }, { "completion_length": 169.875, "epoch": 2.2528, "grad_norm": 4.669096946716309, "kl": 0.0634765625, "learning_rate": 1.2e-07, "loss": 0.0006, "reward": 3.9501044750213623, "reward_std": 0.00536915916018188, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9683522582054138, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9817522466182709, "step": 705 }, { "completion_length": 208.34375, "epoch": 2.2560000000000002, "grad_norm": 2.4436697959899902, "kl": 0.072998046875, "learning_rate": 1.1874999999999999e-07, "loss": 0.0007, "reward": 3.95159912109375, "reward_std": 0.012246299302205443, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9697677791118622, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9818313717842102, "step": 706 }, { "completion_length": 253.34375, "epoch": 2.2592, "grad_norm": 0.6258556842803955, "kl": 0.0625, "learning_rate": 1.1749999999999999e-07, "loss": 0.0006, "reward": 3.943672776222229, "reward_std": 0.004726027720607817, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9436727464199066, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 707 }, { "completion_length": 187.96875, "epoch": 2.2624, "grad_norm": 2.1608188152313232, "kl": 0.09521484375, "learning_rate": 1.1625e-07, "loss": 0.0009, "reward": 3.9321788549423218, "reward_std": 0.021823766641318798, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9405494034290314, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99609375, "step": 708 }, { "completion_length": 201.0625, "epoch": 2.2656, "grad_norm": 5.012310028076172, "kl": 0.04071044921875, "learning_rate": 1.15e-07, "loss": 0.0004, "reward": 3.9624879360198975, "reward_std": 0.01549163879826665, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9760953187942505, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.989864856004715, "step": 709 }, { "completion_length": 238.71875, "epoch": 2.2688, "grad_norm": 1.1021510362625122, "kl": 0.08154296875, "learning_rate": 1.1375e-07, "loss": 0.0008, "reward": 3.9332664012908936, "reward_std": 0.015113649424165487, "rewards/answer_entity_reward": 0.9832701981067657, "rewards/answer_wer_reward": 0.9499962031841278, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 710 }, { "completion_length": 220.90625, "epoch": 2.2720000000000002, "grad_norm": 1.1716574430465698, "kl": 0.053466796875, "learning_rate": 1.125e-07, "loss": 0.0005, "reward": 3.9751139879226685, "reward_std": 0.007001735270023346, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9751139879226685, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 711 }, { "completion_length": 241.40625, "epoch": 2.2752, "grad_norm": 1.469359278678894, "kl": 0.07275390625, "learning_rate": 1.1125e-07, "loss": 0.0007, "reward": 3.898247718811035, "reward_std": 0.039173625875264406, "rewards/answer_entity_reward": 0.984375, "rewards/answer_wer_reward": 0.9162905812263489, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9975819885730743, "step": 712 }, { "completion_length": 205.25, "epoch": 2.2784, "grad_norm": 0.7749589085578918, "kl": 0.0621337890625, "learning_rate": 1.0999999999999999e-07, "loss": 0.0006, "reward": 3.9739962816238403, "reward_std": 0.0056007420644164085, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9743727445602417, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996234774589539, "step": 713 }, { "completion_length": 206.125, "epoch": 2.2816, "grad_norm": 0.5464848875999451, "kl": 0.04901123046875, "learning_rate": 1.0874999999999999e-07, "loss": 0.0005, "reward": 3.95177161693573, "reward_std": 0.004434725036844611, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9615707993507385, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9998161792755127, "step": 714 }, { "completion_length": 169.09375, "epoch": 2.2848, "grad_norm": 3.133605480194092, "kl": 0.06689453125, "learning_rate": 1.0749999999999999e-07, "loss": 0.0007, "reward": 3.929832339286804, "reward_std": 0.01732827629894018, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9689165651798248, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.960915744304657, "step": 715 }, { "completion_length": 204.59375, "epoch": 2.288, "grad_norm": 0.7156680822372437, "kl": 0.06884765625, "learning_rate": 1.0624999999999999e-07, "loss": 0.0007, "reward": 3.976062059402466, "reward_std": 0.0025083101354539394, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9836839437484741, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9923780560493469, "step": 716 }, { "completion_length": 210.59375, "epoch": 2.2912, "grad_norm": 284.2210998535156, "kl": 0.1416015625, "learning_rate": 1.0499999999999999e-07, "loss": 0.0014, "reward": 3.9028064012527466, "reward_std": 0.016830324195325375, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9614686369895935, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9413377344608307, "step": 717 }, { "completion_length": 232.53125, "epoch": 2.2944, "grad_norm": 1.077739953994751, "kl": 0.08544921875, "learning_rate": 1.0374999999999999e-07, "loss": 0.0009, "reward": 3.9475821256637573, "reward_std": 0.011592368595302105, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9478915929794312, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999690592288971, "step": 718 }, { "completion_length": 217.875, "epoch": 2.2976, "grad_norm": 2.2114531993865967, "kl": 0.195068359375, "learning_rate": 1.0249999999999998e-07, "loss": 0.002, "reward": 3.941352367401123, "reward_std": 0.00652403780259192, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9605833292007446, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9807692170143127, "step": 719 }, { "completion_length": 241.875, "epoch": 2.3008, "grad_norm": 2.330026865005493, "kl": 0.10693359375, "learning_rate": 1.0125e-07, "loss": 0.0011, "reward": 3.8385108709335327, "reward_std": 0.0217201872728765, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9317739605903625, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9067369103431702, "step": 720 }, { "completion_length": 148.15625, "epoch": 2.304, "grad_norm": 6.020991802215576, "kl": 0.0804443359375, "learning_rate": 1e-07, "loss": 0.0008, "reward": 3.9653271436691284, "reward_std": 0.010471278452314436, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9677309989929199, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 721 }, { "completion_length": 242.34375, "epoch": 2.3072, "grad_norm": 1.3827441930770874, "kl": 0.0606689453125, "learning_rate": 9.875e-08, "loss": 0.0006, "reward": 3.9477760791778564, "reward_std": 0.017027822323143482, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9524165093898773, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988317787647247, "step": 722 }, { "completion_length": 182.875, "epoch": 2.3104, "grad_norm": 0.6132823824882507, "kl": 0.0732421875, "learning_rate": 9.749999999999999e-08, "loss": 0.0007, "reward": 3.9824774265289307, "reward_std": 0.0017756590968929231, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9837089478969574, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9987684786319733, "step": 723 }, { "completion_length": 259.21875, "epoch": 2.3136, "grad_norm": 1.0919182300567627, "kl": 0.052001953125, "learning_rate": 9.624999999999999e-08, "loss": 0.0005, "reward": 3.9247913360595703, "reward_std": 0.0157609935849905, "rewards/answer_entity_reward": 0.9692307412624359, "rewards/answer_wer_reward": 0.9555604159832001, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 724 }, { "completion_length": 243.21875, "epoch": 2.3168, "grad_norm": 1.7886172533035278, "kl": 0.04718017578125, "learning_rate": 9.499999999999999e-08, "loss": 0.0005, "reward": 3.9662917852401733, "reward_std": 0.005910404259338975, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9665379524230957, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997539520263672, "step": 725 }, { "completion_length": 201.65625, "epoch": 2.32, "grad_norm": 1.3444185256958008, "kl": 0.0606689453125, "learning_rate": 9.375e-08, "loss": 0.0006, "reward": 3.9709818363189697, "reward_std": 0.00892023229971528, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9738226532936096, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 726 }, { "completion_length": 224.1875, "epoch": 2.3232, "grad_norm": 4.107091426849365, "kl": 0.229736328125, "learning_rate": 9.25e-08, "loss": 0.0023, "reward": 3.9483840465545654, "reward_std": 0.013201091904193163, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.955822080373764, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997735619544983, "step": 727 }, { "completion_length": 189.75, "epoch": 2.3264, "grad_norm": 1.512626051902771, "kl": 0.0589599609375, "learning_rate": 9.125e-08, "loss": 0.0006, "reward": 3.9542768001556396, "reward_std": 0.008582692593336105, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9702657759189606, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.984011173248291, "step": 728 }, { "completion_length": 172.8125, "epoch": 2.3296, "grad_norm": 4.1475830078125, "kl": 0.110107421875, "learning_rate": 9e-08, "loss": 0.0011, "reward": 3.9462348222732544, "reward_std": 0.009323009755462408, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9772224724292755, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9690123498439789, "step": 729 }, { "completion_length": 198.84375, "epoch": 2.3327999999999998, "grad_norm": 1.3541475534439087, "kl": 0.045166015625, "learning_rate": 8.875e-08, "loss": 0.0005, "reward": 3.9697635173797607, "reward_std": 0.00771446293219924, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9707715511322021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989919364452362, "step": 730 }, { "completion_length": 217.71875, "epoch": 2.336, "grad_norm": 1.2064177989959717, "kl": 0.05908203125, "learning_rate": 8.75e-08, "loss": 0.0006, "reward": 3.9431110620498657, "reward_std": 0.01243708049878478, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9462102055549622, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9969007968902588, "step": 731 }, { "completion_length": 208.0, "epoch": 2.3392, "grad_norm": 1.1856428384780884, "kl": 0.048095703125, "learning_rate": 8.625e-08, "loss": 0.0005, "reward": 3.955425500869751, "reward_std": 0.013023892883211374, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9704216420650482, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9850037693977356, "step": 732 }, { "completion_length": 230.46875, "epoch": 2.3424, "grad_norm": 7.96836519241333, "kl": 0.0765380859375, "learning_rate": 8.500000000000001e-08, "loss": 0.0008, "reward": 3.8350234031677246, "reward_std": 0.0071187918074429035, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9640994668006897, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.8709239065647125, "step": 733 }, { "completion_length": 240.46875, "epoch": 2.3456, "grad_norm": 1.9817602634429932, "kl": 0.067138671875, "learning_rate": 8.375e-08, "loss": 0.0007, "reward": 3.8598886728286743, "reward_std": 0.009870891459286213, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9313421249389648, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9285465180873871, "step": 734 }, { "completion_length": 232.625, "epoch": 2.3487999999999998, "grad_norm": 1.4039250612258911, "kl": 0.05126953125, "learning_rate": 8.25e-08, "loss": 0.0005, "reward": 3.9484113454818726, "reward_std": 0.011133690131828189, "rewards/answer_entity_reward": 0.9971590936183929, "rewards/answer_wer_reward": 0.9577626585960388, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9934895932674408, "step": 735 }, { "completion_length": 168.15625, "epoch": 2.352, "grad_norm": 0.8416581153869629, "kl": 0.068359375, "learning_rate": 8.125e-08, "loss": 0.0007, "reward": 3.9322515726089478, "reward_std": 0.002792949788272381, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9946084916591644, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9376430511474609, "step": 736 }, { "completion_length": 232.375, "epoch": 2.3552, "grad_norm": 1.3709439039230347, "kl": 0.068359375, "learning_rate": 8e-08, "loss": 0.0007, "reward": 3.9093856811523438, "reward_std": 0.0034298759419471025, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.971885621547699, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9375, "step": 737 }, { "completion_length": 201.15625, "epoch": 2.3584, "grad_norm": 0.9587724804878235, "kl": 0.0657958984375, "learning_rate": 7.875e-08, "loss": 0.0007, "reward": 3.960189461708069, "reward_std": 0.017379604279994965, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9656778275966644, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9979838728904724, "step": 738 }, { "completion_length": 204.3125, "epoch": 2.3616, "grad_norm": 1.5729237794876099, "kl": 0.075439453125, "learning_rate": 7.75e-08, "loss": 0.0007, "reward": 3.9626389741897583, "reward_std": 0.01823890022933483, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9661112725734711, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 739 }, { "completion_length": 239.875, "epoch": 2.3648, "grad_norm": 0.9296643733978271, "kl": 0.064208984375, "learning_rate": 7.625e-08, "loss": 0.0006, "reward": 3.968054413795471, "reward_std": 0.0051011774921789765, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9680543541908264, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 740 }, { "completion_length": 242.71875, "epoch": 2.368, "grad_norm": 0.9536841511726379, "kl": 0.0606689453125, "learning_rate": 7.5e-08, "loss": 0.0006, "reward": 3.9280422925949097, "reward_std": 0.005676981760188937, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9430340826511383, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988970458507538, "step": 741 }, { "completion_length": 239.59375, "epoch": 2.3712, "grad_norm": 1.1191787719726562, "kl": 0.0565185546875, "learning_rate": 7.375e-08, "loss": 0.0006, "reward": 3.9627801179885864, "reward_std": 0.004723543883301318, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9627801775932312, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 742 }, { "completion_length": 198.125, "epoch": 2.3744, "grad_norm": 19.45572280883789, "kl": 0.0677490234375, "learning_rate": 7.25e-08, "loss": 0.0007, "reward": 3.8835959434509277, "reward_std": 0.0259452061727643, "rewards/answer_entity_reward": 0.9975961446762085, "rewards/answer_wer_reward": 0.9579322040081024, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9280676245689392, "step": 743 }, { "completion_length": 176.875, "epoch": 2.3776, "grad_norm": 2.2377281188964844, "kl": 0.090087890625, "learning_rate": 7.124999999999999e-08, "loss": 0.0009, "reward": 3.9422539472579956, "reward_std": 0.039653101935982704, "rewards/answer_entity_reward": 0.9895833134651184, "rewards/answer_wer_reward": 0.9664814472198486, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.986189216375351, "step": 744 }, { "completion_length": 229.1875, "epoch": 2.3808, "grad_norm": 1.561314344406128, "kl": 0.0491943359375, "learning_rate": 7e-08, "loss": 0.0005, "reward": 3.8669506311416626, "reward_std": 0.19146580225788057, "rewards/answer_entity_reward": 0.96875, "rewards/answer_wer_reward": 0.9294506311416626, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 1.0, "step": 745 }, { "completion_length": 192.875, "epoch": 2.384, "grad_norm": 1.9305033683776855, "kl": 0.078857421875, "learning_rate": 6.875e-08, "loss": 0.0008, "reward": 3.944983959197998, "reward_std": 0.012190061155706644, "rewards/answer_entity_reward": 0.9979166686534882, "rewards/answer_wer_reward": 0.9473004341125488, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997667968273163, "step": 746 }, { "completion_length": 214.75, "epoch": 2.3872, "grad_norm": 13.16278076171875, "kl": 0.0552978515625, "learning_rate": 6.75e-08, "loss": 0.0006, "reward": 3.981534004211426, "reward_std": 0.016841471777297556, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.989596426486969, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.991937518119812, "step": 747 }, { "completion_length": 202.65625, "epoch": 2.3904, "grad_norm": 1.269473671913147, "kl": 0.0595703125, "learning_rate": 6.625e-08, "loss": 0.0006, "reward": 3.9539172649383545, "reward_std": 0.006352424388751388, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9545792937278748, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993380010128021, "step": 748 }, { "completion_length": 241.9375, "epoch": 2.3936, "grad_norm": 0.799062192440033, "kl": 0.08447265625, "learning_rate": 6.5e-08, "loss": 0.0008, "reward": 3.968814492225647, "reward_std": 0.0058513006661087275, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9696769118309021, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9991375803947449, "step": 749 }, { "completion_length": 175.90625, "epoch": 2.3968, "grad_norm": 1.7988041639328003, "kl": 0.06201171875, "learning_rate": 6.375e-08, "loss": 0.0006, "reward": 3.9838857650756836, "reward_std": 0.0046576057793572545, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9841121137142181, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997735619544983, "step": 750 }, { "completion_length": 215.59375, "epoch": 2.4, "grad_norm": 2.852858781814575, "kl": 0.0533447265625, "learning_rate": 6.25e-08, "loss": 0.0005, "reward": 3.943244457244873, "reward_std": 0.03492546791676432, "rewards/answer_entity_reward": 0.9943181872367859, "rewards/answer_wer_reward": 0.9835853576660156, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9653409123420715, "step": 751 }, { "completion_length": 238.5, "epoch": 2.4032, "grad_norm": 12.164900779724121, "kl": 0.0615234375, "learning_rate": 6.125e-08, "loss": 0.0006, "reward": 3.9755419492721558, "reward_std": 0.010625506052747369, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9782145917415619, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9973272979259491, "step": 752 }, { "completion_length": 179.15625, "epoch": 2.4064, "grad_norm": 0.9550566077232361, "kl": 0.0693359375, "learning_rate": 6e-08, "loss": 0.0007, "reward": 3.954240560531616, "reward_std": 0.011055386741645634, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9720976054668427, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9821428656578064, "step": 753 }, { "completion_length": 216.0625, "epoch": 2.4096, "grad_norm": 1.3647923469543457, "kl": 0.0582275390625, "learning_rate": 5.8749999999999993e-08, "loss": 0.0006, "reward": 3.962032198905945, "reward_std": 0.008129856083542109, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9623997509479523, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996323585510254, "step": 754 }, { "completion_length": 221.8125, "epoch": 2.4128, "grad_norm": 1.9497917890548706, "kl": 0.0604248046875, "learning_rate": 5.75e-08, "loss": 0.0006, "reward": 3.9653851985931396, "reward_std": 0.02012356440536678, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9722139835357666, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9994212985038757, "step": 755 }, { "completion_length": 198.4375, "epoch": 2.416, "grad_norm": 0.6684221029281616, "kl": 0.07568359375, "learning_rate": 5.625e-08, "loss": 0.0008, "reward": 3.942944049835205, "reward_std": 0.008921493077650666, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9674927294254303, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9754513502120972, "step": 756 }, { "completion_length": 234.875, "epoch": 2.4192, "grad_norm": 1.097367525100708, "kl": 0.1142578125, "learning_rate": 5.4999999999999996e-08, "loss": 0.0011, "reward": 3.9485758543014526, "reward_std": 0.01669642748311162, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9544399976730347, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9986002445220947, "step": 757 }, { "completion_length": 150.46875, "epoch": 2.4224, "grad_norm": 0.21660760045051575, "kl": 0.0321044921875, "learning_rate": 5.3749999999999995e-08, "loss": 0.0003, "reward": 3.978167176246643, "reward_std": 0.0010678768157958984, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9781671762466431, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 758 }, { "completion_length": 231.25, "epoch": 2.4256, "grad_norm": 3.330300807952881, "kl": 0.078857421875, "learning_rate": 5.2499999999999994e-08, "loss": 0.0008, "reward": 3.9418994188308716, "reward_std": 0.007436740444973111, "rewards/answer_entity_reward": 0.9861111044883728, "rewards/answer_wer_reward": 0.9557883143424988, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 759 }, { "completion_length": 212.5, "epoch": 2.4288, "grad_norm": 3.427900791168213, "kl": 0.13525390625, "learning_rate": 5.124999999999999e-08, "loss": 0.0014, "reward": 3.9013478755950928, "reward_std": 0.030906156171113253, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9625242948532104, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9388234913349152, "step": 760 }, { "completion_length": 218.90625, "epoch": 2.432, "grad_norm": 1.3307231664657593, "kl": 0.0567626953125, "learning_rate": 5e-08, "loss": 0.0006, "reward": 3.9774084091186523, "reward_std": 0.0034683155827224255, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9774083495140076, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 761 }, { "completion_length": 207.875, "epoch": 2.4352, "grad_norm": 0.7475162148475647, "kl": 0.057373046875, "learning_rate": 4.8749999999999996e-08, "loss": 0.0006, "reward": 3.9419760704040527, "reward_std": 0.004616708727553487, "rewards/answer_entity_reward": 0.9788995385169983, "rewards/answer_wer_reward": 0.9634398818016052, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996366202831268, "step": 762 }, { "completion_length": 195.28125, "epoch": 2.4384, "grad_norm": 2.0728979110717773, "kl": 0.0966796875, "learning_rate": 4.7499999999999995e-08, "loss": 0.001, "reward": 3.944322109222412, "reward_std": 0.017246471252292395, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9587452709674835, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9855769276618958, "step": 763 }, { "completion_length": 224.9375, "epoch": 2.4416, "grad_norm": 1.2122951745986938, "kl": 0.1226806640625, "learning_rate": 4.625e-08, "loss": 0.0012, "reward": 3.9620940685272217, "reward_std": 0.007986569311469793, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9652903079986572, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.996803879737854, "step": 764 }, { "completion_length": 249.03125, "epoch": 2.4448, "grad_norm": 1.21713125705719, "kl": 0.065673828125, "learning_rate": 4.5e-08, "loss": 0.0007, "reward": 3.9346585273742676, "reward_std": 0.006481441203504801, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9346585869789124, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 765 }, { "completion_length": 239.875, "epoch": 2.448, "grad_norm": 5.105895519256592, "kl": 0.0665283203125, "learning_rate": 4.375e-08, "loss": 0.0007, "reward": 3.916127324104309, "reward_std": 0.02047336893156171, "rewards/answer_entity_reward": 0.9910714626312256, "rewards/answer_wer_reward": 0.9319192171096802, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9931367635726929, "step": 766 }, { "completion_length": 216.84375, "epoch": 2.4512, "grad_norm": 3.230001449584961, "kl": 0.0445556640625, "learning_rate": 4.2500000000000003e-08, "loss": 0.0004, "reward": 3.9800050258636475, "reward_std": 0.004955247277393937, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9800049960613251, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 767 }, { "completion_length": 229.8125, "epoch": 2.4544, "grad_norm": 1.2354313135147095, "kl": 0.0478515625, "learning_rate": 4.125e-08, "loss": 0.0005, "reward": 3.9553003311157227, "reward_std": 0.013880819431506097, "rewards/answer_entity_reward": 0.9826388955116272, "rewards/answer_wer_reward": 0.9739912152290344, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.998670220375061, "step": 768 }, { "completion_length": 248.34375, "epoch": 2.4576000000000002, "grad_norm": 0.8089145421981812, "kl": 0.06005859375, "learning_rate": 4e-08, "loss": 0.0006, "reward": 3.9643748998641968, "reward_std": 0.007618119474500418, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9654783606529236, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9988966286182404, "step": 769 }, { "completion_length": 233.53125, "epoch": 2.4608, "grad_norm": 1.2253531217575073, "kl": 0.0540771484375, "learning_rate": 3.875e-08, "loss": 0.0005, "reward": 3.955801010131836, "reward_std": 0.007193901808932424, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9558009505271912, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 770 }, { "completion_length": 246.25, "epoch": 2.464, "grad_norm": 0.8907082080841064, "kl": 0.0740966796875, "learning_rate": 3.75e-08, "loss": 0.0007, "reward": 3.9567649364471436, "reward_std": 0.007558103417977691, "rewards/answer_entity_reward": 0.9926470518112183, "rewards/answer_wer_reward": 0.9644212424755096, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996966123580933, "step": 771 }, { "completion_length": 157.84375, "epoch": 2.4672, "grad_norm": 0.6787045001983643, "kl": 0.080078125, "learning_rate": 3.625e-08, "loss": 0.0008, "reward": 3.989119529724121, "reward_std": 0.0026377947069704533, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9893985092639923, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997209906578064, "step": 772 }, { "completion_length": 228.75, "epoch": 2.4704, "grad_norm": 0.6448482275009155, "kl": 0.0562744140625, "learning_rate": 3.5e-08, "loss": 0.0006, "reward": 3.960241913795471, "reward_std": 0.005235916236415505, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.960241824388504, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 773 }, { "completion_length": 226.5, "epoch": 2.4736000000000002, "grad_norm": 0.9646191596984863, "kl": 0.05224609375, "learning_rate": 3.375e-08, "loss": 0.0005, "reward": 3.9351943731307983, "reward_std": 0.015792422462254763, "rewards/answer_entity_reward": 0.9866071343421936, "rewards/answer_wer_reward": 0.9691915214061737, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9793955981731415, "step": 774 }, { "completion_length": 249.53125, "epoch": 2.4768, "grad_norm": 2.9048826694488525, "kl": 0.0540771484375, "learning_rate": 3.25e-08, "loss": 0.0005, "reward": 3.952099561691284, "reward_std": 0.00629690324421972, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9520994424819946, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 775 }, { "completion_length": 222.21875, "epoch": 2.48, "grad_norm": 1.1555320024490356, "kl": 0.0548095703125, "learning_rate": 3.125e-08, "loss": 0.0005, "reward": 3.9609912633895874, "reward_std": 0.017560790292918682, "rewards/answer_entity_reward": 0.9927884340286255, "rewards/answer_wer_reward": 0.9682029485702515, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 776 }, { "completion_length": 245.46875, "epoch": 2.4832, "grad_norm": 2.5107345581054688, "kl": 0.098388671875, "learning_rate": 3e-08, "loss": 0.001, "reward": 3.9294867515563965, "reward_std": 0.009384696371853352, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.932422935962677, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9970638751983643, "step": 777 }, { "completion_length": 175.1875, "epoch": 2.4864, "grad_norm": 3.319678783416748, "kl": 0.06640625, "learning_rate": 2.875e-08, "loss": 0.0007, "reward": 3.9766006469726562, "reward_std": 0.005283091915771365, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9770888686180115, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.99951171875, "step": 778 }, { "completion_length": 215.8125, "epoch": 2.4896, "grad_norm": 1.7188315391540527, "kl": 0.058837890625, "learning_rate": 2.7499999999999998e-08, "loss": 0.0006, "reward": 3.945501208305359, "reward_std": 0.006351021584123373, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9457343518733978, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9997667968273163, "step": 779 }, { "completion_length": 204.28125, "epoch": 2.4928, "grad_norm": 1.284071683883667, "kl": 0.0640869140625, "learning_rate": 2.6249999999999997e-08, "loss": 0.0006, "reward": 3.9768584966659546, "reward_std": 0.003316762624308467, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9779550433158875, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989035129547119, "step": 780 }, { "completion_length": 216.71875, "epoch": 2.496, "grad_norm": 1.442418098449707, "kl": 0.067138671875, "learning_rate": 2.5e-08, "loss": 0.0007, "reward": 3.945361614227295, "reward_std": 0.03020885493606329, "rewards/answer_entity_reward": 0.9867424070835114, "rewards/answer_wer_reward": 0.9586191177368164, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 781 }, { "completion_length": 199.28125, "epoch": 2.4992, "grad_norm": 2.220127582550049, "kl": 0.071533203125, "learning_rate": 2.3749999999999998e-08, "loss": 0.0007, "reward": 3.945390462875366, "reward_std": 0.012143698055297136, "rewards/answer_entity_reward": 0.9965277910232544, "rewards/answer_wer_reward": 0.9488627314567566, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 782 }, { "completion_length": 220.6875, "epoch": 2.5023999999999997, "grad_norm": 2.2362775802612305, "kl": 0.0634765625, "learning_rate": 2.25e-08, "loss": 0.0006, "reward": 3.960192322731018, "reward_std": 0.006831311853602529, "rewards/answer_entity_reward": 0.9916666746139526, "rewards/answer_wer_reward": 0.9691977500915527, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9993279576301575, "step": 783 }, { "completion_length": 235.5, "epoch": 2.5056000000000003, "grad_norm": 0.9817630052566528, "kl": 0.05224609375, "learning_rate": 2.1250000000000002e-08, "loss": 0.0005, "reward": 3.9702308177948, "reward_std": 0.006825624033808708, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9809376895427704, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9892931282520294, "step": 784 }, { "completion_length": 203.78125, "epoch": 2.5088, "grad_norm": 2.859792947769165, "kl": 0.053955078125, "learning_rate": 2e-08, "loss": 0.0005, "reward": 3.9142426252365112, "reward_std": 0.01792304962873459, "rewards/answer_entity_reward": 0.9944852888584137, "rewards/answer_wer_reward": 0.9792338609695435, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9405233263969421, "step": 785 }, { "completion_length": 224.28125, "epoch": 2.512, "grad_norm": 3.7338051795959473, "kl": 0.060791015625, "learning_rate": 1.875e-08, "loss": 0.0006, "reward": 3.948864221572876, "reward_std": 0.01559874601662159, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9604960083961487, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9883682429790497, "step": 786 }, { "completion_length": 175.1875, "epoch": 2.5152, "grad_norm": 4.41845703125, "kl": 0.083740234375, "learning_rate": 1.75e-08, "loss": 0.0008, "reward": 3.949966311454773, "reward_std": 0.01157908933237195, "rewards/answer_entity_reward": 0.9914772808551788, "rewards/answer_wer_reward": 0.9717868566513062, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9867021441459656, "step": 787 }, { "completion_length": 259.25, "epoch": 2.5183999999999997, "grad_norm": 0.9571487903594971, "kl": 0.0584716796875, "learning_rate": 1.625e-08, "loss": 0.0006, "reward": 3.853899836540222, "reward_std": 0.1917457883246243, "rewards/answer_entity_reward": 0.9654605388641357, "rewards/answer_wer_reward": 0.9225141406059265, "rewards/format_reward": 0.96875, "rewards/think_ocr_reward": 0.9971751868724823, "step": 788 }, { "completion_length": 249.375, "epoch": 2.5216, "grad_norm": 2.86120867729187, "kl": 0.1368408203125, "learning_rate": 1.5e-08, "loss": 0.0014, "reward": 3.9423060417175293, "reward_std": 0.01874951831996441, "rewards/answer_entity_reward": 0.9955357313156128, "rewards/answer_wer_reward": 0.9478386044502258, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9989316165447235, "step": 789 }, { "completion_length": 198.78125, "epoch": 2.5248, "grad_norm": 4.95521879196167, "kl": 0.0611572265625, "learning_rate": 1.3749999999999999e-08, "loss": 0.0006, "reward": 3.915849447250366, "reward_std": 0.016107629984617233, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9824348092079163, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9334145784378052, "step": 790 }, { "completion_length": 184.1875, "epoch": 2.528, "grad_norm": 0.8447386622428894, "kl": 0.0634765625, "learning_rate": 1.25e-08, "loss": 0.0006, "reward": 3.929018259048462, "reward_std": 0.009709671430755407, "rewards/answer_entity_reward": 0.9886363744735718, "rewards/answer_wer_reward": 0.940733015537262, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9996488690376282, "step": 791 }, { "completion_length": 185.59375, "epoch": 2.5312, "grad_norm": 2.6198718547821045, "kl": 0.0439453125, "learning_rate": 1.125e-08, "loss": 0.0004, "reward": 3.9582111835479736, "reward_std": 0.007002702914178371, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9582110941410065, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 792 }, { "completion_length": 197.8125, "epoch": 2.5343999999999998, "grad_norm": 1.3550831079483032, "kl": 0.065185546875, "learning_rate": 1e-08, "loss": 0.0007, "reward": 3.8907723426818848, "reward_std": 0.005525397136807442, "rewards/answer_entity_reward": 0.9903846085071564, "rewards/answer_wer_reward": 0.9687470197677612, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.931640625, "step": 793 }, { "completion_length": 187.375, "epoch": 2.5376, "grad_norm": 1.0252914428710938, "kl": 0.086181640625, "learning_rate": 8.75e-09, "loss": 0.0009, "reward": 3.86617374420166, "reward_std": 0.011230799835175276, "rewards/answer_entity_reward": 0.9981617629528046, "rewards/answer_wer_reward": 0.9432033002376556, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9248086512088776, "step": 794 }, { "completion_length": 220.84375, "epoch": 2.5408, "grad_norm": 3.189028739929199, "kl": 0.05078125, "learning_rate": 7.5e-09, "loss": 0.0005, "reward": 3.9672648906707764, "reward_std": 0.006707400782033801, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.968046098947525, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.999218761920929, "step": 795 }, { "completion_length": 148.875, "epoch": 2.544, "grad_norm": 0.518578052520752, "kl": 0.085693359375, "learning_rate": 6.25e-09, "loss": 0.0009, "reward": 3.8482353687286377, "reward_std": 0.0038536423817276955, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.8497678339481354, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9984675645828247, "step": 796 }, { "completion_length": 193.40625, "epoch": 2.5472, "grad_norm": 0.928065299987793, "kl": 0.081298828125, "learning_rate": 5e-09, "loss": 0.0008, "reward": 3.9669394493103027, "reward_std": 0.013519858941435814, "rewards/answer_entity_reward": 0.9937500059604645, "rewards/answer_wer_reward": 0.9760889112949371, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9971005320549011, "step": 797 }, { "completion_length": 220.0625, "epoch": 2.5504, "grad_norm": 2.7394306659698486, "kl": 0.050537109375, "learning_rate": 3.75e-09, "loss": 0.0005, "reward": 3.972287654876709, "reward_std": 0.0053059973288327456, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9722877740859985, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 798 }, { "completion_length": 221.4375, "epoch": 2.5536, "grad_norm": 3.9942383766174316, "kl": 0.0673828125, "learning_rate": 2.5e-09, "loss": 0.0007, "reward": 3.9344537258148193, "reward_std": 0.01906409254297614, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9436750113964081, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 0.9907786846160889, "step": 799 }, { "completion_length": 231.1875, "epoch": 2.5568, "grad_norm": 2.3216702938079834, "kl": 0.0462646484375, "learning_rate": 1.25e-09, "loss": 0.0005, "reward": 3.959131956100464, "reward_std": 0.005453485995531082, "rewards/answer_entity_reward": 1.0, "rewards/answer_wer_reward": 0.9591320157051086, "rewards/format_reward": 1.0, "rewards/think_ocr_reward": 1.0, "step": 800 } ], "logging_steps": 1, "max_steps": 800, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }