diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,12034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.5568, + "eval_steps": 500, + "global_step": 800, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "completion_length": 175.78125, + "epoch": 0.0032, + "grad_norm": 5.3713698387146, + "kl": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "reward": 2.691648483276367, + "reward_std": 0.9842272102832794, + "rewards/answer_entity_reward": 0.8998827934265137, + "rewards/answer_wer_reward": 0.6144023239612579, + "rewards/format_reward": 0.65625, + "rewards/think_ocr_reward": 0.5211134254932404, + "step": 1 + }, + { + "completion_length": 205.1875, + "epoch": 0.0064, + "grad_norm": 12.984394073486328, + "kl": 0.000339508056640625, + "learning_rate": 9.9875e-07, + "loss": 0.0, + "reward": 2.8287014961242676, + "reward_std": 1.0050830841064453, + "rewards/answer_entity_reward": 0.7303222715854645, + "rewards/answer_wer_reward": 0.47497838735580444, + "rewards/format_reward": 0.875, + "rewards/think_ocr_reward": 0.7484009563922882, + "step": 2 + }, + { + "completion_length": 203.09375, + "epoch": 0.0096, + "grad_norm": 5.166553497314453, + "kl": 0.00044536590576171875, + "learning_rate": 9.975e-07, + "loss": 0.0, + "reward": 3.498788595199585, + "reward_std": 0.2545953020453453, + "rewards/answer_entity_reward": 0.9527146220207214, + "rewards/answer_wer_reward": 0.7393675744533539, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8067062795162201, + "step": 3 + }, + { + "completion_length": 206.1875, + "epoch": 0.0128, + "grad_norm": 2.356685161590576, + "kl": 0.0009002685546875, + "learning_rate": 9.9625e-07, + "loss": 0.0, + "reward": 3.299022078514099, + "reward_std": 0.5456227362155914, + "rewards/answer_entity_reward": 0.8519714176654816, + "rewards/answer_wer_reward": 0.6592651903629303, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.8190353512763977, + "step": 4 + }, + { + "completion_length": 223.28125, + "epoch": 0.016, + "grad_norm": 3.5642409324645996, + "kl": 0.001827239990234375, + "learning_rate": 9.95e-07, + "loss": 0.0, + "reward": 2.8498330116271973, + "reward_std": 0.6001743674278259, + "rewards/answer_entity_reward": 0.8803278803825378, + "rewards/answer_wer_reward": 0.45287495851516724, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.5478802025318146, + "step": 5 + }, + { + "completion_length": 210.28125, + "epoch": 0.0192, + "grad_norm": 2.062991142272949, + "kl": 0.004608154296875, + "learning_rate": 9.9375e-07, + "loss": 0.0, + "reward": 3.345002055168152, + "reward_std": 0.5891430526971817, + "rewards/answer_entity_reward": 0.8334160447120667, + "rewards/answer_wer_reward": 0.7313504219055176, + "rewards/format_reward": 0.875, + "rewards/think_ocr_reward": 0.9052354693412781, + "step": 6 + }, + { + "completion_length": 204.9375, + "epoch": 0.0224, + "grad_norm": 2.77138090133667, + "kl": 0.01922607421875, + "learning_rate": 9.925e-07, + "loss": 0.0002, + "reward": 3.3531779050827026, + "reward_std": 0.7286678552627563, + "rewards/answer_entity_reward": 0.8474657833576202, + "rewards/answer_wer_reward": 0.7306987345218658, + "rewards/format_reward": 0.90625, + "rewards/think_ocr_reward": 0.8687634468078613, + "step": 7 + }, + { + "completion_length": 242.0, + "epoch": 0.0256, + "grad_norm": 1.9377678632736206, + "kl": 0.00897216796875, + "learning_rate": 9.912499999999998e-07, + "loss": 0.0001, + "reward": 3.538244366645813, + "reward_std": 0.26357416808605194, + "rewards/answer_entity_reward": 0.8956374526023865, + "rewards/answer_wer_reward": 0.795194149017334, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.8786628246307373, + "step": 8 + }, + { + "completion_length": 181.28125, + "epoch": 0.0288, + "grad_norm": 2.9018149375915527, + "kl": 0.0250244140625, + "learning_rate": 9.9e-07, + "loss": 0.0002, + "reward": 3.6827263832092285, + "reward_std": 0.21120695769786835, + "rewards/answer_entity_reward": 0.9178647994995117, + "rewards/answer_wer_reward": 0.8329994082450867, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9318622648715973, + "step": 9 + }, + { + "completion_length": 211.1875, + "epoch": 0.032, + "grad_norm": 3.4354376792907715, + "kl": 0.02166748046875, + "learning_rate": 9.8875e-07, + "loss": 0.0002, + "reward": 3.6928374767303467, + "reward_std": 0.21010804921388626, + "rewards/answer_entity_reward": 0.8995116055011749, + "rewards/answer_wer_reward": 0.8549435138702393, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9383824467658997, + "step": 10 + }, + { + "completion_length": 165.40625, + "epoch": 0.0352, + "grad_norm": 5.1537652015686035, + "kl": 0.0521240234375, + "learning_rate": 9.875e-07, + "loss": 0.0005, + "reward": 3.500484824180603, + "reward_std": 0.5196337550878525, + "rewards/answer_entity_reward": 0.9380581974983215, + "rewards/answer_wer_reward": 0.7917109727859497, + "rewards/format_reward": 0.9375, + "rewards/think_ocr_reward": 0.833215594291687, + "step": 11 + }, + { + "completion_length": 223.8125, + "epoch": 0.0384, + "grad_norm": 3.7026002407073975, + "kl": 0.02813720703125, + "learning_rate": 9.862499999999999e-07, + "loss": 0.0003, + "reward": 3.7366983890533447, + "reward_std": 0.19402557611465454, + "rewards/answer_entity_reward": 0.9315968751907349, + "rewards/answer_wer_reward": 0.836162269115448, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9689393639564514, + "step": 12 + }, + { + "completion_length": 201.34375, + "epoch": 0.0416, + "grad_norm": 4.624758243560791, + "kl": 0.0487060546875, + "learning_rate": 9.849999999999999e-07, + "loss": 0.0005, + "reward": 3.6485583782196045, + "reward_std": 0.19490989297628403, + "rewards/answer_entity_reward": 0.9538419842720032, + "rewards/answer_wer_reward": 0.8439803719520569, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.8819859325885773, + "step": 13 + }, + { + "completion_length": 197.53125, + "epoch": 0.0448, + "grad_norm": 5.349609375, + "kl": 0.03363037109375, + "learning_rate": 9.8375e-07, + "loss": 0.0003, + "reward": 3.579698920249939, + "reward_std": 0.12941206991672516, + "rewards/answer_entity_reward": 0.9086007177829742, + "rewards/answer_wer_reward": 0.8474478721618652, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8236501812934875, + "step": 14 + }, + { + "completion_length": 180.5625, + "epoch": 0.048, + "grad_norm": 5.51423454284668, + "kl": 0.0633544921875, + "learning_rate": 9.825e-07, + "loss": 0.0006, + "reward": 3.6973917484283447, + "reward_std": 0.15208109095692635, + "rewards/answer_entity_reward": 0.9153402149677277, + "rewards/answer_wer_reward": 0.8323444426059723, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9497069418430328, + "step": 15 + }, + { + "completion_length": 205.03125, + "epoch": 0.0512, + "grad_norm": 3.2830357551574707, + "kl": 0.059326171875, + "learning_rate": 9.8125e-07, + "loss": 0.0006, + "reward": 3.477460026741028, + "reward_std": 0.43340209126472473, + "rewards/answer_entity_reward": 0.8780590891838074, + "rewards/answer_wer_reward": 0.7556597292423248, + "rewards/format_reward": 0.9375, + "rewards/think_ocr_reward": 0.9062411189079285, + "step": 16 + }, + { + "completion_length": 243.84375, + "epoch": 0.0544, + "grad_norm": 2.257538080215454, + "kl": 0.03240966796875, + "learning_rate": 9.8e-07, + "loss": 0.0003, + "reward": 3.6340386867523193, + "reward_std": 0.15337160229682922, + "rewards/answer_entity_reward": 0.8995862305164337, + "rewards/answer_wer_reward": 0.7731227576732635, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9613295793533325, + "step": 17 + }, + { + "completion_length": 236.125, + "epoch": 0.0576, + "grad_norm": 2.133462429046631, + "kl": 0.0579833984375, + "learning_rate": 9.7875e-07, + "loss": 0.0006, + "reward": 3.730382204055786, + "reward_std": 0.1639438048005104, + "rewards/answer_entity_reward": 0.9158936738967896, + "rewards/answer_wer_reward": 0.8535431623458862, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9609452486038208, + "step": 18 + }, + { + "completion_length": 253.84375, + "epoch": 0.0608, + "grad_norm": 2.6911232471466064, + "kl": 0.042236328125, + "learning_rate": 9.775e-07, + "loss": 0.0004, + "reward": 3.6918214559555054, + "reward_std": 0.24240515753626823, + "rewards/answer_entity_reward": 0.908495306968689, + "rewards/answer_wer_reward": 0.8162411749362946, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9983349442481995, + "step": 19 + }, + { + "completion_length": 195.3125, + "epoch": 0.064, + "grad_norm": 2.856860876083374, + "kl": 0.0548095703125, + "learning_rate": 9.7625e-07, + "loss": 0.0005, + "reward": 3.570927858352661, + "reward_std": 0.38515634275972843, + "rewards/answer_entity_reward": 0.885971337556839, + "rewards/answer_wer_reward": 0.7937527894973755, + "rewards/format_reward": 0.9375, + "rewards/think_ocr_reward": 0.9537037014961243, + "step": 20 + }, + { + "completion_length": 200.21875, + "epoch": 0.0672, + "grad_norm": 2.869398355484009, + "kl": 0.059814453125, + "learning_rate": 9.75e-07, + "loss": 0.0006, + "reward": 3.7599644660949707, + "reward_std": 0.13445724919438362, + "rewards/answer_entity_reward": 0.9744762480258942, + "rewards/answer_wer_reward": 0.8406906425952911, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9447975754737854, + "step": 21 + }, + { + "completion_length": 228.9375, + "epoch": 0.0704, + "grad_norm": 2.2584221363067627, + "kl": 0.03387451171875, + "learning_rate": 9.7375e-07, + "loss": 0.0003, + "reward": 3.5859320163726807, + "reward_std": 0.14986564964056015, + "rewards/answer_entity_reward": 0.9357894659042358, + "rewards/answer_wer_reward": 0.8099571466445923, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8401854038238525, + "step": 22 + }, + { + "completion_length": 219.78125, + "epoch": 0.0736, + "grad_norm": 2.140197277069092, + "kl": 0.0499267578125, + "learning_rate": 9.725e-07, + "loss": 0.0005, + "reward": 3.755205750465393, + "reward_std": 0.09474575892090797, + "rewards/answer_entity_reward": 0.9487689137458801, + "rewards/answer_wer_reward": 0.871625155210495, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9348115921020508, + "step": 23 + }, + { + "completion_length": 206.28125, + "epoch": 0.0768, + "grad_norm": 3.823035478591919, + "kl": 0.13916015625, + "learning_rate": 9.712499999999998e-07, + "loss": 0.0014, + "reward": 3.7580984830856323, + "reward_std": 0.07033384963870049, + "rewards/answer_entity_reward": 0.9635280966758728, + "rewards/answer_wer_reward": 0.8670244812965393, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9275458753108978, + "step": 24 + }, + { + "completion_length": 141.875, + "epoch": 0.08, + "grad_norm": 3.9088714122772217, + "kl": 0.10791015625, + "learning_rate": 9.7e-07, + "loss": 0.0011, + "reward": 3.7762891054153442, + "reward_std": 0.04259665124118328, + "rewards/answer_entity_reward": 0.9848519563674927, + "rewards/answer_wer_reward": 0.8006402850151062, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9907970130443573, + "step": 25 + }, + { + "completion_length": 205.21875, + "epoch": 0.0832, + "grad_norm": 2.103792905807495, + "kl": 0.065185546875, + "learning_rate": 9.6875e-07, + "loss": 0.0007, + "reward": 3.811550498008728, + "reward_std": 0.11633584462106228, + "rewards/answer_entity_reward": 0.9553370177745819, + "rewards/answer_wer_reward": 0.9040265679359436, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9834368824958801, + "step": 26 + }, + { + "completion_length": 228.78125, + "epoch": 0.0864, + "grad_norm": 2.7897403240203857, + "kl": 0.0435791015625, + "learning_rate": 9.675e-07, + "loss": 0.0004, + "reward": 3.788088798522949, + "reward_std": 0.10910476744174957, + "rewards/answer_entity_reward": 0.9546680450439453, + "rewards/answer_wer_reward": 0.872740238904953, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9606806039810181, + "step": 27 + }, + { + "completion_length": 210.5, + "epoch": 0.0896, + "grad_norm": 1.2101320028305054, + "kl": 0.0552978515625, + "learning_rate": 9.6625e-07, + "loss": 0.0006, + "reward": 3.8938169479370117, + "reward_std": 0.04485907219350338, + "rewards/answer_entity_reward": 0.974581778049469, + "rewards/answer_wer_reward": 0.9207929372787476, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984423518180847, + "step": 28 + }, + { + "completion_length": 233.78125, + "epoch": 0.0928, + "grad_norm": 2.7460684776306152, + "kl": 0.035400390625, + "learning_rate": 9.649999999999999e-07, + "loss": 0.0004, + "reward": 3.662728428840637, + "reward_std": 0.20339616388082504, + "rewards/answer_entity_reward": 0.8774791359901428, + "rewards/answer_wer_reward": 0.8000176846981049, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9852316677570343, + "step": 29 + }, + { + "completion_length": 199.59375, + "epoch": 0.096, + "grad_norm": 1.8316643238067627, + "kl": 0.0596923828125, + "learning_rate": 9.637499999999999e-07, + "loss": 0.0006, + "reward": 3.890167713165283, + "reward_std": 0.037449197843670845, + "rewards/answer_entity_reward": 0.96912881731987, + "rewards/answer_wer_reward": 0.9220606982707977, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989781975746155, + "step": 30 + }, + { + "completion_length": 226.125, + "epoch": 0.0992, + "grad_norm": 2.0417702198028564, + "kl": 0.0440673828125, + "learning_rate": 9.624999999999999e-07, + "loss": 0.0004, + "reward": 3.8260613679885864, + "reward_std": 0.07994803786277771, + "rewards/answer_entity_reward": 0.9577426314353943, + "rewards/answer_wer_reward": 0.902205765247345, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9661130309104919, + "step": 31 + }, + { + "completion_length": 214.5, + "epoch": 0.1024, + "grad_norm": 4.027645111083984, + "kl": 0.1015625, + "learning_rate": 9.6125e-07, + "loss": 0.001, + "reward": 3.7394936084747314, + "reward_std": 0.10389792174100876, + "rewards/answer_entity_reward": 0.9218434691429138, + "rewards/answer_wer_reward": 0.8621510863304138, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9554989635944366, + "step": 32 + }, + { + "completion_length": 255.28125, + "epoch": 0.1056, + "grad_norm": 1.527213454246521, + "kl": 0.046875, + "learning_rate": 9.6e-07, + "loss": 0.0005, + "reward": 3.8307132720947266, + "reward_std": 0.0552691500633955, + "rewards/answer_entity_reward": 0.9554121494293213, + "rewards/answer_wer_reward": 0.8765550553798676, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987460970878601, + "step": 33 + }, + { + "completion_length": 226.0, + "epoch": 0.1088, + "grad_norm": 1.822529673576355, + "kl": 0.0372314453125, + "learning_rate": 9.5875e-07, + "loss": 0.0004, + "reward": 3.8188695907592773, + "reward_std": 0.07392234448343515, + "rewards/answer_entity_reward": 0.9491736888885498, + "rewards/answer_wer_reward": 0.8781739175319672, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.991521954536438, + "step": 34 + }, + { + "completion_length": 230.71875, + "epoch": 0.112, + "grad_norm": 1.96689772605896, + "kl": 0.05322265625, + "learning_rate": 9.575e-07, + "loss": 0.0005, + "reward": 3.839812397956848, + "reward_std": 0.04108080454170704, + "rewards/answer_entity_reward": 0.9491481184959412, + "rewards/answer_wer_reward": 0.8918017745018005, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988624751567841, + "step": 35 + }, + { + "completion_length": 181.75, + "epoch": 0.1152, + "grad_norm": 25.535808563232422, + "kl": 0.100830078125, + "learning_rate": 9.5625e-07, + "loss": 0.001, + "reward": 3.8188287019729614, + "reward_std": 0.1601814702153206, + "rewards/answer_entity_reward": 0.9457894563674927, + "rewards/answer_wer_reward": 0.9093815982341766, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9636577069759369, + "step": 36 + }, + { + "completion_length": 165.375, + "epoch": 0.1184, + "grad_norm": 2.886183738708496, + "kl": 0.0692138671875, + "learning_rate": 9.55e-07, + "loss": 0.0007, + "reward": 3.8752315044403076, + "reward_std": 0.04815678671002388, + "rewards/answer_entity_reward": 0.994689553976059, + "rewards/answer_wer_reward": 0.9401271045207977, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9404149055480957, + "step": 37 + }, + { + "completion_length": 250.40625, + "epoch": 0.1216, + "grad_norm": 2.9052975177764893, + "kl": 0.0467529296875, + "learning_rate": 9.5375e-07, + "loss": 0.0005, + "reward": 3.8545405864715576, + "reward_std": 0.04892056295648217, + "rewards/answer_entity_reward": 0.9534467458724976, + "rewards/answer_wer_reward": 0.9035276472568512, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9975661933422089, + "step": 38 + }, + { + "completion_length": 234.125, + "epoch": 0.1248, + "grad_norm": 1.5214505195617676, + "kl": 0.04010009765625, + "learning_rate": 9.525e-07, + "loss": 0.0004, + "reward": 3.7642624378204346, + "reward_std": 0.06860890984535217, + "rewards/answer_entity_reward": 0.9330369234085083, + "rewards/answer_wer_reward": 0.8348780572414398, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9963473677635193, + "step": 39 + }, + { + "completion_length": 222.5, + "epoch": 0.128, + "grad_norm": 1.4751359224319458, + "kl": 0.0521240234375, + "learning_rate": 9.5125e-07, + "loss": 0.0005, + "reward": 3.8170441389083862, + "reward_std": 0.06563596054911613, + "rewards/answer_entity_reward": 0.9340721964836121, + "rewards/answer_wer_reward": 0.8999682068824768, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9830037355422974, + "step": 40 + }, + { + "completion_length": 201.84375, + "epoch": 0.1312, + "grad_norm": 20.2832088470459, + "kl": 0.038818359375, + "learning_rate": 9.499999999999999e-07, + "loss": 0.0004, + "reward": 3.7008172273635864, + "reward_std": 0.039744822308421135, + "rewards/answer_entity_reward": 0.9294143319129944, + "rewards/answer_wer_reward": 0.890234112739563, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8811687231063843, + "step": 41 + }, + { + "completion_length": 192.09375, + "epoch": 0.1344, + "grad_norm": 3.430189609527588, + "kl": 0.0523681640625, + "learning_rate": 9.487499999999999e-07, + "loss": 0.0005, + "reward": 3.8015908002853394, + "reward_std": 0.057819752022624016, + "rewards/answer_entity_reward": 0.9672390222549438, + "rewards/answer_wer_reward": 0.8474858105182648, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9868659377098083, + "step": 42 + }, + { + "completion_length": 215.53125, + "epoch": 0.1376, + "grad_norm": 16.041494369506836, + "kl": 0.0418701171875, + "learning_rate": 9.474999999999999e-07, + "loss": 0.0004, + "reward": 3.730579137802124, + "reward_std": 0.11731705069541931, + "rewards/answer_entity_reward": 0.9560448527336121, + "rewards/answer_wer_reward": 0.8699329495429993, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9046012759208679, + "step": 43 + }, + { + "completion_length": 236.78125, + "epoch": 0.1408, + "grad_norm": 1.6949574947357178, + "kl": 0.0352783203125, + "learning_rate": 9.462499999999999e-07, + "loss": 0.0004, + "reward": 3.899806261062622, + "reward_std": 0.018219145480543375, + "rewards/answer_entity_reward": 0.9738267660140991, + "rewards/answer_wer_reward": 0.9316939115524292, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9942856431007385, + "step": 44 + }, + { + "completion_length": 246.4375, + "epoch": 0.144, + "grad_norm": 1.3507007360458374, + "kl": 0.0330810546875, + "learning_rate": 9.45e-07, + "loss": 0.0003, + "reward": 3.8328453302383423, + "reward_std": 0.06314087565988302, + "rewards/answer_entity_reward": 0.9711392819881439, + "rewards/answer_wer_reward": 0.8670938909053802, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.994612067937851, + "step": 45 + }, + { + "completion_length": 170.28125, + "epoch": 0.1472, + "grad_norm": 2.2585864067077637, + "kl": 0.077392578125, + "learning_rate": 9.4375e-07, + "loss": 0.0008, + "reward": 3.902386784553528, + "reward_std": 0.035709235817193985, + "rewards/answer_entity_reward": 0.9873873591423035, + "rewards/answer_wer_reward": 0.9353838264942169, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9796155691146851, + "step": 46 + }, + { + "completion_length": 149.8125, + "epoch": 0.1504, + "grad_norm": 4.581851005554199, + "kl": 0.0452880859375, + "learning_rate": 9.425e-07, + "loss": 0.0005, + "reward": 3.6548960208892822, + "reward_std": 0.06261088512837887, + "rewards/answer_entity_reward": 0.9477430880069733, + "rewards/answer_wer_reward": 0.8129006922245026, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8942522406578064, + "step": 47 + }, + { + "completion_length": 216.75, + "epoch": 0.1536, + "grad_norm": 47.897464752197266, + "kl": 0.3621826171875, + "learning_rate": 9.4125e-07, + "loss": 0.0036, + "reward": 3.906231164932251, + "reward_std": 0.034966002218425274, + "rewards/answer_entity_reward": 0.9823353588581085, + "rewards/answer_wer_reward": 0.9293725490570068, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9945231974124908, + "step": 48 + }, + { + "completion_length": 196.9375, + "epoch": 0.1568, + "grad_norm": 2.257028579711914, + "kl": 0.0465087890625, + "learning_rate": 9.399999999999999e-07, + "loss": 0.0005, + "reward": 3.8652477264404297, + "reward_std": 0.03087481390684843, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9092975854873657, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.958791047334671, + "step": 49 + }, + { + "completion_length": 196.9375, + "epoch": 0.16, + "grad_norm": 4.950622081756592, + "kl": 0.0345458984375, + "learning_rate": 9.387499999999999e-07, + "loss": 0.0003, + "reward": 3.824746251106262, + "reward_std": 0.058218397200107574, + "rewards/answer_entity_reward": 0.9825757443904877, + "rewards/answer_wer_reward": 0.9601459503173828, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8820245563983917, + "step": 50 + }, + { + "completion_length": 174.8125, + "epoch": 0.1632, + "grad_norm": 7.211401462554932, + "kl": 0.0582275390625, + "learning_rate": 9.374999999999999e-07, + "loss": 0.0006, + "reward": 3.8160147666931152, + "reward_std": 0.04299969598650932, + "rewards/answer_entity_reward": 0.9790209829807281, + "rewards/answer_wer_reward": 0.9350173771381378, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.901976466178894, + "step": 51 + }, + { + "completion_length": 248.125, + "epoch": 0.1664, + "grad_norm": 0.9922041893005371, + "kl": 0.0201416015625, + "learning_rate": 9.3625e-07, + "loss": 0.0002, + "reward": 3.8918874263763428, + "reward_std": 0.029974642675369978, + "rewards/answer_entity_reward": 0.9869123697280884, + "rewards/answer_wer_reward": 0.9067046940326691, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9982704818248749, + "step": 52 + }, + { + "completion_length": 251.59375, + "epoch": 0.1696, + "grad_norm": 0.9144994020462036, + "kl": 0.02001953125, + "learning_rate": 9.35e-07, + "loss": 0.0002, + "reward": 3.782878875732422, + "reward_std": 0.04338405467569828, + "rewards/answer_entity_reward": 0.9685876965522766, + "rewards/answer_wer_reward": 0.8232664167881012, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9910247623920441, + "step": 53 + }, + { + "completion_length": 224.5625, + "epoch": 0.1728, + "grad_norm": 0.8014624118804932, + "kl": 0.01904296875, + "learning_rate": 9.3375e-07, + "loss": 0.0002, + "reward": 3.804163098335266, + "reward_std": 0.02029208466410637, + "rewards/answer_entity_reward": 0.9539299309253693, + "rewards/answer_wer_reward": 0.8539278209209442, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9963052868843079, + "step": 54 + }, + { + "completion_length": 174.59375, + "epoch": 0.176, + "grad_norm": 2.5315935611724854, + "kl": 0.02862548828125, + "learning_rate": 9.325e-07, + "loss": 0.0003, + "reward": 3.8737215995788574, + "reward_std": 0.06625958904623985, + "rewards/answer_entity_reward": 0.9887503385543823, + "rewards/answer_wer_reward": 0.9215180277824402, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9634531438350677, + "step": 55 + }, + { + "completion_length": 239.4375, + "epoch": 0.1792, + "grad_norm": 1.3654975891113281, + "kl": 0.0283203125, + "learning_rate": 9.3125e-07, + "loss": 0.0003, + "reward": 3.8753963708877563, + "reward_std": 0.04764867387712002, + "rewards/answer_entity_reward": 0.9810132682323456, + "rewards/answer_wer_reward": 0.8943831324577332, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 56 + }, + { + "completion_length": 214.75, + "epoch": 0.1824, + "grad_norm": 1.4159584045410156, + "kl": 0.02081298828125, + "learning_rate": 9.3e-07, + "loss": 0.0002, + "reward": 3.8986427783966064, + "reward_std": 0.031265249475836754, + "rewards/answer_entity_reward": 0.9880681931972504, + "rewards/answer_wer_reward": 0.9130412340164185, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.99753338098526, + "step": 57 + }, + { + "completion_length": 240.46875, + "epoch": 0.1856, + "grad_norm": 1.1824144124984741, + "kl": 0.015960693359375, + "learning_rate": 9.287499999999999e-07, + "loss": 0.0002, + "reward": 3.90795361995697, + "reward_std": 0.02096135076135397, + "rewards/answer_entity_reward": 0.9983552694320679, + "rewards/answer_wer_reward": 0.9095984101295471, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 58 + }, + { + "completion_length": 238.09375, + "epoch": 0.1888, + "grad_norm": 1.165099024772644, + "kl": 0.026123046875, + "learning_rate": 9.274999999999999e-07, + "loss": 0.0003, + "reward": 3.9033310413360596, + "reward_std": 0.03423699922859669, + "rewards/answer_entity_reward": 0.9810605943202972, + "rewards/answer_wer_reward": 0.9234386384487152, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988317787647247, + "step": 59 + }, + { + "completion_length": 221.84375, + "epoch": 0.192, + "grad_norm": 2.964642286300659, + "kl": 0.02587890625, + "learning_rate": 9.2625e-07, + "loss": 0.0003, + "reward": 3.9065024852752686, + "reward_std": 0.022342820651829243, + "rewards/answer_entity_reward": 0.978426069021225, + "rewards/answer_wer_reward": 0.9289742708206177, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991020858287811, + "step": 60 + }, + { + "completion_length": 211.1875, + "epoch": 0.1952, + "grad_norm": 2.225137233734131, + "kl": 0.0374755859375, + "learning_rate": 9.25e-07, + "loss": 0.0004, + "reward": 3.6701877117156982, + "reward_std": 0.03641202859580517, + "rewards/answer_entity_reward": 0.9796620309352875, + "rewards/answer_wer_reward": 0.7723922729492188, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9181334376335144, + "step": 61 + }, + { + "completion_length": 150.0, + "epoch": 0.1984, + "grad_norm": 4.289616584777832, + "kl": 0.062744140625, + "learning_rate": 9.237499999999999e-07, + "loss": 0.0006, + "reward": 3.769058585166931, + "reward_std": 0.060237159952521324, + "rewards/answer_entity_reward": 0.842234879732132, + "rewards/answer_wer_reward": 0.9324747323989868, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9943490326404572, + "step": 62 + }, + { + "completion_length": 172.59375, + "epoch": 0.2016, + "grad_norm": 0.9226670861244202, + "kl": 0.04541015625, + "learning_rate": 9.225e-07, + "loss": 0.0005, + "reward": 3.9475854635238647, + "reward_std": 0.009972278494387865, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9488748908042908, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987106323242188, + "step": 63 + }, + { + "completion_length": 186.28125, + "epoch": 0.2048, + "grad_norm": 2.8787524700164795, + "kl": 0.02923583984375, + "learning_rate": 9.2125e-07, + "loss": 0.0003, + "reward": 3.8407578468322754, + "reward_std": 0.04633911233395338, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9414158165454865, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8993421196937561, + "step": 64 + }, + { + "completion_length": 235.21875, + "epoch": 0.208, + "grad_norm": 3.289802074432373, + "kl": 0.02203369140625, + "learning_rate": 9.2e-07, + "loss": 0.0002, + "reward": 3.8516111373901367, + "reward_std": 0.05013709142804146, + "rewards/answer_entity_reward": 0.9782106876373291, + "rewards/answer_wer_reward": 0.8967941999435425, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9766062498092651, + "step": 65 + }, + { + "completion_length": 182.9375, + "epoch": 0.2112, + "grad_norm": 15.17410659790039, + "kl": 0.079833984375, + "learning_rate": 9.187499999999999e-07, + "loss": 0.0008, + "reward": 3.7952799797058105, + "reward_std": 0.08191402442753315, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.9461319446563721, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8543927371501923, + "step": 66 + }, + { + "completion_length": 195.3125, + "epoch": 0.2144, + "grad_norm": 1.6663379669189453, + "kl": 0.0638427734375, + "learning_rate": 9.174999999999999e-07, + "loss": 0.0006, + "reward": 3.916337490081787, + "reward_std": 0.018936872947961092, + "rewards/answer_entity_reward": 0.9955128133296967, + "rewards/answer_wer_reward": 0.9398471117019653, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.980977475643158, + "step": 67 + }, + { + "completion_length": 211.84375, + "epoch": 0.2176, + "grad_norm": 2.6255111694335938, + "kl": 0.05126953125, + "learning_rate": 9.1625e-07, + "loss": 0.0005, + "reward": 3.9224915504455566, + "reward_std": 0.01644316827878356, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9280897378921509, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.997242659330368, + "step": 68 + }, + { + "completion_length": 170.65625, + "epoch": 0.2208, + "grad_norm": 3.3114447593688965, + "kl": 0.0849609375, + "learning_rate": 9.15e-07, + "loss": 0.0009, + "reward": 3.801788806915283, + "reward_std": 0.07587217539548874, + "rewards/answer_entity_reward": 0.9663097262382507, + "rewards/answer_wer_reward": 0.9007239937782288, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9347550868988037, + "step": 69 + }, + { + "completion_length": 194.0, + "epoch": 0.224, + "grad_norm": 0.908227264881134, + "kl": 0.0428466796875, + "learning_rate": 9.137499999999999e-07, + "loss": 0.0004, + "reward": 3.908014178276062, + "reward_std": 0.015611772891134024, + "rewards/answer_entity_reward": 0.9866071343421936, + "rewards/answer_wer_reward": 0.9214071035385132, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 70 + }, + { + "completion_length": 235.15625, + "epoch": 0.2272, + "grad_norm": 6.288023471832275, + "kl": 0.0377197265625, + "learning_rate": 9.124999999999999e-07, + "loss": 0.0004, + "reward": 3.8232322931289673, + "reward_std": 0.019494441337883472, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9413564205169678, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8853481709957123, + "step": 71 + }, + { + "completion_length": 202.84375, + "epoch": 0.2304, + "grad_norm": 3.666252374649048, + "kl": 0.02703857421875, + "learning_rate": 9.1125e-07, + "loss": 0.0003, + "reward": 3.8724911212921143, + "reward_std": 0.036418632604181767, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9379763305187225, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.934514731168747, + "step": 72 + }, + { + "completion_length": 192.59375, + "epoch": 0.2336, + "grad_norm": 2.5703845024108887, + "kl": 0.04815673828125, + "learning_rate": 9.1e-07, + "loss": 0.0005, + "reward": 3.819400668144226, + "reward_std": 0.09702013805508614, + "rewards/answer_entity_reward": 0.9749708473682404, + "rewards/answer_wer_reward": 0.8958881497383118, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9485417604446411, + "step": 73 + }, + { + "completion_length": 233.96875, + "epoch": 0.2368, + "grad_norm": 5.079833030700684, + "kl": 0.03594970703125, + "learning_rate": 9.087499999999999e-07, + "loss": 0.0004, + "reward": 3.87298047542572, + "reward_std": 0.04117333237081766, + "rewards/answer_entity_reward": 0.979208379983902, + "rewards/answer_wer_reward": 0.8985798060894012, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.995192289352417, + "step": 74 + }, + { + "completion_length": 232.09375, + "epoch": 0.24, + "grad_norm": 1.3709529638290405, + "kl": 0.0469970703125, + "learning_rate": 9.074999999999999e-07, + "loss": 0.0005, + "reward": 3.8842471837997437, + "reward_std": 0.02406489010900259, + "rewards/answer_entity_reward": 0.976262629032135, + "rewards/answer_wer_reward": 0.9083134233951569, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996710419654846, + "step": 75 + }, + { + "completion_length": 134.46875, + "epoch": 0.2432, + "grad_norm": 1.7917073965072632, + "kl": 0.04345703125, + "learning_rate": 9.0625e-07, + "loss": 0.0004, + "reward": 3.9434739351272583, + "reward_std": 0.03165043890476227, + "rewards/answer_entity_reward": 0.9853896200656891, + "rewards/answer_wer_reward": 0.960752934217453, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9973313212394714, + "step": 76 + }, + { + "completion_length": 260.75, + "epoch": 0.2464, + "grad_norm": 2.487206220626831, + "kl": 0.02789306640625, + "learning_rate": 9.05e-07, + "loss": 0.0003, + "reward": 3.8149930238723755, + "reward_std": 0.04638839513063431, + "rewards/answer_entity_reward": 0.9494674503803253, + "rewards/answer_wer_reward": 0.8663396835327148, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991858303546906, + "step": 77 + }, + { + "completion_length": 221.3125, + "epoch": 0.2496, + "grad_norm": 1.8767852783203125, + "kl": 0.017425537109375, + "learning_rate": 9.0375e-07, + "loss": 0.0002, + "reward": 3.8600170612335205, + "reward_std": 0.04895954905077815, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.8933806419372559, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9805253744125366, + "step": 78 + }, + { + "completion_length": 230.71875, + "epoch": 0.2528, + "grad_norm": 3.712688684463501, + "kl": 0.054931640625, + "learning_rate": 9.024999999999999e-07, + "loss": 0.0005, + "reward": 3.8847248554229736, + "reward_std": 0.012873267754912376, + "rewards/answer_entity_reward": 0.9855768978595734, + "rewards/answer_wer_reward": 0.9019420742988586, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9972057938575745, + "step": 79 + }, + { + "completion_length": 199.3125, + "epoch": 0.256, + "grad_norm": 1.9246958494186401, + "kl": 0.054931640625, + "learning_rate": 9.0125e-07, + "loss": 0.0005, + "reward": 3.8006842136383057, + "reward_std": 0.052133604884147644, + "rewards/answer_entity_reward": 0.9955128133296967, + "rewards/answer_wer_reward": 0.9017785787582397, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9033928513526917, + "step": 80 + }, + { + "completion_length": 250.21875, + "epoch": 0.2592, + "grad_norm": 1.160876989364624, + "kl": 0.0220947265625, + "learning_rate": 9e-07, + "loss": 0.0002, + "reward": 3.8708144426345825, + "reward_std": 0.030466954689472914, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.8790038824081421, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980604648590088, + "step": 81 + }, + { + "completion_length": 237.125, + "epoch": 0.2624, + "grad_norm": 5.024093151092529, + "kl": 0.0382080078125, + "learning_rate": 8.9875e-07, + "loss": 0.0004, + "reward": 3.9048351049423218, + "reward_std": 0.03107828088104725, + "rewards/answer_entity_reward": 0.9851398468017578, + "rewards/answer_wer_reward": 0.9344828426837921, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9852123558521271, + "step": 82 + }, + { + "completion_length": 222.5, + "epoch": 0.2656, + "grad_norm": 1.6519030332565308, + "kl": 0.0380859375, + "learning_rate": 8.974999999999999e-07, + "loss": 0.0004, + "reward": 3.863801956176758, + "reward_std": 0.030243747401982546, + "rewards/answer_entity_reward": 0.9727078676223755, + "rewards/answer_wer_reward": 0.9002127051353455, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9908813536167145, + "step": 83 + }, + { + "completion_length": 225.53125, + "epoch": 0.2688, + "grad_norm": 1.4793689250946045, + "kl": 0.0517578125, + "learning_rate": 8.9625e-07, + "loss": 0.0005, + "reward": 3.8814769983291626, + "reward_std": 0.029270809143781662, + "rewards/answer_entity_reward": 0.9880681931972504, + "rewards/answer_wer_reward": 0.8934087753295898, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 84 + }, + { + "completion_length": 235.9375, + "epoch": 0.272, + "grad_norm": 1.597517728805542, + "kl": 0.1016845703125, + "learning_rate": 8.95e-07, + "loss": 0.001, + "reward": 3.8768863677978516, + "reward_std": 0.03502520266920328, + "rewards/answer_entity_reward": 0.9798878133296967, + "rewards/answer_wer_reward": 0.8985857367515564, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984128177165985, + "step": 85 + }, + { + "completion_length": 214.4375, + "epoch": 0.2752, + "grad_norm": 4.483051300048828, + "kl": 0.04150390625, + "learning_rate": 8.9375e-07, + "loss": 0.0004, + "reward": 3.903320074081421, + "reward_std": 0.019831405603326857, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9384645223617554, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9648554623126984, + "step": 86 + }, + { + "completion_length": 217.3125, + "epoch": 0.2784, + "grad_norm": 2.5979843139648438, + "kl": 0.0279541015625, + "learning_rate": 8.924999999999999e-07, + "loss": 0.0003, + "reward": 3.8643628358840942, + "reward_std": 0.07706086616963148, + "rewards/answer_entity_reward": 0.9751845002174377, + "rewards/answer_wer_reward": 0.9189748764038086, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9702034890651703, + "step": 87 + }, + { + "completion_length": 209.0625, + "epoch": 0.2816, + "grad_norm": 2.134483575820923, + "kl": 0.0654296875, + "learning_rate": 8.912499999999999e-07, + "loss": 0.0007, + "reward": 3.829586148262024, + "reward_std": 0.11678730137646198, + "rewards/answer_entity_reward": 0.9327990114688873, + "rewards/answer_wer_reward": 0.9185277223587036, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9782594740390778, + "step": 88 + }, + { + "completion_length": 202.5625, + "epoch": 0.2848, + "grad_norm": 2.750098943710327, + "kl": 0.0386962890625, + "learning_rate": 8.9e-07, + "loss": 0.0004, + "reward": 3.813106060028076, + "reward_std": 0.013170521473512053, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.8155100047588348, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9975961446762085, + "step": 89 + }, + { + "completion_length": 208.21875, + "epoch": 0.288, + "grad_norm": 1.0419001579284668, + "kl": 0.02874755859375, + "learning_rate": 8.8875e-07, + "loss": 0.0003, + "reward": 3.7984471321105957, + "reward_std": 0.046625567600131035, + "rewards/answer_entity_reward": 0.9813492298126221, + "rewards/answer_wer_reward": 0.908283531665802, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9088144898414612, + "step": 90 + }, + { + "completion_length": 240.875, + "epoch": 0.2912, + "grad_norm": 1.406315565109253, + "kl": 0.0322265625, + "learning_rate": 8.874999999999999e-07, + "loss": 0.0003, + "reward": 3.917527914047241, + "reward_std": 0.018682857509702444, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.919611245393753, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 91 + }, + { + "completion_length": 248.28125, + "epoch": 0.2944, + "grad_norm": 0.9986963868141174, + "kl": 0.034912109375, + "learning_rate": 8.8625e-07, + "loss": 0.0003, + "reward": 3.8824074268341064, + "reward_std": 0.027639332227408886, + "rewards/answer_entity_reward": 0.9829497039318085, + "rewards/answer_wer_reward": 0.8998689651489258, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999588817358017, + "step": 92 + }, + { + "completion_length": 166.84375, + "epoch": 0.2976, + "grad_norm": 1.9086061716079712, + "kl": 0.03448486328125, + "learning_rate": 8.85e-07, + "loss": 0.0003, + "reward": 3.9501060247421265, + "reward_std": 0.012802016455680132, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9628694355487823, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9872365295886993, + "step": 93 + }, + { + "completion_length": 256.1875, + "epoch": 0.3008, + "grad_norm": 3.4043421745300293, + "kl": 0.049072265625, + "learning_rate": 8.8375e-07, + "loss": 0.0005, + "reward": 3.814915657043457, + "reward_std": 0.03222915716469288, + "rewards/answer_entity_reward": 0.9890734255313873, + "rewards/answer_wer_reward": 0.8261894881725311, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999652773141861, + "step": 94 + }, + { + "completion_length": 253.4375, + "epoch": 0.304, + "grad_norm": 0.9184324741363525, + "kl": 0.03564453125, + "learning_rate": 8.824999999999999e-07, + "loss": 0.0004, + "reward": 3.8896020650863647, + "reward_std": 0.02269437536597252, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.8971993029117584, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9952436983585358, + "step": 95 + }, + { + "completion_length": 202.15625, + "epoch": 0.3072, + "grad_norm": 12.922323226928711, + "kl": 0.05908203125, + "learning_rate": 8.812499999999999e-07, + "loss": 0.0006, + "reward": 3.9009629487991333, + "reward_std": 0.0202713580802083, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9189554452896118, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9820075929164886, + "step": 96 + }, + { + "completion_length": 224.53125, + "epoch": 0.3104, + "grad_norm": 4.217601299285889, + "kl": 0.0465087890625, + "learning_rate": 8.799999999999999e-07, + "loss": 0.0005, + "reward": 3.8913207054138184, + "reward_std": 0.014381649438291788, + "rewards/answer_entity_reward": 0.9821428656578064, + "rewards/answer_wer_reward": 0.9095685184001923, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996093809604645, + "step": 97 + }, + { + "completion_length": 206.40625, + "epoch": 0.3136, + "grad_norm": 2.168041706085205, + "kl": 0.0323486328125, + "learning_rate": 8.7875e-07, + "loss": 0.0003, + "reward": 3.8137295246124268, + "reward_std": 0.06389336660504341, + "rewards/answer_entity_reward": 0.9776169061660767, + "rewards/answer_wer_reward": 0.8989610075950623, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9371516108512878, + "step": 98 + }, + { + "completion_length": 209.0625, + "epoch": 0.3168, + "grad_norm": 1.6052436828613281, + "kl": 0.0345458984375, + "learning_rate": 8.774999999999999e-07, + "loss": 0.0003, + "reward": 3.828700304031372, + "reward_std": 0.019330056384205818, + "rewards/answer_entity_reward": 0.9850388169288635, + "rewards/answer_wer_reward": 0.846589595079422, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9970719814300537, + "step": 99 + }, + { + "completion_length": 210.6875, + "epoch": 0.32, + "grad_norm": 0.9548845887184143, + "kl": 0.0341796875, + "learning_rate": 8.7625e-07, + "loss": 0.0003, + "reward": 3.9469358921051025, + "reward_std": 0.021241382230073214, + "rewards/answer_entity_reward": 0.9851641654968262, + "rewards/answer_wer_reward": 0.961771547794342, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 100 + }, + { + "completion_length": 214.28125, + "epoch": 0.3232, + "grad_norm": 2.8610620498657227, + "kl": 0.052734375, + "learning_rate": 8.75e-07, + "loss": 0.0005, + "reward": 3.806527853012085, + "reward_std": 0.04471902176737785, + "rewards/answer_entity_reward": 0.9853896200656891, + "rewards/answer_wer_reward": 0.8547504544258118, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.966387927532196, + "step": 101 + }, + { + "completion_length": 223.4375, + "epoch": 0.3264, + "grad_norm": 0.7780336141586304, + "kl": 0.034912109375, + "learning_rate": 8.7375e-07, + "loss": 0.0003, + "reward": 3.880792260169983, + "reward_std": 0.022754055447876453, + "rewards/answer_entity_reward": 0.989393949508667, + "rewards/answer_wer_reward": 0.8913983702659607, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 102 + }, + { + "completion_length": 231.0625, + "epoch": 0.3296, + "grad_norm": 1.3763070106506348, + "kl": 0.024444580078125, + "learning_rate": 8.725e-07, + "loss": 0.0003, + "reward": 3.929618239402771, + "reward_std": 0.012849014718085527, + "rewards/answer_entity_reward": 0.9983552694320679, + "rewards/answer_wer_reward": 0.9325020015239716, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987609684467316, + "step": 103 + }, + { + "completion_length": 268.96875, + "epoch": 0.3328, + "grad_norm": 1.7985624074935913, + "kl": 0.0289306640625, + "learning_rate": 8.712499999999999e-07, + "loss": 0.0003, + "reward": 3.888875961303711, + "reward_std": 0.027541114017367363, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.8925231993198395, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999193549156189, + "step": 104 + }, + { + "completion_length": 254.1875, + "epoch": 0.336, + "grad_norm": 18.920978546142578, + "kl": 0.027099609375, + "learning_rate": 8.699999999999999e-07, + "loss": 0.0003, + "reward": 3.860435366630554, + "reward_std": 0.030950906220823526, + "rewards/answer_entity_reward": 0.9734883308410645, + "rewards/answer_wer_reward": 0.8872724771499634, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996744692325592, + "step": 105 + }, + { + "completion_length": 163.53125, + "epoch": 0.3392, + "grad_norm": 2.867141008377075, + "kl": 0.03399658203125, + "learning_rate": 8.687499999999999e-07, + "loss": 0.0003, + "reward": 3.9226391315460205, + "reward_std": 0.023416020907461643, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.9473121762275696, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9866906106472015, + "step": 106 + }, + { + "completion_length": 230.5625, + "epoch": 0.3424, + "grad_norm": 1.7444649934768677, + "kl": 0.03302001953125, + "learning_rate": 8.675000000000001e-07, + "loss": 0.0003, + "reward": 3.9037901163101196, + "reward_std": 0.013123108074069023, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9062368869781494, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996366202831268, + "step": 107 + }, + { + "completion_length": 196.96875, + "epoch": 0.3456, + "grad_norm": 1.4596710205078125, + "kl": 0.0565185546875, + "learning_rate": 8.6625e-07, + "loss": 0.0006, + "reward": 3.927306890487671, + "reward_std": 0.017726238816976547, + "rewards/answer_entity_reward": 0.9847221970558167, + "rewards/answer_wer_reward": 0.9435714483261108, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990131855010986, + "step": 108 + }, + { + "completion_length": 204.03125, + "epoch": 0.3488, + "grad_norm": 21.111600875854492, + "kl": 0.259765625, + "learning_rate": 8.65e-07, + "loss": 0.0026, + "reward": 3.878751039505005, + "reward_std": 0.09589649271219969, + "rewards/answer_entity_reward": 0.9957579076290131, + "rewards/answer_wer_reward": 0.9333003461360931, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9496928453445435, + "step": 109 + }, + { + "completion_length": 213.25, + "epoch": 0.352, + "grad_norm": 5.349282264709473, + "kl": 0.0455322265625, + "learning_rate": 8.6375e-07, + "loss": 0.0005, + "reward": 3.862163782119751, + "reward_std": 0.031207362189888954, + "rewards/answer_entity_reward": 0.9892857372760773, + "rewards/answer_wer_reward": 0.9074709117412567, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9654072225093842, + "step": 110 + }, + { + "completion_length": 220.3125, + "epoch": 0.3552, + "grad_norm": 3.316596746444702, + "kl": 0.03369140625, + "learning_rate": 8.625e-07, + "loss": 0.0003, + "reward": 3.8875255584716797, + "reward_std": 0.03998068626970053, + "rewards/answer_entity_reward": 0.9902909696102142, + "rewards/answer_wer_reward": 0.9038136303424835, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9934210479259491, + "step": 111 + }, + { + "completion_length": 250.78125, + "epoch": 0.3584, + "grad_norm": 2.525360107421875, + "kl": 0.03515625, + "learning_rate": 8.612499999999999e-07, + "loss": 0.0003, + "reward": 3.8880720138549805, + "reward_std": 0.025330569595098495, + "rewards/answer_entity_reward": 0.9918486475944519, + "rewards/answer_wer_reward": 0.8981437385082245, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980796277523041, + "step": 112 + }, + { + "completion_length": 220.09375, + "epoch": 0.3616, + "grad_norm": 5.7261433601379395, + "kl": 0.038330078125, + "learning_rate": 8.599999999999999e-07, + "loss": 0.0004, + "reward": 3.873054027557373, + "reward_std": 0.018459735438227654, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.8848404586315155, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9978289306163788, + "step": 113 + }, + { + "completion_length": 233.625, + "epoch": 0.3648, + "grad_norm": 2.1468665599823, + "kl": 0.0286865234375, + "learning_rate": 8.587499999999999e-07, + "loss": 0.0003, + "reward": 3.9267923831939697, + "reward_std": 0.026638164184987545, + "rewards/answer_entity_reward": 0.993686854839325, + "rewards/answer_wer_reward": 0.9341540634632111, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989514350891113, + "step": 114 + }, + { + "completion_length": 237.375, + "epoch": 0.368, + "grad_norm": 14.322599411010742, + "kl": 0.04052734375, + "learning_rate": 8.575e-07, + "loss": 0.0004, + "reward": 3.9121710062026978, + "reward_std": 0.02902364358305931, + "rewards/answer_entity_reward": 0.9908459782600403, + "rewards/answer_wer_reward": 0.922933429479599, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983916878700256, + "step": 115 + }, + { + "completion_length": 240.15625, + "epoch": 0.3712, + "grad_norm": 2.0209200382232666, + "kl": 0.06103515625, + "learning_rate": 8.5625e-07, + "loss": 0.0006, + "reward": 3.888006567955017, + "reward_std": 0.023146681487560272, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.895849883556366, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.994560569524765, + "step": 116 + }, + { + "completion_length": 222.4375, + "epoch": 0.3744, + "grad_norm": 2.933910608291626, + "kl": 0.0419921875, + "learning_rate": 8.55e-07, + "loss": 0.0004, + "reward": 3.8359127044677734, + "reward_std": 0.058022117242217064, + "rewards/answer_entity_reward": 0.9440500438213348, + "rewards/answer_wer_reward": 0.894202709197998, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9976600110530853, + "step": 117 + }, + { + "completion_length": 214.5625, + "epoch": 0.3776, + "grad_norm": 7.493628025054932, + "kl": 0.064453125, + "learning_rate": 8.5375e-07, + "loss": 0.0006, + "reward": 3.799570918083191, + "reward_std": 0.06657508388161659, + "rewards/answer_entity_reward": 0.9727430641651154, + "rewards/answer_wer_reward": 0.871229887008667, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9555979073047638, + "step": 118 + }, + { + "completion_length": 212.625, + "epoch": 0.3808, + "grad_norm": 2.1899421215057373, + "kl": 0.0570068359375, + "learning_rate": 8.525e-07, + "loss": 0.0006, + "reward": 3.9054840803146362, + "reward_std": 0.027329989708960056, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9344967901706696, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9795099198818207, + "step": 119 + }, + { + "completion_length": 249.8125, + "epoch": 0.384, + "grad_norm": 2.4804491996765137, + "kl": 0.035888671875, + "learning_rate": 8.512499999999999e-07, + "loss": 0.0004, + "reward": 3.8948739767074585, + "reward_std": 0.028746116440743208, + "rewards/answer_entity_reward": 0.9953208565711975, + "rewards/answer_wer_reward": 0.9002179205417633, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993351101875305, + "step": 120 + }, + { + "completion_length": 185.34375, + "epoch": 0.3872, + "grad_norm": 2.305140256881714, + "kl": 0.102783203125, + "learning_rate": 8.499999999999999e-07, + "loss": 0.001, + "reward": 3.9010980129241943, + "reward_std": 0.021339962724596262, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9222235083580017, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9788744449615479, + "step": 121 + }, + { + "completion_length": 204.65625, + "epoch": 0.3904, + "grad_norm": 1.5420470237731934, + "kl": 0.0313720703125, + "learning_rate": 8.487499999999999e-07, + "loss": 0.0003, + "reward": 3.927214741706848, + "reward_std": 0.019817203283309937, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.92842698097229, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987878203392029, + "step": 122 + }, + { + "completion_length": 216.90625, + "epoch": 0.3936, + "grad_norm": 8.852909088134766, + "kl": 0.0716552734375, + "learning_rate": 8.475e-07, + "loss": 0.0007, + "reward": 3.811018466949463, + "reward_std": 0.010543343145400286, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.938366711139679, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8726518452167511, + "step": 123 + }, + { + "completion_length": 257.75, + "epoch": 0.3968, + "grad_norm": 1.4971685409545898, + "kl": 0.0330810546875, + "learning_rate": 8.462499999999999e-07, + "loss": 0.0003, + "reward": 3.9272462129592896, + "reward_std": 0.01983210165053606, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9303403496742249, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989891648292542, + "step": 124 + }, + { + "completion_length": 207.4375, + "epoch": 0.4, + "grad_norm": 1.9963277578353882, + "kl": 0.056396484375, + "learning_rate": 8.45e-07, + "loss": 0.0006, + "reward": 3.9006247520446777, + "reward_std": 0.030232679098844528, + "rewards/answer_entity_reward": 0.9941239356994629, + "rewards/answer_wer_reward": 0.9261119067668915, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9803889393806458, + "step": 125 + }, + { + "completion_length": 246.875, + "epoch": 0.4032, + "grad_norm": 1.1950430870056152, + "kl": 0.03369140625, + "learning_rate": 8.4375e-07, + "loss": 0.0003, + "reward": 3.881152391433716, + "reward_std": 0.03120280895382166, + "rewards/answer_entity_reward": 0.9683753550052643, + "rewards/answer_wer_reward": 0.9131445586681366, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996323585510254, + "step": 126 + }, + { + "completion_length": 212.15625, + "epoch": 0.4064, + "grad_norm": 4.167364120483398, + "kl": 0.257568359375, + "learning_rate": 8.425e-07, + "loss": 0.0026, + "reward": 3.891525626182556, + "reward_std": 0.03758985735476017, + "rewards/answer_entity_reward": 0.9853896200656891, + "rewards/answer_wer_reward": 0.9100889563560486, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9960470795631409, + "step": 127 + }, + { + "completion_length": 215.5625, + "epoch": 0.4096, + "grad_norm": 1.2758169174194336, + "kl": 0.059326171875, + "learning_rate": 8.4125e-07, + "loss": 0.0006, + "reward": 3.8984569311141968, + "reward_std": 0.02103353524580598, + "rewards/answer_entity_reward": 0.987500011920929, + "rewards/answer_wer_reward": 0.9310561716556549, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9799006283283234, + "step": 128 + }, + { + "completion_length": 221.0, + "epoch": 0.4128, + "grad_norm": 1.6011369228363037, + "kl": 0.02734375, + "learning_rate": 8.399999999999999e-07, + "loss": 0.0003, + "reward": 3.907585859298706, + "reward_std": 0.024174046237021685, + "rewards/answer_entity_reward": 0.9887152910232544, + "rewards/answer_wer_reward": 0.9191110134124756, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997596144676208, + "step": 129 + }, + { + "completion_length": 189.375, + "epoch": 0.416, + "grad_norm": 2.7846839427948, + "kl": 0.0413818359375, + "learning_rate": 8.387499999999999e-07, + "loss": 0.0004, + "reward": 3.8641178607940674, + "reward_std": 0.03212345764040947, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.9255104064941406, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9438523054122925, + "step": 130 + }, + { + "completion_length": 209.5, + "epoch": 0.4192, + "grad_norm": 4.144553184509277, + "kl": 0.0548095703125, + "learning_rate": 8.375e-07, + "loss": 0.0006, + "reward": 3.8618308305740356, + "reward_std": 0.07612445950508118, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9306082725524902, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9312225580215454, + "step": 131 + }, + { + "completion_length": 198.5, + "epoch": 0.4224, + "grad_norm": 2.663985013961792, + "kl": 0.04052734375, + "learning_rate": 8.3625e-07, + "loss": 0.0004, + "reward": 3.897012948989868, + "reward_std": 0.030758653301745653, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9326047897338867, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9782971143722534, + "step": 132 + }, + { + "completion_length": 180.78125, + "epoch": 0.4256, + "grad_norm": 2.2100954055786133, + "kl": 0.0439453125, + "learning_rate": 8.349999999999999e-07, + "loss": 0.0004, + "reward": 3.923304557800293, + "reward_std": 0.025213422253727913, + "rewards/answer_entity_reward": 0.9882478713989258, + "rewards/answer_wer_reward": 0.9360361397266388, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999020516872406, + "step": 133 + }, + { + "completion_length": 219.59375, + "epoch": 0.4288, + "grad_norm": 15.98015022277832, + "kl": 0.0645751953125, + "learning_rate": 8.3375e-07, + "loss": 0.0006, + "reward": 3.8721258640289307, + "reward_std": 0.02985560242086649, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9070867002010345, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9650392830371857, + "step": 134 + }, + { + "completion_length": 239.90625, + "epoch": 0.432, + "grad_norm": 3.754002332687378, + "kl": 0.0419921875, + "learning_rate": 8.325e-07, + "loss": 0.0004, + "reward": 3.8614091873168945, + "reward_std": 0.0724228248000145, + "rewards/answer_entity_reward": 0.9794008135795593, + "rewards/answer_wer_reward": 0.9043296277523041, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9776785373687744, + "step": 135 + }, + { + "completion_length": 228.09375, + "epoch": 0.4352, + "grad_norm": 2.609844207763672, + "kl": 0.037841796875, + "learning_rate": 8.3125e-07, + "loss": 0.0004, + "reward": 3.8617947101593018, + "reward_std": 0.021692313253879547, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.8795575797557831, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.996126115322113, + "step": 136 + }, + { + "completion_length": 158.8125, + "epoch": 0.4384, + "grad_norm": 1.6180543899536133, + "kl": 0.055419921875, + "learning_rate": 8.299999999999999e-07, + "loss": 0.0005, + "reward": 3.9137951135635376, + "reward_std": 0.020158007740974426, + "rewards/answer_entity_reward": 0.970695972442627, + "rewards/answer_wer_reward": 0.9480262100696564, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9950730204582214, + "step": 137 + }, + { + "completion_length": 231.25, + "epoch": 0.4416, + "grad_norm": 0.9336134195327759, + "kl": 0.03399658203125, + "learning_rate": 8.287499999999999e-07, + "loss": 0.0003, + "reward": 3.9351539611816406, + "reward_std": 0.014509289292618632, + "rewards/answer_entity_reward": 0.9934294819831848, + "rewards/answer_wer_reward": 0.9442258775234222, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9974986016750336, + "step": 138 + }, + { + "completion_length": 220.34375, + "epoch": 0.4448, + "grad_norm": 21.355905532836914, + "kl": 0.059814453125, + "learning_rate": 8.275e-07, + "loss": 0.0006, + "reward": 3.863122820854187, + "reward_std": 0.060401469469070435, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9233364760875702, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9397862255573273, + "step": 139 + }, + { + "completion_length": 214.90625, + "epoch": 0.448, + "grad_norm": 1.280321478843689, + "kl": 0.052490234375, + "learning_rate": 8.2625e-07, + "loss": 0.0005, + "reward": 3.9231661558151245, + "reward_std": 0.009715312160551548, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9245247840881348, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998641312122345, + "step": 140 + }, + { + "completion_length": 211.375, + "epoch": 0.4512, + "grad_norm": 1.7492412328720093, + "kl": 0.062744140625, + "learning_rate": 8.249999999999999e-07, + "loss": 0.0006, + "reward": 3.88791024684906, + "reward_std": 0.011862037936225533, + "rewards/answer_entity_reward": 0.9832702279090881, + "rewards/answer_wer_reward": 0.957579493522644, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9470604658126831, + "step": 141 + }, + { + "completion_length": 246.3125, + "epoch": 0.4544, + "grad_norm": 2.37640118598938, + "kl": 0.0369873046875, + "learning_rate": 8.2375e-07, + "loss": 0.0004, + "reward": 3.944279909133911, + "reward_std": 0.011443465016782284, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9472803771495819, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9969995319843292, + "step": 142 + }, + { + "completion_length": 199.90625, + "epoch": 0.4576, + "grad_norm": 2.8359158039093018, + "kl": 0.0540771484375, + "learning_rate": 8.225e-07, + "loss": 0.0005, + "reward": 3.93644380569458, + "reward_std": 0.023367811925709248, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9554752707481384, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9893018305301666, + "step": 143 + }, + { + "completion_length": 195.28125, + "epoch": 0.4608, + "grad_norm": 1.723976731300354, + "kl": 0.031982421875, + "learning_rate": 8.2125e-07, + "loss": 0.0003, + "reward": 3.9411680698394775, + "reward_std": 0.007689078338444233, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.941936582326889, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992315769195557, + "step": 144 + }, + { + "completion_length": 223.375, + "epoch": 0.464, + "grad_norm": 1.08156418800354, + "kl": 0.02874755859375, + "learning_rate": 8.199999999999999e-07, + "loss": 0.0003, + "reward": 3.9059054851531982, + "reward_std": 0.007867377484217286, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9531411230564117, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9623797535896301, + "step": 145 + }, + { + "completion_length": 184.75, + "epoch": 0.4672, + "grad_norm": 1.7059741020202637, + "kl": 0.0400390625, + "learning_rate": 8.187499999999999e-07, + "loss": 0.0004, + "reward": 3.939697027206421, + "reward_std": 0.0070332614704966545, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9535529613494873, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9861441254615784, + "step": 146 + }, + { + "completion_length": 222.84375, + "epoch": 0.4704, + "grad_norm": 1.5283204317092896, + "kl": 0.072998046875, + "learning_rate": 8.175e-07, + "loss": 0.0007, + "reward": 3.843386173248291, + "reward_std": 0.02895416272804141, + "rewards/answer_entity_reward": 0.9304008483886719, + "rewards/answer_wer_reward": 0.9129853844642639, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 147 + }, + { + "completion_length": 165.25, + "epoch": 0.4736, + "grad_norm": 2.885890245437622, + "kl": 0.04193115234375, + "learning_rate": 8.1625e-07, + "loss": 0.0004, + "reward": 3.8639066219329834, + "reward_std": 0.01842296402901411, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.9352113604545593, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9339398741722107, + "step": 148 + }, + { + "completion_length": 225.8125, + "epoch": 0.4768, + "grad_norm": 1.5893429517745972, + "kl": 0.0615234375, + "learning_rate": 8.149999999999999e-07, + "loss": 0.0006, + "reward": 3.9009220600128174, + "reward_std": 0.022383708506822586, + "rewards/answer_entity_reward": 0.9967105388641357, + "rewards/answer_wer_reward": 0.9052460193634033, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989655911922455, + "step": 149 + }, + { + "completion_length": 236.21875, + "epoch": 0.48, + "grad_norm": 2.1324307918548584, + "kl": 0.0377197265625, + "learning_rate": 8.137499999999999e-07, + "loss": 0.0004, + "reward": 3.8904128074645996, + "reward_std": 0.02841739635914564, + "rewards/answer_entity_reward": 0.9930555820465088, + "rewards/answer_wer_reward": 0.8976494371891022, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997079372406006, + "step": 150 + }, + { + "completion_length": 213.15625, + "epoch": 0.4832, + "grad_norm": 0.9698525667190552, + "kl": 0.034423828125, + "learning_rate": 8.125e-07, + "loss": 0.0003, + "reward": 3.890373468399048, + "reward_std": 0.009532647207379341, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9459290206432343, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9444444477558136, + "step": 151 + }, + { + "completion_length": 250.21875, + "epoch": 0.4864, + "grad_norm": 4.16625452041626, + "kl": 0.198486328125, + "learning_rate": 8.1125e-07, + "loss": 0.002, + "reward": 3.8978230953216553, + "reward_std": 0.024048997554928064, + "rewards/answer_entity_reward": 0.987500011920929, + "rewards/answer_wer_reward": 0.9117782711982727, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985446929931641, + "step": 152 + }, + { + "completion_length": 174.15625, + "epoch": 0.4896, + "grad_norm": 2.9183833599090576, + "kl": 0.0716552734375, + "learning_rate": 8.1e-07, + "loss": 0.0007, + "reward": 3.908216118812561, + "reward_std": 0.032137976959347725, + "rewards/answer_entity_reward": 0.9895833432674408, + "rewards/answer_wer_reward": 0.9441157281398773, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9745170772075653, + "step": 153 + }, + { + "completion_length": 187.03125, + "epoch": 0.4928, + "grad_norm": 1.039563536643982, + "kl": 0.0535888671875, + "learning_rate": 8.087499999999999e-07, + "loss": 0.0005, + "reward": 3.940076231956482, + "reward_std": 0.014994107652455568, + "rewards/answer_entity_reward": 0.9910714626312256, + "rewards/answer_wer_reward": 0.9499542117118835, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990506172180176, + "step": 154 + }, + { + "completion_length": 214.125, + "epoch": 0.496, + "grad_norm": 2.49003267288208, + "kl": 0.0635986328125, + "learning_rate": 8.075e-07, + "loss": 0.0006, + "reward": 3.850375175476074, + "reward_std": 0.026249381713569164, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.8511867821216583, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991883039474487, + "step": 155 + }, + { + "completion_length": 214.8125, + "epoch": 0.4992, + "grad_norm": 2.7330820560455322, + "kl": 0.03717041015625, + "learning_rate": 8.0625e-07, + "loss": 0.0004, + "reward": 3.9070980548858643, + "reward_std": 0.04327901639044285, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.9249836802482605, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9934781193733215, + "step": 156 + }, + { + "completion_length": 195.0625, + "epoch": 0.5024, + "grad_norm": 2.878744602203369, + "kl": 0.0828857421875, + "learning_rate": 8.05e-07, + "loss": 0.0008, + "reward": 3.9139277935028076, + "reward_std": 0.022999857552349567, + "rewards/answer_entity_reward": 0.9947916567325592, + "rewards/answer_wer_reward": 0.9313595592975616, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9877763986587524, + "step": 157 + }, + { + "completion_length": 216.625, + "epoch": 0.5056, + "grad_norm": 1.1287983655929565, + "kl": 0.049072265625, + "learning_rate": 8.037499999999999e-07, + "loss": 0.0005, + "reward": 3.9037948846817017, + "reward_std": 0.011531218886375427, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9081907570362091, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9956042170524597, + "step": 158 + }, + { + "completion_length": 200.21875, + "epoch": 0.5088, + "grad_norm": 1.5555959939956665, + "kl": 0.0369873046875, + "learning_rate": 8.024999999999999e-07, + "loss": 0.0004, + "reward": 3.9110556840896606, + "reward_std": 0.019422957440838218, + "rewards/answer_entity_reward": 0.9941239356994629, + "rewards/answer_wer_reward": 0.9354503750801086, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9814814925193787, + "step": 159 + }, + { + "completion_length": 202.875, + "epoch": 0.512, + "grad_norm": 13.22675895690918, + "kl": 0.084228515625, + "learning_rate": 8.0125e-07, + "loss": 0.0008, + "reward": 3.8508609533309937, + "reward_std": 0.037849435582756996, + "rewards/answer_entity_reward": 0.9867424070835114, + "rewards/answer_wer_reward": 0.9194300472736359, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9446885287761688, + "step": 160 + }, + { + "completion_length": 187.5625, + "epoch": 0.5152, + "grad_norm": 1.9724727869033813, + "kl": 0.05126953125, + "learning_rate": 8e-07, + "loss": 0.0005, + "reward": 3.9261248111724854, + "reward_std": 0.02531399577856064, + "rewards/answer_entity_reward": 0.9882478713989258, + "rewards/answer_wer_reward": 0.9410728812217712, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9968039691448212, + "step": 161 + }, + { + "completion_length": 254.84375, + "epoch": 0.5184, + "grad_norm": 2.3500356674194336, + "kl": 0.05340576171875, + "learning_rate": 7.9875e-07, + "loss": 0.0005, + "reward": 3.910772919654846, + "reward_std": 0.04009111411869526, + "rewards/answer_entity_reward": 0.9747862815856934, + "rewards/answer_wer_reward": 0.9362366199493408, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999750018119812, + "step": 162 + }, + { + "completion_length": 206.625, + "epoch": 0.5216, + "grad_norm": 6.3654890060424805, + "kl": 0.069580078125, + "learning_rate": 7.975e-07, + "loss": 0.0007, + "reward": 3.805917978286743, + "reward_std": 0.052407728508114815, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9451808631420135, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8642092943191528, + "step": 163 + }, + { + "completion_length": 212.71875, + "epoch": 0.5248, + "grad_norm": 1.921622633934021, + "kl": 0.09283447265625, + "learning_rate": 7.9625e-07, + "loss": 0.0009, + "reward": 3.9235308170318604, + "reward_std": 0.022881922777742147, + "rewards/answer_entity_reward": 0.993686854839325, + "rewards/answer_wer_reward": 0.9401760995388031, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9896678924560547, + "step": 164 + }, + { + "completion_length": 234.5625, + "epoch": 0.528, + "grad_norm": 1.4160696268081665, + "kl": 0.061767578125, + "learning_rate": 7.95e-07, + "loss": 0.0006, + "reward": 3.890324354171753, + "reward_std": 0.014382836874574423, + "rewards/answer_entity_reward": 0.9653846025466919, + "rewards/answer_wer_reward": 0.9249398708343506, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 165 + }, + { + "completion_length": 223.0, + "epoch": 0.5312, + "grad_norm": 1.2775448560714722, + "kl": 0.0582275390625, + "learning_rate": 7.937499999999999e-07, + "loss": 0.0006, + "reward": 3.9478421211242676, + "reward_std": 0.011931413784623146, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9481260776519775, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997159242630005, + "step": 166 + }, + { + "completion_length": 214.65625, + "epoch": 0.5344, + "grad_norm": 1.287255883216858, + "kl": 0.052734375, + "learning_rate": 7.924999999999999e-07, + "loss": 0.0005, + "reward": 3.9042768478393555, + "reward_std": 0.02827941346913576, + "rewards/answer_entity_reward": 0.9787962138652802, + "rewards/answer_wer_reward": 0.925747811794281, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997329115867615, + "step": 167 + }, + { + "completion_length": 224.59375, + "epoch": 0.5376, + "grad_norm": 1.7952959537506104, + "kl": 0.0364990234375, + "learning_rate": 7.912499999999999e-07, + "loss": 0.0004, + "reward": 3.935611605644226, + "reward_std": 0.027386673726141453, + "rewards/answer_entity_reward": 0.9919143319129944, + "rewards/answer_wer_reward": 0.9439473152160645, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999750018119812, + "step": 168 + }, + { + "completion_length": 183.28125, + "epoch": 0.5408, + "grad_norm": 8.36503791809082, + "kl": 0.0848388671875, + "learning_rate": 7.9e-07, + "loss": 0.0008, + "reward": 3.8025405406951904, + "reward_std": 0.04630524106323719, + "rewards/answer_entity_reward": 0.9862637221813202, + "rewards/answer_wer_reward": 0.8270655274391174, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9892113208770752, + "step": 169 + }, + { + "completion_length": 235.15625, + "epoch": 0.544, + "grad_norm": 2.2816457748413086, + "kl": 0.0296630859375, + "learning_rate": 7.8875e-07, + "loss": 0.0003, + "reward": 3.934034824371338, + "reward_std": 0.009957955218851566, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9344717264175415, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995629191398621, + "step": 170 + }, + { + "completion_length": 247.53125, + "epoch": 0.5472, + "grad_norm": 1.6856052875518799, + "kl": 0.13134765625, + "learning_rate": 7.875e-07, + "loss": 0.0013, + "reward": 3.896223545074463, + "reward_std": 0.015339810401201248, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9109295010566711, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991828203201294, + "step": 171 + }, + { + "completion_length": 245.03125, + "epoch": 0.5504, + "grad_norm": 4.956347465515137, + "kl": 0.044921875, + "learning_rate": 7.8625e-07, + "loss": 0.0005, + "reward": 3.7271645069122314, + "reward_std": 0.21888091787695885, + "rewards/answer_entity_reward": 0.9630681872367859, + "rewards/answer_wer_reward": 0.8937070369720459, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9016393423080444, + "step": 172 + }, + { + "completion_length": 211.3125, + "epoch": 0.5536, + "grad_norm": 1.1714370250701904, + "kl": 0.0323486328125, + "learning_rate": 7.85e-07, + "loss": 0.0003, + "reward": 3.913045883178711, + "reward_std": 0.04143238253891468, + "rewards/answer_entity_reward": 0.9870130121707916, + "rewards/answer_wer_reward": 0.9331351518630981, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9928977191448212, + "step": 173 + }, + { + "completion_length": 272.1875, + "epoch": 0.5568, + "grad_norm": 1.2012341022491455, + "kl": 0.0413818359375, + "learning_rate": 7.837499999999999e-07, + "loss": 0.0004, + "reward": 3.876948356628418, + "reward_std": 0.03149130195379257, + "rewards/answer_entity_reward": 0.9889954328536987, + "rewards/answer_wer_reward": 0.9271560311317444, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9607969224452972, + "step": 174 + }, + { + "completion_length": 200.3125, + "epoch": 0.56, + "grad_norm": 2.998842477798462, + "kl": 0.067138671875, + "learning_rate": 7.824999999999999e-07, + "loss": 0.0007, + "reward": 3.8472641706466675, + "reward_std": 0.04471721313893795, + "rewards/answer_entity_reward": 0.9902146458625793, + "rewards/answer_wer_reward": 0.9358225166797638, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.921226978302002, + "step": 175 + }, + { + "completion_length": 207.03125, + "epoch": 0.5632, + "grad_norm": 10.961363792419434, + "kl": 0.0789794921875, + "learning_rate": 7.812499999999999e-07, + "loss": 0.0008, + "reward": 3.9478721618652344, + "reward_std": 0.027662259992212057, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9600406885147095, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9902353584766388, + "step": 176 + }, + { + "completion_length": 221.59375, + "epoch": 0.5664, + "grad_norm": 1.341109275817871, + "kl": 0.065185546875, + "learning_rate": 7.799999999999999e-07, + "loss": 0.0006, + "reward": 3.8582847118377686, + "reward_std": 0.041704089380800724, + "rewards/answer_entity_reward": 0.9775640964508057, + "rewards/answer_wer_reward": 0.9368657767772675, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9438548386096954, + "step": 177 + }, + { + "completion_length": 239.90625, + "epoch": 0.5696, + "grad_norm": 1.4057974815368652, + "kl": 0.045166015625, + "learning_rate": 7.787500000000001e-07, + "loss": 0.0005, + "reward": 3.9274110794067383, + "reward_std": 0.02352920500561595, + "rewards/answer_entity_reward": 0.9946895241737366, + "rewards/answer_wer_reward": 0.9349404275417328, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977810680866241, + "step": 178 + }, + { + "completion_length": 211.78125, + "epoch": 0.5728, + "grad_norm": 2.9184887409210205, + "kl": 0.031982421875, + "learning_rate": 7.775e-07, + "loss": 0.0003, + "reward": 3.945718765258789, + "reward_std": 0.01779081765562296, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9512039721012115, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9973557591438293, + "step": 179 + }, + { + "completion_length": 204.375, + "epoch": 0.576, + "grad_norm": 113.12403869628906, + "kl": 0.05322265625, + "learning_rate": 7.7625e-07, + "loss": 0.0005, + "reward": 3.8825124502182007, + "reward_std": 0.07031127344816923, + "rewards/answer_entity_reward": 0.9926734566688538, + "rewards/answer_wer_reward": 0.9367940425872803, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9530448913574219, + "step": 180 + }, + { + "completion_length": 214.75, + "epoch": 0.5792, + "grad_norm": 1.3515021800994873, + "kl": 0.0609130859375, + "learning_rate": 7.75e-07, + "loss": 0.0006, + "reward": 3.920071840286255, + "reward_std": 0.011316743912175298, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9225669503211975, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9975048303604126, + "step": 181 + }, + { + "completion_length": 205.0625, + "epoch": 0.5824, + "grad_norm": 1.5749711990356445, + "kl": 0.054443359375, + "learning_rate": 7.7375e-07, + "loss": 0.0005, + "reward": 3.921678900718689, + "reward_std": 0.013327162247151136, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9460242688655853, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9780585169792175, + "step": 182 + }, + { + "completion_length": 217.75, + "epoch": 0.5856, + "grad_norm": 0.7737219929695129, + "kl": 0.0469970703125, + "learning_rate": 7.724999999999999e-07, + "loss": 0.0005, + "reward": 3.9334832429885864, + "reward_std": 0.020406807772815228, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.938728004693985, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 183 + }, + { + "completion_length": 231.59375, + "epoch": 0.5888, + "grad_norm": 1.6825175285339355, + "kl": 0.0543212890625, + "learning_rate": 7.712499999999999e-07, + "loss": 0.0005, + "reward": 3.938681125640869, + "reward_std": 0.017365658190101385, + "rewards/answer_entity_reward": 0.9981617629528046, + "rewards/answer_wer_reward": 0.9413779377937317, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991414844989777, + "step": 184 + }, + { + "completion_length": 239.71875, + "epoch": 0.592, + "grad_norm": 1.3427449464797974, + "kl": 0.058837890625, + "learning_rate": 7.699999999999999e-07, + "loss": 0.0006, + "reward": 3.9066988229751587, + "reward_std": 0.020341036841273308, + "rewards/answer_entity_reward": 0.9776557087898254, + "rewards/answer_wer_reward": 0.929761528968811, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992816150188446, + "step": 185 + }, + { + "completion_length": 133.90625, + "epoch": 0.5952, + "grad_norm": 4.991705417633057, + "kl": 0.0623779296875, + "learning_rate": 7.6875e-07, + "loss": 0.0006, + "reward": 3.926753878593445, + "reward_std": 0.023914007004350424, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9629489779472351, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9638049006462097, + "step": 186 + }, + { + "completion_length": 234.625, + "epoch": 0.5984, + "grad_norm": 2.8712401390075684, + "kl": 0.096435546875, + "learning_rate": 7.675e-07, + "loss": 0.001, + "reward": 3.872377395629883, + "reward_std": 0.06525835767388344, + "rewards/answer_entity_reward": 0.9841803908348083, + "rewards/answer_wer_reward": 0.9093597233295441, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9788371920585632, + "step": 187 + }, + { + "completion_length": 225.4375, + "epoch": 0.6016, + "grad_norm": 2.3115170001983643, + "kl": 0.055419921875, + "learning_rate": 7.6625e-07, + "loss": 0.0006, + "reward": 3.9362770318984985, + "reward_std": 0.019690027460455894, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9422614872455597, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.997487872838974, + "step": 188 + }, + { + "completion_length": 214.15625, + "epoch": 0.6048, + "grad_norm": 3.583329677581787, + "kl": 0.0550537109375, + "learning_rate": 7.65e-07, + "loss": 0.0005, + "reward": 3.9327969551086426, + "reward_std": 0.014218965079635382, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9424121379852295, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 189 + }, + { + "completion_length": 249.15625, + "epoch": 0.608, + "grad_norm": 1.4651848077774048, + "kl": 0.052001953125, + "learning_rate": 7.6375e-07, + "loss": 0.0005, + "reward": 3.941069722175598, + "reward_std": 0.009663278236985207, + "rewards/answer_entity_reward": 0.9926470518112183, + "rewards/answer_wer_reward": 0.9507163166999817, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977063536643982, + "step": 190 + }, + { + "completion_length": 197.84375, + "epoch": 0.6112, + "grad_norm": 1.4688224792480469, + "kl": 0.0577392578125, + "learning_rate": 7.624999999999999e-07, + "loss": 0.0006, + "reward": 3.9300395250320435, + "reward_std": 0.014806594932451844, + "rewards/answer_entity_reward": 0.984722226858139, + "rewards/answer_wer_reward": 0.9455022215843201, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9998151063919067, + "step": 191 + }, + { + "completion_length": 254.6875, + "epoch": 0.6144, + "grad_norm": 1.1648938655853271, + "kl": 0.0589599609375, + "learning_rate": 7.612499999999999e-07, + "loss": 0.0006, + "reward": 3.9228453636169434, + "reward_std": 0.026355463080108166, + "rewards/answer_entity_reward": 0.9819444715976715, + "rewards/answer_wer_reward": 0.9418983161449432, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990026652812958, + "step": 192 + }, + { + "completion_length": 264.34375, + "epoch": 0.6176, + "grad_norm": 1.2595146894454956, + "kl": 0.0635986328125, + "learning_rate": 7.599999999999999e-07, + "loss": 0.0006, + "reward": 3.9068782329559326, + "reward_std": 0.02374061942100525, + "rewards/answer_entity_reward": 0.9758522510528564, + "rewards/answer_wer_reward": 0.9392839670181274, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9917418956756592, + "step": 193 + }, + { + "completion_length": 226.875, + "epoch": 0.6208, + "grad_norm": 3.0049514770507812, + "kl": 0.065185546875, + "learning_rate": 7.5875e-07, + "loss": 0.0007, + "reward": 3.9182554483413696, + "reward_std": 0.028174775652587414, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9239371716976166, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 194 + }, + { + "completion_length": 233.90625, + "epoch": 0.624, + "grad_norm": 3.6226987838745117, + "kl": 0.14013671875, + "learning_rate": 7.575e-07, + "loss": 0.0014, + "reward": 3.917691946029663, + "reward_std": 0.015854593832045794, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9359965324401855, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.99558424949646, + "step": 195 + }, + { + "completion_length": 228.96875, + "epoch": 0.6272, + "grad_norm": 3.1564576625823975, + "kl": 0.03131103515625, + "learning_rate": 7.5625e-07, + "loss": 0.0003, + "reward": 3.8988983631134033, + "reward_std": 0.04383570794016123, + "rewards/answer_entity_reward": 0.980654776096344, + "rewards/answer_wer_reward": 0.9372455775737762, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9809979796409607, + "step": 196 + }, + { + "completion_length": 235.875, + "epoch": 0.6304, + "grad_norm": 1.3267861604690552, + "kl": 0.052978515625, + "learning_rate": 7.55e-07, + "loss": 0.0005, + "reward": 3.9319225549697876, + "reward_std": 0.02372880419716239, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9346356689929962, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999690592288971, + "step": 197 + }, + { + "completion_length": 162.34375, + "epoch": 0.6336, + "grad_norm": 1.4438445568084717, + "kl": 0.065185546875, + "learning_rate": 7.5375e-07, + "loss": 0.0006, + "reward": 3.8535887002944946, + "reward_std": 0.041104525327682495, + "rewards/answer_entity_reward": 0.9681412279605865, + "rewards/answer_wer_reward": 0.9683326184749603, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9171150028705597, + "step": 198 + }, + { + "completion_length": 203.875, + "epoch": 0.6368, + "grad_norm": 4.674152374267578, + "kl": 0.050048828125, + "learning_rate": 7.524999999999999e-07, + "loss": 0.0005, + "reward": 3.938958764076233, + "reward_std": 0.01455747289583087, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9664872884750366, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.974875271320343, + "step": 199 + }, + { + "completion_length": 230.625, + "epoch": 0.64, + "grad_norm": 1.899129867553711, + "kl": 0.0535888671875, + "learning_rate": 7.512499999999999e-07, + "loss": 0.0005, + "reward": 3.9438642263412476, + "reward_std": 0.014077516738325357, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.952812910079956, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9938922822475433, + "step": 200 + }, + { + "completion_length": 212.4375, + "epoch": 0.6432, + "grad_norm": 1.8970869779586792, + "kl": 0.0460205078125, + "learning_rate": 7.5e-07, + "loss": 0.0005, + "reward": 3.9026511907577515, + "reward_std": 0.038714910857379436, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.911726325750351, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992582499980927, + "step": 201 + }, + { + "completion_length": 203.875, + "epoch": 0.6464, + "grad_norm": 2.5214030742645264, + "kl": 0.083251953125, + "learning_rate": 7.4875e-07, + "loss": 0.0008, + "reward": 3.9040462970733643, + "reward_std": 0.016587836667895317, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9761527180671692, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9278934299945831, + "step": 202 + }, + { + "completion_length": 216.375, + "epoch": 0.6496, + "grad_norm": 4.072224140167236, + "kl": 0.053955078125, + "learning_rate": 7.475e-07, + "loss": 0.0005, + "reward": 3.9431036710739136, + "reward_std": 0.020094456151127815, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.949131965637207, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.996812641620636, + "step": 203 + }, + { + "completion_length": 221.0, + "epoch": 0.6528, + "grad_norm": 3.3709828853607178, + "kl": 0.070556640625, + "learning_rate": 7.4625e-07, + "loss": 0.0007, + "reward": 3.8844679594039917, + "reward_std": 0.05386691028252244, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.934579610824585, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9498883783817291, + "step": 204 + }, + { + "completion_length": 195.9375, + "epoch": 0.656, + "grad_norm": 2.4978103637695312, + "kl": 0.0775146484375, + "learning_rate": 7.45e-07, + "loss": 0.0008, + "reward": 3.9303336143493652, + "reward_std": 0.04689153959043324, + "rewards/answer_entity_reward": 0.9804924428462982, + "rewards/answer_wer_reward": 0.9526000618934631, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9972410500049591, + "step": 205 + }, + { + "completion_length": 256.875, + "epoch": 0.6592, + "grad_norm": 2.3422584533691406, + "kl": 0.1229248046875, + "learning_rate": 7.4375e-07, + "loss": 0.0012, + "reward": 3.9243087768554688, + "reward_std": 0.019790570251643658, + "rewards/answer_entity_reward": 0.9764957129955292, + "rewards/answer_wer_reward": 0.9478131830692291, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 206 + }, + { + "completion_length": 204.15625, + "epoch": 0.6624, + "grad_norm": 2.19623064994812, + "kl": 0.0550537109375, + "learning_rate": 7.425e-07, + "loss": 0.0006, + "reward": 3.936911940574646, + "reward_std": 0.02031032182276249, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9463189840316772, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9905929565429688, + "step": 207 + }, + { + "completion_length": 225.21875, + "epoch": 0.6656, + "grad_norm": 5.279341220855713, + "kl": 0.0498046875, + "learning_rate": 7.412499999999999e-07, + "loss": 0.0005, + "reward": 3.915460228919983, + "reward_std": 0.015285669825971127, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9175935089588165, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9978667497634888, + "step": 208 + }, + { + "completion_length": 188.78125, + "epoch": 0.6688, + "grad_norm": 3.7716915607452393, + "kl": 0.0576171875, + "learning_rate": 7.4e-07, + "loss": 0.0006, + "reward": 3.8296241760253906, + "reward_std": 0.017440371215343475, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9421272277832031, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8899007737636566, + "step": 209 + }, + { + "completion_length": 203.28125, + "epoch": 0.672, + "grad_norm": 1.2790639400482178, + "kl": 0.0582275390625, + "learning_rate": 7.3875e-07, + "loss": 0.0006, + "reward": 3.952346086502075, + "reward_std": 0.007349871098995209, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.969746857881546, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9825991690158844, + "step": 210 + }, + { + "completion_length": 196.1875, + "epoch": 0.6752, + "grad_norm": 14.005128860473633, + "kl": 0.0604248046875, + "learning_rate": 7.375e-07, + "loss": 0.0006, + "reward": 3.8537105321884155, + "reward_std": 0.012695960700511932, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9704558551311493, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8832548260688782, + "step": 211 + }, + { + "completion_length": 159.3125, + "epoch": 0.6784, + "grad_norm": 4.394070625305176, + "kl": 0.068115234375, + "learning_rate": 7.362499999999999e-07, + "loss": 0.0007, + "reward": 3.9123398065567017, + "reward_std": 0.02882718201726675, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9466139674186707, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.965725839138031, + "step": 212 + }, + { + "completion_length": 238.75, + "epoch": 0.6816, + "grad_norm": 5.395397663116455, + "kl": 0.041748046875, + "learning_rate": 7.35e-07, + "loss": 0.0004, + "reward": 3.89706289768219, + "reward_std": 0.0131816565990448, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9062366485595703, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993489682674408, + "step": 213 + }, + { + "completion_length": 255.65625, + "epoch": 0.6848, + "grad_norm": 1.9760891199111938, + "kl": 0.03961181640625, + "learning_rate": 7.3375e-07, + "loss": 0.0004, + "reward": 3.917116641998291, + "reward_std": 0.04898790689185262, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9182944297790527, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988220930099487, + "step": 214 + }, + { + "completion_length": 165.75, + "epoch": 0.688, + "grad_norm": 2.763314723968506, + "kl": 0.0577392578125, + "learning_rate": 7.325e-07, + "loss": 0.0006, + "reward": 3.952502489089966, + "reward_std": 0.016542275436222553, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9569029808044434, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990717768669128, + "step": 215 + }, + { + "completion_length": 215.625, + "epoch": 0.6912, + "grad_norm": 7.516313552856445, + "kl": 0.0439453125, + "learning_rate": 7.312499999999999e-07, + "loss": 0.0004, + "reward": 3.9650633335113525, + "reward_std": 0.015061032958328724, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9679040908813477, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 216 + }, + { + "completion_length": 227.84375, + "epoch": 0.6944, + "grad_norm": 1.8075324296951294, + "kl": 0.0511474609375, + "learning_rate": 7.3e-07, + "loss": 0.0005, + "reward": 3.9209293127059937, + "reward_std": 0.01800437457859516, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9266109764575958, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 217 + }, + { + "completion_length": 213.625, + "epoch": 0.6976, + "grad_norm": 5.917069911956787, + "kl": 0.0426025390625, + "learning_rate": 7.2875e-07, + "loss": 0.0004, + "reward": 3.9082109928131104, + "reward_std": 0.07417950965464115, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9089923202991486, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999218761920929, + "step": 218 + }, + { + "completion_length": 228.6875, + "epoch": 0.7008, + "grad_norm": 1.1044409275054932, + "kl": 0.0531005859375, + "learning_rate": 7.275e-07, + "loss": 0.0005, + "reward": 3.908870220184326, + "reward_std": 0.016815255396068096, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9117993116378784, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994747638702393, + "step": 219 + }, + { + "completion_length": 199.125, + "epoch": 0.704, + "grad_norm": 3.019407272338867, + "kl": 0.058837890625, + "learning_rate": 7.262499999999999e-07, + "loss": 0.0006, + "reward": 3.925763249397278, + "reward_std": 0.01313594076782465, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9272693395614624, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984939694404602, + "step": 220 + }, + { + "completion_length": 210.65625, + "epoch": 0.7072, + "grad_norm": 2.7719058990478516, + "kl": 0.0377197265625, + "learning_rate": 7.249999999999999e-07, + "loss": 0.0004, + "reward": 3.8708763122558594, + "reward_std": 0.028095172019675374, + "rewards/answer_entity_reward": 0.9812500178813934, + "rewards/answer_wer_reward": 0.9290285110473633, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.960597813129425, + "step": 221 + }, + { + "completion_length": 199.6875, + "epoch": 0.7104, + "grad_norm": 2.267350435256958, + "kl": 0.0660400390625, + "learning_rate": 7.2375e-07, + "loss": 0.0006, + "reward": 3.9580957889556885, + "reward_std": 0.03087126836180687, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9705802798271179, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9899193644523621, + "step": 222 + }, + { + "completion_length": 181.8125, + "epoch": 0.7136, + "grad_norm": 8.685694694519043, + "kl": 0.081787109375, + "learning_rate": 7.225e-07, + "loss": 0.0008, + "reward": 3.8902955055236816, + "reward_std": 0.011068197898566723, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9720200002193451, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9182755053043365, + "step": 223 + }, + { + "completion_length": 185.4375, + "epoch": 0.7168, + "grad_norm": 2.514770746231079, + "kl": 0.0609130859375, + "learning_rate": 7.212499999999999e-07, + "loss": 0.0006, + "reward": 3.9320486783981323, + "reward_std": 0.033941914327442646, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9598598778247833, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.972188800573349, + "step": 224 + }, + { + "completion_length": 250.84375, + "epoch": 0.72, + "grad_norm": 1.7914812564849854, + "kl": 0.03045654296875, + "learning_rate": 7.2e-07, + "loss": 0.0003, + "reward": 3.8908780813217163, + "reward_std": 0.03203156217932701, + "rewards/answer_entity_reward": 0.9678819179534912, + "rewards/answer_wer_reward": 0.9238358736038208, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991601407527924, + "step": 225 + }, + { + "completion_length": 249.125, + "epoch": 0.7232, + "grad_norm": 4.627202987670898, + "kl": 0.0531005859375, + "learning_rate": 7.1875e-07, + "loss": 0.0005, + "reward": 3.899629235267639, + "reward_std": 0.06726673897355795, + "rewards/answer_entity_reward": 0.9953208565711975, + "rewards/answer_wer_reward": 0.9247469902038574, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9795613586902618, + "step": 226 + }, + { + "completion_length": 214.40625, + "epoch": 0.7264, + "grad_norm": 1.942586064338684, + "kl": 0.0352783203125, + "learning_rate": 7.175e-07, + "loss": 0.0003, + "reward": 3.959649443626404, + "reward_std": 0.01394367078319192, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9649502038955688, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9975401163101196, + "step": 227 + }, + { + "completion_length": 182.59375, + "epoch": 0.7296, + "grad_norm": 3.191298246383667, + "kl": 0.055419921875, + "learning_rate": 7.1625e-07, + "loss": 0.0005, + "reward": 3.9260960817337036, + "reward_std": 0.021659906953573227, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9576999247074127, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9712370932102203, + "step": 228 + }, + { + "completion_length": 212.53125, + "epoch": 0.7328, + "grad_norm": 1.0323834419250488, + "kl": 0.0533447265625, + "learning_rate": 7.149999999999999e-07, + "loss": 0.0005, + "reward": 3.939168095588684, + "reward_std": 0.009458722081035376, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9402457773685455, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989224076271057, + "step": 229 + }, + { + "completion_length": 187.8125, + "epoch": 0.736, + "grad_norm": 4.53863000869751, + "kl": 0.050537109375, + "learning_rate": 7.137499999999999e-07, + "loss": 0.0005, + "reward": 3.893386960029602, + "reward_std": 0.03008814249187708, + "rewards/answer_entity_reward": 0.9941239356994629, + "rewards/answer_wer_reward": 0.9532185792922974, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.946044385433197, + "step": 230 + }, + { + "completion_length": 235.5, + "epoch": 0.7392, + "grad_norm": 2.1737990379333496, + "kl": 0.0477294921875, + "learning_rate": 7.125e-07, + "loss": 0.0005, + "reward": 3.8995944261550903, + "reward_std": 0.021292359568178654, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9127146005630493, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9868797659873962, + "step": 231 + }, + { + "completion_length": 230.625, + "epoch": 0.7424, + "grad_norm": 0.8920266628265381, + "kl": 0.02874755859375, + "learning_rate": 7.1125e-07, + "loss": 0.0003, + "reward": 3.9383678436279297, + "reward_std": 0.008275180356577039, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9394271969795227, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989406764507294, + "step": 232 + }, + { + "completion_length": 196.125, + "epoch": 0.7456, + "grad_norm": 2.1836190223693848, + "kl": 0.06640625, + "learning_rate": 7.1e-07, + "loss": 0.0007, + "reward": 3.9469913244247437, + "reward_std": 0.01094681373797357, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9498908519744873, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9971005320549011, + "step": 233 + }, + { + "completion_length": 200.6875, + "epoch": 0.7488, + "grad_norm": 1.5529507398605347, + "kl": 0.041748046875, + "learning_rate": 7.0875e-07, + "loss": 0.0004, + "reward": 3.8839221000671387, + "reward_std": 0.02069476176984608, + "rewards/answer_entity_reward": 0.9841346144676208, + "rewards/answer_wer_reward": 0.9540095031261444, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.945777952671051, + "step": 234 + }, + { + "completion_length": 222.90625, + "epoch": 0.752, + "grad_norm": 17.55677604675293, + "kl": 0.061767578125, + "learning_rate": 7.075e-07, + "loss": 0.0006, + "reward": 3.92560076713562, + "reward_std": 0.03323593852110207, + "rewards/answer_entity_reward": 0.9963235259056091, + "rewards/answer_wer_reward": 0.9402145445346832, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.989062488079071, + "step": 235 + }, + { + "completion_length": 195.0625, + "epoch": 0.7552, + "grad_norm": 1.7806612253189087, + "kl": 0.056640625, + "learning_rate": 7.0625e-07, + "loss": 0.0006, + "reward": 3.9366722106933594, + "reward_std": 0.02212852332741022, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9515082538127899, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9886362552642822, + "step": 236 + }, + { + "completion_length": 224.34375, + "epoch": 0.7584, + "grad_norm": 3.0402088165283203, + "kl": 0.0352783203125, + "learning_rate": 7.049999999999999e-07, + "loss": 0.0004, + "reward": 3.947329044342041, + "reward_std": 0.011976622510701418, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.961329847574234, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.988840252161026, + "step": 237 + }, + { + "completion_length": 223.53125, + "epoch": 0.7616, + "grad_norm": 2.889293670654297, + "kl": 0.0616455078125, + "learning_rate": 7.037499999999999e-07, + "loss": 0.0006, + "reward": 3.9246891736984253, + "reward_std": 0.05990536604076624, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9530621469020844, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9750992059707642, + "step": 238 + }, + { + "completion_length": 184.78125, + "epoch": 0.7648, + "grad_norm": 1.2427425384521484, + "kl": 0.0623779296875, + "learning_rate": 7.024999999999999e-07, + "loss": 0.0006, + "reward": 3.957573890686035, + "reward_std": 0.005278389900922775, + "rewards/answer_entity_reward": 0.9926470518112183, + "rewards/answer_wer_reward": 0.9649269282817841, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 239 + }, + { + "completion_length": 236.28125, + "epoch": 0.768, + "grad_norm": 2.361463785171509, + "kl": 0.0545654296875, + "learning_rate": 7.0125e-07, + "loss": 0.0005, + "reward": 3.9197674989700317, + "reward_std": 0.02553732506930828, + "rewards/answer_entity_reward": 0.9834134578704834, + "rewards/answer_wer_reward": 0.9363541007041931, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 240 + }, + { + "completion_length": 174.0, + "epoch": 0.7712, + "grad_norm": 2.3930962085723877, + "kl": 0.05926513671875, + "learning_rate": 7e-07, + "loss": 0.0006, + "reward": 3.9211114645004272, + "reward_std": 0.008784215082414448, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9724419414997101, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9486694633960724, + "step": 241 + }, + { + "completion_length": 254.59375, + "epoch": 0.7744, + "grad_norm": 1.6553773880004883, + "kl": 0.0389404296875, + "learning_rate": 6.9875e-07, + "loss": 0.0004, + "reward": 3.929746985435486, + "reward_std": 0.012057055719196796, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9313917756080627, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983552694320679, + "step": 242 + }, + { + "completion_length": 235.375, + "epoch": 0.7776, + "grad_norm": 0.8029008507728577, + "kl": 0.04083251953125, + "learning_rate": 6.975e-07, + "loss": 0.0004, + "reward": 3.9153066873550415, + "reward_std": 0.005760843865573406, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9309280216693878, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9927119016647339, + "step": 243 + }, + { + "completion_length": 186.78125, + "epoch": 0.7808, + "grad_norm": 3.1181294918060303, + "kl": 0.0732421875, + "learning_rate": 6.9625e-07, + "loss": 0.0007, + "reward": 3.9115726947784424, + "reward_std": 0.007224578293971717, + "rewards/answer_entity_reward": 0.9707792401313782, + "rewards/answer_wer_reward": 0.940793514251709, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 244 + }, + { + "completion_length": 223.6875, + "epoch": 0.784, + "grad_norm": 1.3839703798294067, + "kl": 0.0380859375, + "learning_rate": 6.949999999999999e-07, + "loss": 0.0004, + "reward": 3.9361883401870728, + "reward_std": 0.012964933644980192, + "rewards/answer_entity_reward": 0.9818618893623352, + "rewards/answer_wer_reward": 0.9550732672214508, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999253123998642, + "step": 245 + }, + { + "completion_length": 222.53125, + "epoch": 0.7872, + "grad_norm": 3.1735548973083496, + "kl": 0.072509765625, + "learning_rate": 6.937499999999999e-07, + "loss": 0.0007, + "reward": 3.9446396827697754, + "reward_std": 0.023095417767763138, + "rewards/answer_entity_reward": 0.9895833134651184, + "rewards/answer_wer_reward": 0.9603613913059235, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9946948885917664, + "step": 246 + }, + { + "completion_length": 217.4375, + "epoch": 0.7904, + "grad_norm": 1.185796856880188, + "kl": 0.042236328125, + "learning_rate": 6.924999999999999e-07, + "loss": 0.0004, + "reward": 3.9417611360549927, + "reward_std": 0.013147154357284307, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9470057189464569, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9947555065155029, + "step": 247 + }, + { + "completion_length": 240.59375, + "epoch": 0.7936, + "grad_norm": 2.088177442550659, + "kl": 0.0504150390625, + "learning_rate": 6.9125e-07, + "loss": 0.0005, + "reward": 3.9391993284225464, + "reward_std": 0.015122740995138884, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9413229823112488, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9978764653205872, + "step": 248 + }, + { + "completion_length": 251.8125, + "epoch": 0.7968, + "grad_norm": 1.0327165126800537, + "kl": 0.0439453125, + "learning_rate": 6.9e-07, + "loss": 0.0004, + "reward": 3.928339123725891, + "reward_std": 0.014733773190528154, + "rewards/answer_entity_reward": 0.9895104765892029, + "rewards/answer_wer_reward": 0.9401907324790955, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986380338668823, + "step": 249 + }, + { + "completion_length": 202.125, + "epoch": 0.8, + "grad_norm": 1.0536175966262817, + "kl": 0.0443115234375, + "learning_rate": 6.8875e-07, + "loss": 0.0004, + "reward": 3.9324183464050293, + "reward_std": 0.018241871614009142, + "rewards/answer_entity_reward": 0.9873737692832947, + "rewards/answer_wer_reward": 0.9567070603370667, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9883374869823456, + "step": 250 + }, + { + "completion_length": 231.59375, + "epoch": 0.8032, + "grad_norm": 1.8605543375015259, + "kl": 0.0467529296875, + "learning_rate": 6.875e-07, + "loss": 0.0005, + "reward": 3.9515386819839478, + "reward_std": 0.014535096473991871, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9524115920066833, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991269707679749, + "step": 251 + }, + { + "completion_length": 202.8125, + "epoch": 0.8064, + "grad_norm": 1.7101868391036987, + "kl": 0.0673828125, + "learning_rate": 6.8625e-07, + "loss": 0.0007, + "reward": 3.947361946105957, + "reward_std": 0.01079330500215292, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9485193192958832, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988425970077515, + "step": 252 + }, + { + "completion_length": 194.4375, + "epoch": 0.8096, + "grad_norm": 1.6060519218444824, + "kl": 0.0518798828125, + "learning_rate": 6.85e-07, + "loss": 0.0005, + "reward": 3.8238483667373657, + "reward_std": 0.09831315139308572, + "rewards/answer_entity_reward": 0.9366161823272705, + "rewards/answer_wer_reward": 0.888142466545105, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990898072719574, + "step": 253 + }, + { + "completion_length": 231.71875, + "epoch": 0.8128, + "grad_norm": 1.4323464632034302, + "kl": 0.04559326171875, + "learning_rate": 6.837499999999999e-07, + "loss": 0.0005, + "reward": 3.9585113525390625, + "reward_std": 0.009139138273894787, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9591011703014374, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994101524353027, + "step": 254 + }, + { + "completion_length": 242.15625, + "epoch": 0.816, + "grad_norm": 1.638405442237854, + "kl": 0.0592041015625, + "learning_rate": 6.824999999999999e-07, + "loss": 0.0006, + "reward": 3.938191056251526, + "reward_std": 0.015181098598986864, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9465242922306061, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 255 + }, + { + "completion_length": 178.96875, + "epoch": 0.8192, + "grad_norm": 2.906489133834839, + "kl": 0.07958984375, + "learning_rate": 6.8125e-07, + "loss": 0.0008, + "reward": 3.9418115615844727, + "reward_std": 0.024727396899834275, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9549268186092377, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9925665557384491, + "step": 256 + }, + { + "completion_length": 191.59375, + "epoch": 0.8224, + "grad_norm": 4.772871494293213, + "kl": 0.271484375, + "learning_rate": 6.800000000000001e-07, + "loss": 0.0027, + "reward": 3.9085776805877686, + "reward_std": 0.01904244115576148, + "rewards/answer_entity_reward": 0.9866071343421936, + "rewards/answer_wer_reward": 0.9542762637138367, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9676942825317383, + "step": 257 + }, + { + "completion_length": 192.78125, + "epoch": 0.8256, + "grad_norm": 2.3399181365966797, + "kl": 0.081787109375, + "learning_rate": 6.7875e-07, + "loss": 0.0008, + "reward": 3.930221199989319, + "reward_std": 0.014671812066808343, + "rewards/answer_entity_reward": 0.9867201447486877, + "rewards/answer_wer_reward": 0.9438917338848114, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996093809604645, + "step": 258 + }, + { + "completion_length": 187.5, + "epoch": 0.8288, + "grad_norm": 9.805069923400879, + "kl": 0.072265625, + "learning_rate": 6.775e-07, + "loss": 0.0007, + "reward": 3.939017653465271, + "reward_std": 0.016680479515343904, + "rewards/answer_entity_reward": 0.9944852888584137, + "rewards/answer_wer_reward": 0.9445324242115021, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 259 + }, + { + "completion_length": 234.5625, + "epoch": 0.832, + "grad_norm": 1.5217561721801758, + "kl": 0.0516357421875, + "learning_rate": 6.7625e-07, + "loss": 0.0005, + "reward": 3.922031283378601, + "reward_std": 0.01609009224921465, + "rewards/answer_entity_reward": 0.9681277275085449, + "rewards/answer_wer_reward": 0.9539035856723785, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 260 + }, + { + "completion_length": 159.0, + "epoch": 0.8352, + "grad_norm": 2.5927042961120605, + "kl": 0.0557861328125, + "learning_rate": 6.75e-07, + "loss": 0.0006, + "reward": 3.9503369331359863, + "reward_std": 0.004757039016112685, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9792385697364807, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9710983335971832, + "step": 261 + }, + { + "completion_length": 222.15625, + "epoch": 0.8384, + "grad_norm": 1.9485008716583252, + "kl": 0.0928955078125, + "learning_rate": 6.737499999999999e-07, + "loss": 0.0009, + "reward": 3.9718098640441895, + "reward_std": 0.01134553411975503, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9718098938465118, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 262 + }, + { + "completion_length": 248.875, + "epoch": 0.8416, + "grad_norm": 5.045698165893555, + "kl": 0.0552978515625, + "learning_rate": 6.724999999999999e-07, + "loss": 0.0006, + "reward": 3.799831986427307, + "reward_std": 0.03707320708781481, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9218086004257202, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.883705198764801, + "step": 263 + }, + { + "completion_length": 157.6875, + "epoch": 0.8448, + "grad_norm": 1.9603397846221924, + "kl": 0.14111328125, + "learning_rate": 6.7125e-07, + "loss": 0.0014, + "reward": 3.9334217309951782, + "reward_std": 0.00959050771780312, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9538573622703552, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.987897664308548, + "step": 264 + }, + { + "completion_length": 249.21875, + "epoch": 0.848, + "grad_norm": 1.720057725906372, + "kl": 0.102783203125, + "learning_rate": 6.7e-07, + "loss": 0.001, + "reward": 3.9404491186141968, + "reward_std": 0.023797483183443546, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.9459458291530609, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997479915618896, + "step": 265 + }, + { + "completion_length": 200.65625, + "epoch": 0.8512, + "grad_norm": 1.7017474174499512, + "kl": 0.06640625, + "learning_rate": 6.6875e-07, + "loss": 0.0007, + "reward": 3.897473454475403, + "reward_std": 0.017802401445806026, + "rewards/answer_entity_reward": 0.9892628192901611, + "rewards/answer_wer_reward": 0.9560422301292419, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.952168345451355, + "step": 266 + }, + { + "completion_length": 206.9375, + "epoch": 0.8544, + "grad_norm": 1.7645119428634644, + "kl": 0.107177734375, + "learning_rate": 6.675e-07, + "loss": 0.0011, + "reward": 3.919585347175598, + "reward_std": 0.017358362209051847, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9206817746162415, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989035129547119, + "step": 267 + }, + { + "completion_length": 234.125, + "epoch": 0.8576, + "grad_norm": 2.324972629547119, + "kl": 0.07275390625, + "learning_rate": 6.6625e-07, + "loss": 0.0007, + "reward": 3.8366565704345703, + "reward_std": 0.03994511067867279, + "rewards/answer_entity_reward": 0.9375, + "rewards/answer_wer_reward": 0.9288243353366852, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9703322649002075, + "step": 268 + }, + { + "completion_length": 163.28125, + "epoch": 0.8608, + "grad_norm": 3.44211483001709, + "kl": 0.07080078125, + "learning_rate": 6.65e-07, + "loss": 0.0007, + "reward": 3.8973175287246704, + "reward_std": 0.051633019000291824, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9550660252571106, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9457237124443054, + "step": 269 + }, + { + "completion_length": 198.0625, + "epoch": 0.864, + "grad_norm": 5.092156887054443, + "kl": 0.072998046875, + "learning_rate": 6.637499999999999e-07, + "loss": 0.0007, + "reward": 3.940290689468384, + "reward_std": 0.009564612759277225, + "rewards/answer_entity_reward": 0.9821428656578064, + "rewards/answer_wer_reward": 0.958147794008255, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 270 + }, + { + "completion_length": 138.875, + "epoch": 0.8672, + "grad_norm": 3.998215913772583, + "kl": 0.05889892578125, + "learning_rate": 6.624999999999999e-07, + "loss": 0.0006, + "reward": 3.9329700469970703, + "reward_std": 0.05405183229595423, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9581792652606964, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9782631099224091, + "step": 271 + }, + { + "completion_length": 208.53125, + "epoch": 0.8704, + "grad_norm": 2.191901206970215, + "kl": 0.06884765625, + "learning_rate": 6.6125e-07, + "loss": 0.0007, + "reward": 3.956714630126953, + "reward_std": 0.01909107668325305, + "rewards/answer_entity_reward": 0.993686854839325, + "rewards/answer_wer_reward": 0.9632268249988556, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9998009502887726, + "step": 272 + }, + { + "completion_length": 196.71875, + "epoch": 0.8736, + "grad_norm": 3.2068357467651367, + "kl": 0.0513916015625, + "learning_rate": 6.6e-07, + "loss": 0.0005, + "reward": 3.9089767932891846, + "reward_std": 0.035889009945094585, + "rewards/answer_entity_reward": 0.9902777671813965, + "rewards/answer_wer_reward": 0.934887707233429, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9838112890720367, + "step": 273 + }, + { + "completion_length": 238.03125, + "epoch": 0.8768, + "grad_norm": 12.858990669250488, + "kl": 0.0513916015625, + "learning_rate": 6.587499999999999e-07, + "loss": 0.0005, + "reward": 3.9507744312286377, + "reward_std": 0.012679634615778923, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9518805146217346, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988937973976135, + "step": 274 + }, + { + "completion_length": 215.03125, + "epoch": 0.88, + "grad_norm": 6.914164066314697, + "kl": 0.053466796875, + "learning_rate": 6.575e-07, + "loss": 0.0005, + "reward": 3.920554757118225, + "reward_std": 0.01066223531961441, + "rewards/answer_entity_reward": 0.9821428656578064, + "rewards/answer_wer_reward": 0.9384119212627411, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 275 + }, + { + "completion_length": 170.3125, + "epoch": 0.8832, + "grad_norm": 1.4424182176589966, + "kl": 0.0533447265625, + "learning_rate": 6.5625e-07, + "loss": 0.0005, + "reward": 3.8676129579544067, + "reward_std": 0.015859364066272974, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9279236793518066, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9396892189979553, + "step": 276 + }, + { + "completion_length": 203.0, + "epoch": 0.8864, + "grad_norm": 1.4304486513137817, + "kl": 0.040771484375, + "learning_rate": 6.55e-07, + "loss": 0.0004, + "reward": 3.9131808280944824, + "reward_std": 0.020121398381888866, + "rewards/answer_entity_reward": 0.9930555820465088, + "rewards/answer_wer_reward": 0.9201253056526184, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 277 + }, + { + "completion_length": 199.9375, + "epoch": 0.8896, + "grad_norm": 4.607363700866699, + "kl": 0.0810546875, + "learning_rate": 6.5375e-07, + "loss": 0.0008, + "reward": 3.9438611268997192, + "reward_std": 0.014630983117967844, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9560317695140839, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.989912748336792, + "step": 278 + }, + { + "completion_length": 215.75, + "epoch": 0.8928, + "grad_norm": 0.9500401020050049, + "kl": 0.0498046875, + "learning_rate": 6.524999999999999e-07, + "loss": 0.0005, + "reward": 3.9393136501312256, + "reward_std": 0.010870016179978848, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9396113157272339, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997023940086365, + "step": 279 + }, + { + "completion_length": 211.4375, + "epoch": 0.896, + "grad_norm": 2.4634454250335693, + "kl": 0.08154296875, + "learning_rate": 6.5125e-07, + "loss": 0.0008, + "reward": 3.8559117317199707, + "reward_std": 0.020915272179991007, + "rewards/answer_entity_reward": 0.9944444298744202, + "rewards/answer_wer_reward": 0.9251176416873932, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9363496899604797, + "step": 280 + }, + { + "completion_length": 172.8125, + "epoch": 0.8992, + "grad_norm": 5.569718360900879, + "kl": 0.1357421875, + "learning_rate": 6.5e-07, + "loss": 0.0014, + "reward": 3.87375545501709, + "reward_std": 0.04026831593364477, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9294662475585938, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9442892372608185, + "step": 281 + }, + { + "completion_length": 114.875, + "epoch": 0.9024, + "grad_norm": 4.26852560043335, + "kl": 0.053955078125, + "learning_rate": 6.4875e-07, + "loss": 0.0005, + "reward": 3.909887909889221, + "reward_std": 0.015241059940308332, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9791332483291626, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9335956573486328, + "step": 282 + }, + { + "completion_length": 245.0, + "epoch": 0.9056, + "grad_norm": 1.3898316621780396, + "kl": 0.0450439453125, + "learning_rate": 6.474999999999999e-07, + "loss": 0.0005, + "reward": 3.9195964336395264, + "reward_std": 0.018749097362160683, + "rewards/answer_entity_reward": 0.9911437332630157, + "rewards/answer_wer_reward": 0.9284527003765106, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 283 + }, + { + "completion_length": 218.75, + "epoch": 0.9088, + "grad_norm": 4.705906391143799, + "kl": 0.0338134765625, + "learning_rate": 6.4625e-07, + "loss": 0.0003, + "reward": 3.9526829719543457, + "reward_std": 0.012810520827770233, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9526830613613129, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 284 + }, + { + "completion_length": 175.15625, + "epoch": 0.912, + "grad_norm": 1.7440683841705322, + "kl": 0.0616455078125, + "learning_rate": 6.45e-07, + "loss": 0.0006, + "reward": 3.9307706356048584, + "reward_std": 0.014890296617522836, + "rewards/answer_entity_reward": 0.9845238327980042, + "rewards/answer_wer_reward": 0.9668512642383575, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9793955981731415, + "step": 285 + }, + { + "completion_length": 154.3125, + "epoch": 0.9152, + "grad_norm": 2.3717188835144043, + "kl": 0.0599365234375, + "learning_rate": 6.4375e-07, + "loss": 0.0006, + "reward": 3.9156084060668945, + "reward_std": 0.013419507071375847, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.951806515455246, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9638019800186157, + "step": 286 + }, + { + "completion_length": 226.40625, + "epoch": 0.9184, + "grad_norm": 2.069488525390625, + "kl": 0.058349609375, + "learning_rate": 6.424999999999999e-07, + "loss": 0.0006, + "reward": 3.8257880210876465, + "reward_std": 0.023342549800872803, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9156993925571442, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9186112284660339, + "step": 287 + }, + { + "completion_length": 203.21875, + "epoch": 0.9216, + "grad_norm": 1.8522766828536987, + "kl": 0.0611572265625, + "learning_rate": 6.4125e-07, + "loss": 0.0006, + "reward": 3.9413124322891235, + "reward_std": 0.014133658958598971, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9447846114635468, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 288 + }, + { + "completion_length": 182.90625, + "epoch": 0.9248, + "grad_norm": 3.1601576805114746, + "kl": 0.0626220703125, + "learning_rate": 6.4e-07, + "loss": 0.0006, + "reward": 3.934013605117798, + "reward_std": 0.020497526740655303, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9598910510540009, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.988011360168457, + "step": 289 + }, + { + "completion_length": 235.71875, + "epoch": 0.928, + "grad_norm": 1.5299009084701538, + "kl": 0.062744140625, + "learning_rate": 6.3875e-07, + "loss": 0.0006, + "reward": 3.900187373161316, + "reward_std": 0.027182841673493385, + "rewards/answer_entity_reward": 0.9859217405319214, + "rewards/answer_wer_reward": 0.9156533181667328, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986123144626617, + "step": 290 + }, + { + "completion_length": 181.59375, + "epoch": 0.9312, + "grad_norm": 2.8708431720733643, + "kl": 0.09375, + "learning_rate": 6.374999999999999e-07, + "loss": 0.0009, + "reward": 3.878863215446472, + "reward_std": 0.016461022198200226, + "rewards/answer_entity_reward": 0.9607954621315002, + "rewards/answer_wer_reward": 0.9469051957130432, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9711625277996063, + "step": 291 + }, + { + "completion_length": 252.71875, + "epoch": 0.9344, + "grad_norm": 1.3821316957473755, + "kl": 0.143798828125, + "learning_rate": 6.362499999999999e-07, + "loss": 0.0014, + "reward": 3.9444687366485596, + "reward_std": 0.015690275467932224, + "rewards/answer_entity_reward": 0.9958333373069763, + "rewards/answer_wer_reward": 0.9486355781555176, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 292 + }, + { + "completion_length": 191.5, + "epoch": 0.9376, + "grad_norm": 3.0700418949127197, + "kl": 0.08984375, + "learning_rate": 6.35e-07, + "loss": 0.0009, + "reward": 3.9288469552993774, + "reward_std": 0.025998966302722692, + "rewards/answer_entity_reward": 0.9910714626312256, + "rewards/answer_wer_reward": 0.9580896496772766, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.97968590259552, + "step": 293 + }, + { + "completion_length": 236.40625, + "epoch": 0.9408, + "grad_norm": 0.9392086863517761, + "kl": 0.0728759765625, + "learning_rate": 6.3375e-07, + "loss": 0.0007, + "reward": 3.9576098918914795, + "reward_std": 0.004891619086265564, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9576099216938019, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 294 + }, + { + "completion_length": 204.125, + "epoch": 0.944, + "grad_norm": 1.4554882049560547, + "kl": 0.044677734375, + "learning_rate": 6.324999999999999e-07, + "loss": 0.0004, + "reward": 3.9175373315811157, + "reward_std": 0.008688606787472963, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9530804753303528, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9644568264484406, + "step": 295 + }, + { + "completion_length": 230.8125, + "epoch": 0.9472, + "grad_norm": 0.7801051139831543, + "kl": 0.0537109375, + "learning_rate": 6.3125e-07, + "loss": 0.0005, + "reward": 3.941986918449402, + "reward_std": 0.011714181862771511, + "rewards/answer_entity_reward": 0.9983552694320679, + "rewards/answer_wer_reward": 0.9448631405830383, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987684786319733, + "step": 296 + }, + { + "completion_length": 201.6875, + "epoch": 0.9504, + "grad_norm": 3.2697925567626953, + "kl": 0.0723876953125, + "learning_rate": 6.3e-07, + "loss": 0.0007, + "reward": 3.9148101806640625, + "reward_std": 0.02096148394048214, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9371316432952881, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9821428656578064, + "step": 297 + }, + { + "completion_length": 174.71875, + "epoch": 0.9536, + "grad_norm": 1.3895010948181152, + "kl": 0.072509765625, + "learning_rate": 6.2875e-07, + "loss": 0.0007, + "reward": 3.9413623809814453, + "reward_std": 0.012068473850376904, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.96162348985672, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9821428656578064, + "step": 298 + }, + { + "completion_length": 226.53125, + "epoch": 0.9568, + "grad_norm": 0.9915501475334167, + "kl": 0.0574951171875, + "learning_rate": 6.274999999999999e-07, + "loss": 0.0006, + "reward": 3.9342339038848877, + "reward_std": 0.017138528637588024, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9342339336872101, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 299 + }, + { + "completion_length": 185.96875, + "epoch": 0.96, + "grad_norm": 2.181473970413208, + "kl": 0.0693359375, + "learning_rate": 6.262499999999999e-07, + "loss": 0.0007, + "reward": 3.8075177669525146, + "reward_std": 0.008563205134123564, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.974321037530899, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8331968486309052, + "step": 300 + }, + { + "completion_length": 259.4375, + "epoch": 0.9632, + "grad_norm": 0.8825593590736389, + "kl": 0.053955078125, + "learning_rate": 6.249999999999999e-07, + "loss": 0.0005, + "reward": 3.9282361268997192, + "reward_std": 0.01493215560913086, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9290694296360016, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991666674613953, + "step": 301 + }, + { + "completion_length": 233.4375, + "epoch": 0.9664, + "grad_norm": 2.377093553543091, + "kl": 0.08251953125, + "learning_rate": 6.2375e-07, + "loss": 0.0008, + "reward": 3.8652896881103516, + "reward_std": 0.04854640178382397, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.931235283613205, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9392991065979004, + "step": 302 + }, + { + "completion_length": 214.9375, + "epoch": 0.9696, + "grad_norm": 2.7887818813323975, + "kl": 0.0765380859375, + "learning_rate": 6.225000000000001e-07, + "loss": 0.0008, + "reward": 3.916442394256592, + "reward_std": 0.014312040992081165, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9577742516994476, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9586680829524994, + "step": 303 + }, + { + "completion_length": 195.53125, + "epoch": 0.9728, + "grad_norm": 1.3930556774139404, + "kl": 0.0662841796875, + "learning_rate": 6.2125e-07, + "loss": 0.0007, + "reward": 3.8324824571609497, + "reward_std": 0.013787610223516822, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9709192514419556, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8672450482845306, + "step": 304 + }, + { + "completion_length": 221.65625, + "epoch": 0.976, + "grad_norm": 1.6060283184051514, + "kl": 0.046875, + "learning_rate": 6.2e-07, + "loss": 0.0005, + "reward": 3.9341059923171997, + "reward_std": 0.016552825924009085, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9438435733318329, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9902624487876892, + "step": 305 + }, + { + "completion_length": 274.4375, + "epoch": 0.9792, + "grad_norm": 2.2774875164031982, + "kl": 0.0582275390625, + "learning_rate": 6.1875e-07, + "loss": 0.0006, + "reward": 3.8809224367141724, + "reward_std": 0.03468186687678099, + "rewards/answer_entity_reward": 0.9755851626396179, + "rewards/answer_wer_reward": 0.9063642621040344, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989729523658752, + "step": 306 + }, + { + "completion_length": 243.875, + "epoch": 0.9824, + "grad_norm": 1.4776897430419922, + "kl": 0.0865478515625, + "learning_rate": 6.175e-07, + "loss": 0.0009, + "reward": 3.921198606491089, + "reward_std": 0.029711266048252583, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9262239336967468, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984469413757324, + "step": 307 + }, + { + "completion_length": 230.6875, + "epoch": 0.9856, + "grad_norm": 0.8870422840118408, + "kl": 0.0528564453125, + "learning_rate": 6.162499999999999e-07, + "loss": 0.0005, + "reward": 3.9468624591827393, + "reward_std": 0.010126703884452581, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9468623399734497, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 308 + }, + { + "completion_length": 193.53125, + "epoch": 0.9888, + "grad_norm": 1.2648320198059082, + "kl": 0.0474853515625, + "learning_rate": 6.149999999999999e-07, + "loss": 0.0005, + "reward": 3.9692437648773193, + "reward_std": 0.010907594813033938, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9716475903987885, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 309 + }, + { + "completion_length": 226.84375, + "epoch": 0.992, + "grad_norm": 2.5334410667419434, + "kl": 0.099609375, + "learning_rate": 6.1375e-07, + "loss": 0.001, + "reward": 3.932776689529419, + "reward_std": 0.025886752177029848, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9474222362041473, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9916044771671295, + "step": 310 + }, + { + "completion_length": 202.40625, + "epoch": 0.9952, + "grad_norm": 1.6191986799240112, + "kl": 0.059326171875, + "learning_rate": 6.125000000000001e-07, + "loss": 0.0006, + "reward": 3.923641085624695, + "reward_std": 0.016786989755928516, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9264820218086243, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 311 + }, + { + "completion_length": 226.125, + "epoch": 0.9984, + "grad_norm": 2.3516252040863037, + "kl": 0.0587158203125, + "learning_rate": 6.1125e-07, + "loss": 0.0006, + "reward": 3.822533130645752, + "reward_std": 0.19381592608988285, + "rewards/answer_entity_reward": 0.9630681872367859, + "rewards/answer_wer_reward": 0.8977905511856079, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9929245114326477, + "step": 312 + }, + { + "completion_length": 164.4375, + "epoch": 1.0, + "grad_norm": 9.48376178741455, + "kl": 0.04345703125, + "learning_rate": 6.1e-07, + "loss": 0.0002, + "reward": 3.9722466468811035, + "reward_std": 0.021218769252300262, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9880585074424744, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980769157409668, + "step": 313 + }, + { + "completion_length": 194.0625, + "epoch": 1.0032, + "grad_norm": 1.5969237089157104, + "kl": 0.0419921875, + "learning_rate": 6.0875e-07, + "loss": 0.0004, + "reward": 3.9741499423980713, + "reward_std": 0.00955872773192823, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9776757061481476, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985576868057251, + "step": 314 + }, + { + "completion_length": 174.25, + "epoch": 1.0064, + "grad_norm": 5.0026326179504395, + "kl": 0.07470703125, + "learning_rate": 6.075e-07, + "loss": 0.0007, + "reward": 3.9532389640808105, + "reward_std": 0.01782281370833516, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9582388997077942, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9950000047683716, + "step": 315 + }, + { + "completion_length": 218.3125, + "epoch": 1.0096, + "grad_norm": 1.521260142326355, + "kl": 0.072509765625, + "learning_rate": 6.062499999999999e-07, + "loss": 0.0007, + "reward": 3.891371011734009, + "reward_std": 0.037183830980211496, + "rewards/answer_entity_reward": 0.995192289352417, + "rewards/answer_wer_reward": 0.9465020596981049, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9496767222881317, + "step": 316 + }, + { + "completion_length": 181.21875, + "epoch": 1.0128, + "grad_norm": 2.444070339202881, + "kl": 0.1011962890625, + "learning_rate": 6.049999999999999e-07, + "loss": 0.001, + "reward": 3.957024097442627, + "reward_std": 0.015732225496321917, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9627059102058411, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 317 + }, + { + "completion_length": 214.8125, + "epoch": 1.016, + "grad_norm": 5.038032054901123, + "kl": 0.081298828125, + "learning_rate": 6.037499999999999e-07, + "loss": 0.0008, + "reward": 3.905093193054199, + "reward_std": 0.02073481073603034, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9350383579730988, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.97005495429039, + "step": 318 + }, + { + "completion_length": 209.8125, + "epoch": 1.0192, + "grad_norm": 3.9700140953063965, + "kl": 0.07373046875, + "learning_rate": 6.025000000000001e-07, + "loss": 0.0007, + "reward": 3.8465429544448853, + "reward_std": 0.044920976273715496, + "rewards/answer_entity_reward": 0.953125, + "rewards/answer_wer_reward": 0.935539960861206, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9578781127929688, + "step": 319 + }, + { + "completion_length": 242.8125, + "epoch": 1.0224, + "grad_norm": 1.1018257141113281, + "kl": 0.0404052734375, + "learning_rate": 6.0125e-07, + "loss": 0.0004, + "reward": 3.9351298809051514, + "reward_std": 0.00889231264591217, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9503234028816223, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986952543258667, + "step": 320 + }, + { + "completion_length": 178.65625, + "epoch": 1.0256, + "grad_norm": 1.2945948839187622, + "kl": 0.059326171875, + "learning_rate": 6e-07, + "loss": 0.0006, + "reward": 3.9444717168807983, + "reward_std": 0.010739851742982864, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9468754827976227, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 321 + }, + { + "completion_length": 158.75, + "epoch": 1.0288, + "grad_norm": 1.9997080564498901, + "kl": 0.10498046875, + "learning_rate": 5.9875e-07, + "loss": 0.001, + "reward": 3.8997615575790405, + "reward_std": 0.0878201499581337, + "rewards/answer_entity_reward": 0.9768981039524078, + "rewards/answer_wer_reward": 0.9317395091056824, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9911239445209503, + "step": 322 + }, + { + "completion_length": 202.78125, + "epoch": 1.032, + "grad_norm": 2.5343425273895264, + "kl": 0.047119140625, + "learning_rate": 5.975e-07, + "loss": 0.0005, + "reward": 3.9625836610794067, + "reward_std": 0.0073791013564914465, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9652430713176727, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9973404407501221, + "step": 323 + }, + { + "completion_length": 181.9375, + "epoch": 1.0352, + "grad_norm": 7.240401744842529, + "kl": 0.067138671875, + "learning_rate": 5.962499999999999e-07, + "loss": 0.0007, + "reward": 3.828685760498047, + "reward_std": 0.04627671558409929, + "rewards/answer_entity_reward": 0.995192289352417, + "rewards/answer_wer_reward": 0.951274037361145, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.882219523191452, + "step": 324 + }, + { + "completion_length": 209.75, + "epoch": 1.0384, + "grad_norm": 2.1784214973449707, + "kl": 0.0810546875, + "learning_rate": 5.949999999999999e-07, + "loss": 0.0008, + "reward": 3.9578659534454346, + "reward_std": 0.015447806101292372, + "rewards/answer_entity_reward": 0.9947552382946014, + "rewards/answer_wer_reward": 0.9634187519550323, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996921122074127, + "step": 325 + }, + { + "completion_length": 200.78125, + "epoch": 1.0416, + "grad_norm": 1.8993250131607056, + "kl": 0.086669921875, + "learning_rate": 5.937499999999999e-07, + "loss": 0.0009, + "reward": 3.9622350931167603, + "reward_std": 0.011172362137585878, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9622350335121155, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 326 + }, + { + "completion_length": 188.0625, + "epoch": 1.0448, + "grad_norm": 2.999244213104248, + "kl": 0.04931640625, + "learning_rate": 5.925e-07, + "loss": 0.0005, + "reward": 3.8658429384231567, + "reward_std": 0.027352653443813324, + "rewards/answer_entity_reward": 0.9859203398227692, + "rewards/answer_wer_reward": 0.9490468800067902, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9308757185935974, + "step": 327 + }, + { + "completion_length": 211.6875, + "epoch": 1.048, + "grad_norm": 1.4307529926300049, + "kl": 0.06982421875, + "learning_rate": 5.912500000000001e-07, + "loss": 0.0007, + "reward": 3.8813902139663696, + "reward_std": 0.015089725144207478, + "rewards/answer_entity_reward": 0.9800595343112946, + "rewards/answer_wer_reward": 0.9558005034923553, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9455301761627197, + "step": 328 + }, + { + "completion_length": 184.1875, + "epoch": 1.0512, + "grad_norm": 1.9804878234863281, + "kl": 0.03851318359375, + "learning_rate": 5.9e-07, + "loss": 0.0004, + "reward": 3.9403220415115356, + "reward_std": 0.025673750409623608, + "rewards/answer_entity_reward": 0.9941239356994629, + "rewards/answer_wer_reward": 0.94679394364357, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994041919708252, + "step": 329 + }, + { + "completion_length": 200.71875, + "epoch": 1.0544, + "grad_norm": 1.5184144973754883, + "kl": 0.06689453125, + "learning_rate": 5.8875e-07, + "loss": 0.0007, + "reward": 3.945325493812561, + "reward_std": 0.021944692358374596, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.951007217168808, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 330 + }, + { + "completion_length": 211.875, + "epoch": 1.0576, + "grad_norm": 1.228079915046692, + "kl": 0.052978515625, + "learning_rate": 5.875e-07, + "loss": 0.0005, + "reward": 3.9120590686798096, + "reward_std": 0.015080507844686508, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.912059098482132, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 331 + }, + { + "completion_length": 240.5625, + "epoch": 1.0608, + "grad_norm": 1.7073534727096558, + "kl": 0.1005859375, + "learning_rate": 5.8625e-07, + "loss": 0.001, + "reward": 3.943448066711426, + "reward_std": 0.010788221377879381, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9437373280525208, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997106492519379, + "step": 332 + }, + { + "completion_length": 217.78125, + "epoch": 1.064, + "grad_norm": 1.9268385171890259, + "kl": 0.0440673828125, + "learning_rate": 5.849999999999999e-07, + "loss": 0.0004, + "reward": 3.9603058099746704, + "reward_std": 0.009590512840077281, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9625644087791443, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977414906024933, + "step": 333 + }, + { + "completion_length": 188.125, + "epoch": 1.0672, + "grad_norm": 0.780636727809906, + "kl": 0.04638671875, + "learning_rate": 5.837499999999999e-07, + "loss": 0.0005, + "reward": 3.949649691581726, + "reward_std": 0.0076717507326975465, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9496497213840485, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 334 + }, + { + "completion_length": 240.71875, + "epoch": 1.0704, + "grad_norm": 21.118270874023438, + "kl": 0.04296875, + "learning_rate": 5.825e-07, + "loss": 0.0004, + "reward": 3.968227982521057, + "reward_std": 0.01375247398391366, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9715853631496429, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.996642529964447, + "step": 335 + }, + { + "completion_length": 251.21875, + "epoch": 1.0735999999999999, + "grad_norm": 1.0980618000030518, + "kl": 0.0467529296875, + "learning_rate": 5.8125e-07, + "loss": 0.0005, + "reward": 3.9321502447128296, + "reward_std": 0.02487938292324543, + "rewards/answer_entity_reward": 0.987500011920929, + "rewards/answer_wer_reward": 0.945962131023407, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986882209777832, + "step": 336 + }, + { + "completion_length": 191.0, + "epoch": 1.0768, + "grad_norm": 1.9901342391967773, + "kl": 0.1015625, + "learning_rate": 5.8e-07, + "loss": 0.001, + "reward": 3.860186219215393, + "reward_std": 0.008080802159383893, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9668596386909485, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8933265209197998, + "step": 337 + }, + { + "completion_length": 222.40625, + "epoch": 1.08, + "grad_norm": 1.9760770797729492, + "kl": 0.0791015625, + "learning_rate": 5.7875e-07, + "loss": 0.0008, + "reward": 3.943527340888977, + "reward_std": 0.013376505114138126, + "rewards/answer_entity_reward": 0.9927884340286255, + "rewards/answer_wer_reward": 0.950738936662674, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 338 + }, + { + "completion_length": 242.75, + "epoch": 1.0832, + "grad_norm": 1.4690314531326294, + "kl": 0.0699462890625, + "learning_rate": 5.775e-07, + "loss": 0.0007, + "reward": 3.946296215057373, + "reward_std": 0.010936432983726263, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.946296215057373, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 339 + }, + { + "completion_length": 213.75, + "epoch": 1.0864, + "grad_norm": 1.3006911277770996, + "kl": 0.068603515625, + "learning_rate": 5.7625e-07, + "loss": 0.0007, + "reward": 3.929935932159424, + "reward_std": 0.012226814404129982, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9303079545497894, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996279776096344, + "step": 340 + }, + { + "completion_length": 203.875, + "epoch": 1.0896, + "grad_norm": 20.699094772338867, + "kl": 0.0606689453125, + "learning_rate": 5.749999999999999e-07, + "loss": 0.0006, + "reward": 3.839663863182068, + "reward_std": 0.2153539047576487, + "rewards/answer_entity_reward": 0.9632352888584137, + "rewards/answer_wer_reward": 0.9303349256515503, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.977343738079071, + "step": 341 + }, + { + "completion_length": 229.9375, + "epoch": 1.0928, + "grad_norm": 10.713321685791016, + "kl": 0.062255859375, + "learning_rate": 5.737499999999999e-07, + "loss": 0.0006, + "reward": 3.952810525894165, + "reward_std": 0.013096342328935862, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9535458087921143, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992647171020508, + "step": 342 + }, + { + "completion_length": 226.0625, + "epoch": 1.096, + "grad_norm": 5.412719249725342, + "kl": 0.068115234375, + "learning_rate": 5.725e-07, + "loss": 0.0007, + "reward": 3.9290108680725098, + "reward_std": 0.014630899764597416, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9352608323097229, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9937500059604645, + "step": 343 + }, + { + "completion_length": 180.875, + "epoch": 1.0992, + "grad_norm": 1.5433329343795776, + "kl": 0.046875, + "learning_rate": 5.7125e-07, + "loss": 0.0005, + "reward": 3.9217172861099243, + "reward_std": 0.007004068233072758, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9350151419639587, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9867021441459656, + "step": 344 + }, + { + "completion_length": 228.5625, + "epoch": 1.1024, + "grad_norm": 1.6970151662826538, + "kl": 0.058837890625, + "learning_rate": 5.699999999999999e-07, + "loss": 0.0006, + "reward": 3.9185184240341187, + "reward_std": 0.013168168719857931, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9197319746017456, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987863898277283, + "step": 345 + }, + { + "completion_length": 155.34375, + "epoch": 1.1056, + "grad_norm": 1.7489057779312134, + "kl": 0.0869140625, + "learning_rate": 5.6875e-07, + "loss": 0.0009, + "reward": 3.9059561491012573, + "reward_std": 0.00622332957573235, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9627758860588074, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9431802928447723, + "step": 346 + }, + { + "completion_length": 173.40625, + "epoch": 1.1088, + "grad_norm": 1.3873649835586548, + "kl": 0.09033203125, + "learning_rate": 5.675e-07, + "loss": 0.0009, + "reward": 3.9297943115234375, + "reward_std": 0.039116960018873215, + "rewards/answer_entity_reward": 0.9826389253139496, + "rewards/answer_wer_reward": 0.9575237333774567, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9896316528320312, + "step": 347 + }, + { + "completion_length": 210.3125, + "epoch": 1.112, + "grad_norm": 3.549527645111084, + "kl": 0.0986328125, + "learning_rate": 5.6625e-07, + "loss": 0.001, + "reward": 3.9249199628829956, + "reward_std": 0.019829558208584785, + "rewards/answer_entity_reward": 0.9842728972434998, + "rewards/answer_wer_reward": 0.9483617842197418, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9922854006290436, + "step": 348 + }, + { + "completion_length": 210.21875, + "epoch": 1.1152, + "grad_norm": 1.7917331457138062, + "kl": 0.0712890625, + "learning_rate": 5.649999999999999e-07, + "loss": 0.0007, + "reward": 3.9333280324935913, + "reward_std": 0.011767172254621983, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9333280622959137, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 349 + }, + { + "completion_length": 220.1875, + "epoch": 1.1184, + "grad_norm": 0.8690351247787476, + "kl": 0.069580078125, + "learning_rate": 5.637499999999999e-07, + "loss": 0.0007, + "reward": 3.9331865310668945, + "reward_std": 0.008595036342740059, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9419363439083099, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9912500977516174, + "step": 350 + }, + { + "completion_length": 192.65625, + "epoch": 1.1216, + "grad_norm": 1.7662582397460938, + "kl": 0.076171875, + "learning_rate": 5.625e-07, + "loss": 0.0008, + "reward": 3.950869083404541, + "reward_std": 0.020245986990630627, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.951172411441803, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996966123580933, + "step": 351 + }, + { + "completion_length": 264.25, + "epoch": 1.1248, + "grad_norm": 6.877583026885986, + "kl": 0.0867919921875, + "learning_rate": 5.6125e-07, + "loss": 0.0009, + "reward": 3.9451229572296143, + "reward_std": 0.017284557223320007, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.946128636598587, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989943504333496, + "step": 352 + }, + { + "completion_length": 218.4375, + "epoch": 1.1280000000000001, + "grad_norm": 1.853745460510254, + "kl": 0.058837890625, + "learning_rate": 5.6e-07, + "loss": 0.0006, + "reward": 3.9474722146987915, + "reward_std": 0.01703261397778988, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9519364535808563, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 353 + }, + { + "completion_length": 229.9375, + "epoch": 1.1312, + "grad_norm": 7.013837814331055, + "kl": 0.079345703125, + "learning_rate": 5.587499999999999e-07, + "loss": 0.0008, + "reward": 3.928715705871582, + "reward_std": 0.024107711389660835, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9372670352458954, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9938524663448334, + "step": 354 + }, + { + "completion_length": 238.09375, + "epoch": 1.1344, + "grad_norm": 1.8181698322296143, + "kl": 0.0587158203125, + "learning_rate": 5.575e-07, + "loss": 0.0006, + "reward": 3.9445427656173706, + "reward_std": 0.028678019531071186, + "rewards/answer_entity_reward": 0.9851190447807312, + "rewards/answer_wer_reward": 0.9630020260810852, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.996421754360199, + "step": 355 + }, + { + "completion_length": 199.46875, + "epoch": 1.1376, + "grad_norm": 17.45456314086914, + "kl": 0.44140625, + "learning_rate": 5.5625e-07, + "loss": 0.0044, + "reward": 3.793405294418335, + "reward_std": 0.09584336914122105, + "rewards/answer_entity_reward": 0.9953208565711975, + "rewards/answer_wer_reward": 0.9546021223068237, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.843482255935669, + "step": 356 + }, + { + "completion_length": 234.9375, + "epoch": 1.1408, + "grad_norm": 1.5193853378295898, + "kl": 0.056396484375, + "learning_rate": 5.55e-07, + "loss": 0.0006, + "reward": 3.9331583976745605, + "reward_std": 0.01793505996465683, + "rewards/answer_entity_reward": 0.9901185929775238, + "rewards/answer_wer_reward": 0.9450170993804932, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980226159095764, + "step": 357 + }, + { + "completion_length": 225.21875, + "epoch": 1.144, + "grad_norm": 0.7461761236190796, + "kl": 0.050048828125, + "learning_rate": 5.5375e-07, + "loss": 0.0005, + "reward": 3.9532158374786377, + "reward_std": 0.013632898684591055, + "rewards/answer_entity_reward": 0.9930555522441864, + "rewards/answer_wer_reward": 0.9601602554321289, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 358 + }, + { + "completion_length": 196.21875, + "epoch": 1.1472, + "grad_norm": 1.688063621520996, + "kl": 0.0589599609375, + "learning_rate": 5.525e-07, + "loss": 0.0006, + "reward": 3.957648277282715, + "reward_std": 0.009953869972378016, + "rewards/answer_entity_reward": 0.9892857074737549, + "rewards/answer_wer_reward": 0.9689917266368866, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993708431720734, + "step": 359 + }, + { + "completion_length": 230.875, + "epoch": 1.1504, + "grad_norm": 1.0592241287231445, + "kl": 0.057861328125, + "learning_rate": 5.5125e-07, + "loss": 0.0006, + "reward": 3.9605822563171387, + "reward_std": 0.00902467966079712, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.961335301399231, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992469847202301, + "step": 360 + }, + { + "completion_length": 177.25, + "epoch": 1.1536, + "grad_norm": 0.887911856174469, + "kl": 0.0631103515625, + "learning_rate": 5.5e-07, + "loss": 0.0006, + "reward": 3.9682934284210205, + "reward_std": 0.004935940261930227, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9682934284210205, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 361 + }, + { + "completion_length": 204.09375, + "epoch": 1.1568, + "grad_norm": 1.4796991348266602, + "kl": 0.0721435546875, + "learning_rate": 5.487499999999999e-07, + "loss": 0.0007, + "reward": 3.967429041862488, + "reward_std": 0.004718436859548092, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.967721164226532, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997079372406006, + "step": 362 + }, + { + "completion_length": 201.90625, + "epoch": 1.16, + "grad_norm": 1.349228858947754, + "kl": 0.0635986328125, + "learning_rate": 5.474999999999999e-07, + "loss": 0.0006, + "reward": 3.968218684196472, + "reward_std": 0.004579245578497648, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9686298072338104, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999588817358017, + "step": 363 + }, + { + "completion_length": 222.25, + "epoch": 1.1632, + "grad_norm": 8.183592796325684, + "kl": 0.7177734375, + "learning_rate": 5.4625e-07, + "loss": 0.0072, + "reward": 3.8565011024475098, + "reward_std": 0.14647854026407003, + "rewards/answer_entity_reward": 0.9628739356994629, + "rewards/answer_wer_reward": 0.897028774023056, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9965982735157013, + "step": 364 + }, + { + "completion_length": 203.875, + "epoch": 1.1663999999999999, + "grad_norm": 2.1804592609405518, + "kl": 0.07666015625, + "learning_rate": 5.45e-07, + "loss": 0.0008, + "reward": 3.9330880641937256, + "reward_std": 0.023633791133761406, + "rewards/answer_entity_reward": 0.9927884340286255, + "rewards/answer_wer_reward": 0.9594465494155884, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9808530211448669, + "step": 365 + }, + { + "completion_length": 187.53125, + "epoch": 1.1696, + "grad_norm": 0.952870786190033, + "kl": 0.068603515625, + "learning_rate": 5.4375e-07, + "loss": 0.0007, + "reward": 3.906123399734497, + "reward_std": 0.02216299483552575, + "rewards/answer_entity_reward": 0.9882478415966034, + "rewards/answer_wer_reward": 0.9373133480548859, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9805622696876526, + "step": 366 + }, + { + "completion_length": 180.28125, + "epoch": 1.1728, + "grad_norm": 1.6601589918136597, + "kl": 0.069091796875, + "learning_rate": 5.425e-07, + "loss": 0.0007, + "reward": 3.9451587200164795, + "reward_std": 0.01368240499868989, + "rewards/answer_entity_reward": 0.9923513829708099, + "rewards/answer_wer_reward": 0.9530614018440247, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997459352016449, + "step": 367 + }, + { + "completion_length": 207.5625, + "epoch": 1.176, + "grad_norm": 2.0661466121673584, + "kl": 0.142578125, + "learning_rate": 5.4125e-07, + "loss": 0.0014, + "reward": 3.9405598640441895, + "reward_std": 0.009340570773929358, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9443033933639526, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9962564706802368, + "step": 368 + }, + { + "completion_length": 193.4375, + "epoch": 1.1792, + "grad_norm": 2.3376078605651855, + "kl": 0.0548095703125, + "learning_rate": 5.4e-07, + "loss": 0.0005, + "reward": 3.9724533557891846, + "reward_std": 0.007678399793803692, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9739435911178589, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985099136829376, + "step": 369 + }, + { + "completion_length": 244.9375, + "epoch": 1.1824, + "grad_norm": 8.994063377380371, + "kl": 0.067138671875, + "learning_rate": 5.387499999999999e-07, + "loss": 0.0007, + "reward": 3.8642784357070923, + "reward_std": 0.015206838492304087, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9453278481960297, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9217914342880249, + "step": 370 + }, + { + "completion_length": 223.5, + "epoch": 1.1856, + "grad_norm": 0.7140876054763794, + "kl": 0.0628662109375, + "learning_rate": 5.374999999999999e-07, + "loss": 0.0006, + "reward": 3.9566755294799805, + "reward_std": 0.008438330609351397, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9571858644485474, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994895756244659, + "step": 371 + }, + { + "completion_length": 236.09375, + "epoch": 1.1888, + "grad_norm": 5.422008514404297, + "kl": 0.072021484375, + "learning_rate": 5.3625e-07, + "loss": 0.0007, + "reward": 3.9092832803726196, + "reward_std": 0.02735153865069151, + "rewards/answer_entity_reward": 0.9869465231895447, + "rewards/answer_wer_reward": 0.9258767068386078, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9964599907398224, + "step": 372 + }, + { + "completion_length": 215.90625, + "epoch": 1.192, + "grad_norm": 2.5449435710906982, + "kl": 0.0655517578125, + "learning_rate": 5.35e-07, + "loss": 0.0007, + "reward": 3.8726375102996826, + "reward_std": 0.15768051333725452, + "rewards/answer_entity_reward": 0.991346150636673, + "rewards/answer_wer_reward": 0.9473030865192413, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9652382135391235, + "step": 373 + }, + { + "completion_length": 221.09375, + "epoch": 1.1952, + "grad_norm": 1.3450181484222412, + "kl": 0.0499267578125, + "learning_rate": 5.3375e-07, + "loss": 0.0005, + "reward": 3.945889711380005, + "reward_std": 0.021359253441914916, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9733871817588806, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9725023210048676, + "step": 374 + }, + { + "completion_length": 208.03125, + "epoch": 1.1984, + "grad_norm": 1.1699227094650269, + "kl": 0.067626953125, + "learning_rate": 5.325e-07, + "loss": 0.0007, + "reward": 3.951171040534973, + "reward_std": 0.008666176348924637, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9543131291866302, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992617964744568, + "step": 375 + }, + { + "completion_length": 253.28125, + "epoch": 1.2016, + "grad_norm": 2.287163496017456, + "kl": 0.0572509765625, + "learning_rate": 5.3125e-07, + "loss": 0.0006, + "reward": 3.9154282808303833, + "reward_std": 0.04354940680786967, + "rewards/answer_entity_reward": 0.9888257682323456, + "rewards/answer_wer_reward": 0.9271413683891296, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994612038135529, + "step": 376 + }, + { + "completion_length": 187.21875, + "epoch": 1.2048, + "grad_norm": 1.3305357694625854, + "kl": 0.046142578125, + "learning_rate": 5.3e-07, + "loss": 0.0005, + "reward": 3.9359636306762695, + "reward_std": 0.00542741478420794, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9541498124599457, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.981813907623291, + "step": 377 + }, + { + "completion_length": 224.125, + "epoch": 1.208, + "grad_norm": 10.12941837310791, + "kl": 0.06201171875, + "learning_rate": 5.2875e-07, + "loss": 0.0006, + "reward": 3.9541337490081787, + "reward_std": 0.013694523833692074, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9624313712120056, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9917024075984955, + "step": 378 + }, + { + "completion_length": 158.96875, + "epoch": 1.2112, + "grad_norm": 1.3805967569351196, + "kl": 0.05859375, + "learning_rate": 5.274999999999999e-07, + "loss": 0.0006, + "reward": 3.947017788887024, + "reward_std": 0.02097574481740594, + "rewards/answer_entity_reward": 0.9902146458625793, + "rewards/answer_wer_reward": 0.961486428976059, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9953167736530304, + "step": 379 + }, + { + "completion_length": 250.40625, + "epoch": 1.2144, + "grad_norm": 1.2120996713638306, + "kl": 0.044921875, + "learning_rate": 5.262499999999999e-07, + "loss": 0.0004, + "reward": 3.918868899345398, + "reward_std": 0.021801823284476995, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9251189529895782, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 380 + }, + { + "completion_length": 211.34375, + "epoch": 1.2176, + "grad_norm": 2.19063138961792, + "kl": 0.078369140625, + "learning_rate": 5.25e-07, + "loss": 0.0008, + "reward": 3.8982889652252197, + "reward_std": 0.02524574287235737, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9512019455432892, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.947086900472641, + "step": 381 + }, + { + "completion_length": 241.28125, + "epoch": 1.2208, + "grad_norm": 1.619989275932312, + "kl": 0.05615234375, + "learning_rate": 5.237500000000001e-07, + "loss": 0.0006, + "reward": 3.9471057653427124, + "reward_std": 0.013869246933609247, + "rewards/answer_entity_reward": 0.9944852888584137, + "rewards/answer_wer_reward": 0.9526203572750092, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 382 + }, + { + "completion_length": 244.875, + "epoch": 1.224, + "grad_norm": 0.8697032928466797, + "kl": 0.061279296875, + "learning_rate": 5.225e-07, + "loss": 0.0006, + "reward": 3.9235615730285645, + "reward_std": 0.015196615364402533, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9275480508804321, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998417317867279, + "step": 383 + }, + { + "completion_length": 191.875, + "epoch": 1.2272, + "grad_norm": 5.2052154541015625, + "kl": 0.06884765625, + "learning_rate": 5.2125e-07, + "loss": 0.0007, + "reward": 3.934178948402405, + "reward_std": 0.024661258328706026, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9814408719539642, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9527381658554077, + "step": 384 + }, + { + "completion_length": 218.15625, + "epoch": 1.2304, + "grad_norm": 1.1718415021896362, + "kl": 0.105224609375, + "learning_rate": 5.2e-07, + "loss": 0.0011, + "reward": 3.8538546562194824, + "reward_std": 0.013242242857813835, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9431050419807434, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9135904610157013, + "step": 385 + }, + { + "completion_length": 167.59375, + "epoch": 1.2336, + "grad_norm": 1.8933672904968262, + "kl": 0.0555419921875, + "learning_rate": 5.1875e-07, + "loss": 0.0006, + "reward": 3.942023754119873, + "reward_std": 0.04039308475330472, + "rewards/answer_entity_reward": 0.9895833432674408, + "rewards/answer_wer_reward": 0.9561411142349243, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9962993562221527, + "step": 386 + }, + { + "completion_length": 181.1875, + "epoch": 1.2368000000000001, + "grad_norm": 1.132387399673462, + "kl": 0.134033203125, + "learning_rate": 5.174999999999999e-07, + "loss": 0.0013, + "reward": 3.883729100227356, + "reward_std": 0.006107622524723411, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9661928117275238, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9175363183021545, + "step": 387 + }, + { + "completion_length": 245.78125, + "epoch": 1.24, + "grad_norm": 1.5286246538162231, + "kl": 0.0439453125, + "learning_rate": 5.162499999999999e-07, + "loss": 0.0004, + "reward": 3.9444308280944824, + "reward_std": 0.017588268965482712, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.951177716255188, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.993253082036972, + "step": 388 + }, + { + "completion_length": 214.5, + "epoch": 1.2432, + "grad_norm": 4.535660266876221, + "kl": 0.4443359375, + "learning_rate": 5.149999999999999e-07, + "loss": 0.0045, + "reward": 3.9712672233581543, + "reward_std": 0.017703328281641006, + "rewards/answer_entity_reward": 0.9923513829708099, + "rewards/answer_wer_reward": 0.9789157509803772, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 389 + }, + { + "completion_length": 237.71875, + "epoch": 1.2464, + "grad_norm": 1.100642204284668, + "kl": 0.0443115234375, + "learning_rate": 5.137500000000001e-07, + "loss": 0.0004, + "reward": 3.9504618644714355, + "reward_std": 0.01717091863974929, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9553267061710358, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995993673801422, + "step": 390 + }, + { + "completion_length": 220.8125, + "epoch": 1.2496, + "grad_norm": 1.8153222799301147, + "kl": 0.050537109375, + "learning_rate": 5.125e-07, + "loss": 0.0005, + "reward": 3.954966902732849, + "reward_std": 0.023467861115932465, + "rewards/answer_entity_reward": 0.9909090995788574, + "rewards/answer_wer_reward": 0.9640579223632812, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 391 + }, + { + "completion_length": 215.75, + "epoch": 1.2528000000000001, + "grad_norm": 1.3607189655303955, + "kl": 0.0562744140625, + "learning_rate": 5.1125e-07, + "loss": 0.0006, + "reward": 3.947434425354004, + "reward_std": 0.01746128685772419, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9514667093753815, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9959677457809448, + "step": 392 + }, + { + "completion_length": 140.75, + "epoch": 1.256, + "grad_norm": 3.343885898590088, + "kl": 0.064208984375, + "learning_rate": 5.1e-07, + "loss": 0.0006, + "reward": 3.9535528421401978, + "reward_std": 0.016743881278671324, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9615642726421356, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9948294758796692, + "step": 393 + }, + { + "completion_length": 225.09375, + "epoch": 1.2591999999999999, + "grad_norm": 7.593709468841553, + "kl": 0.0628662109375, + "learning_rate": 5.0875e-07, + "loss": 0.0006, + "reward": 3.9337310791015625, + "reward_std": 0.01689326297491789, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9342745840549469, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999456524848938, + "step": 394 + }, + { + "completion_length": 195.15625, + "epoch": 1.2624, + "grad_norm": 1.6891230344772339, + "kl": 0.085693359375, + "learning_rate": 5.074999999999999e-07, + "loss": 0.0009, + "reward": 3.836549401283264, + "reward_std": 0.005918985931202769, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.8378467857837677, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987025856971741, + "step": 395 + }, + { + "completion_length": 218.71875, + "epoch": 1.2656, + "grad_norm": 2.0911483764648438, + "kl": 0.057373046875, + "learning_rate": 5.062499999999999e-07, + "loss": 0.0006, + "reward": 3.930617570877075, + "reward_std": 0.014833949506282806, + "rewards/answer_entity_reward": 0.9881944358348846, + "rewards/answer_wer_reward": 0.9436545968055725, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987684786319733, + "step": 396 + }, + { + "completion_length": 244.4375, + "epoch": 1.2688, + "grad_norm": 0.6879564523696899, + "kl": 0.05810546875, + "learning_rate": 5.049999999999999e-07, + "loss": 0.0006, + "reward": 3.9541516304016113, + "reward_std": 0.014136601239442825, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9578942954540253, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983407258987427, + "step": 397 + }, + { + "completion_length": 171.875, + "epoch": 1.272, + "grad_norm": 1.0838266611099243, + "kl": 0.063232421875, + "learning_rate": 5.0375e-07, + "loss": 0.0006, + "reward": 3.961939811706543, + "reward_std": 0.007458951906301081, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9619399607181549, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 398 + }, + { + "completion_length": 224.53125, + "epoch": 1.2752, + "grad_norm": 2.0163495540618896, + "kl": 0.072265625, + "learning_rate": 5.025e-07, + "loss": 0.0007, + "reward": 3.964465856552124, + "reward_std": 0.014243231620639563, + "rewards/answer_entity_reward": 0.9957579076290131, + "rewards/answer_wer_reward": 0.9695450067520142, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991629421710968, + "step": 399 + }, + { + "completion_length": 181.15625, + "epoch": 1.2784, + "grad_norm": 0.38955262303352356, + "kl": 0.0517578125, + "learning_rate": 5.0125e-07, + "loss": 0.0005, + "reward": 3.9557042121887207, + "reward_std": 0.005372793646529317, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9557042419910431, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 400 + }, + { + "completion_length": 208.3125, + "epoch": 1.2816, + "grad_norm": 3.9781861305236816, + "kl": 0.0716552734375, + "learning_rate": 5e-07, + "loss": 0.0007, + "reward": 3.8667571544647217, + "reward_std": 0.015388892497867346, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9693593382835388, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8998015820980072, + "step": 401 + }, + { + "completion_length": 204.375, + "epoch": 1.2848, + "grad_norm": 1.1456544399261475, + "kl": 0.103515625, + "learning_rate": 4.9875e-07, + "loss": 0.001, + "reward": 3.956982374191284, + "reward_std": 0.007417811662890017, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9575175940990448, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994648098945618, + "step": 402 + }, + { + "completion_length": 216.1875, + "epoch": 1.288, + "grad_norm": 1.1664754152297974, + "kl": 0.06396484375, + "learning_rate": 4.975e-07, + "loss": 0.0006, + "reward": 3.8699432611465454, + "reward_std": 0.02020346373319626, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9359997510910034, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.936026930809021, + "step": 403 + }, + { + "completion_length": 253.09375, + "epoch": 1.2912, + "grad_norm": 0.8103052377700806, + "kl": 0.0635986328125, + "learning_rate": 4.9625e-07, + "loss": 0.0006, + "reward": 3.937591075897217, + "reward_std": 0.018769525457173586, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9415221214294434, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989098608493805, + "step": 404 + }, + { + "completion_length": 215.0625, + "epoch": 1.2944, + "grad_norm": 1.4777588844299316, + "kl": 0.068603515625, + "learning_rate": 4.95e-07, + "loss": 0.0007, + "reward": 3.949555516242981, + "reward_std": 0.009917980059981346, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.949555516242981, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 405 + }, + { + "completion_length": 202.65625, + "epoch": 1.2976, + "grad_norm": 0.7443984150886536, + "kl": 0.106689453125, + "learning_rate": 4.9375e-07, + "loss": 0.0011, + "reward": 3.7686209678649902, + "reward_std": 0.011178261134773493, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9451543390750885, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8234666287899017, + "step": 406 + }, + { + "completion_length": 189.21875, + "epoch": 1.3008, + "grad_norm": 0.9547207951545715, + "kl": 0.077392578125, + "learning_rate": 4.924999999999999e-07, + "loss": 0.0008, + "reward": 3.9593130350112915, + "reward_std": 0.006907296134158969, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9597530961036682, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995598495006561, + "step": 407 + }, + { + "completion_length": 208.09375, + "epoch": 1.304, + "grad_norm": 0.8897162079811096, + "kl": 0.0604248046875, + "learning_rate": 4.9125e-07, + "loss": 0.0006, + "reward": 3.9529693126678467, + "reward_std": 0.0038969104643911123, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9714880287647247, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9814814925193787, + "step": 408 + }, + { + "completion_length": 199.78125, + "epoch": 1.3072, + "grad_norm": 1.1945850849151611, + "kl": 0.056640625, + "learning_rate": 4.9e-07, + "loss": 0.0006, + "reward": 3.951330065727234, + "reward_std": 0.0060545760206878185, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9513299763202667, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 409 + }, + { + "completion_length": 176.5, + "epoch": 1.3104, + "grad_norm": 1.5717577934265137, + "kl": 0.085205078125, + "learning_rate": 4.8875e-07, + "loss": 0.0009, + "reward": 3.9731186628341675, + "reward_std": 0.009643410099670291, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9749214053153992, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9981971085071564, + "step": 410 + }, + { + "completion_length": 209.25, + "epoch": 1.3136, + "grad_norm": 1.7357205152511597, + "kl": 0.05517578125, + "learning_rate": 4.875e-07, + "loss": 0.0006, + "reward": 3.9563956260681152, + "reward_std": 0.013218061067163944, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9563955068588257, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 411 + }, + { + "completion_length": 233.28125, + "epoch": 1.3168, + "grad_norm": 3.6717629432678223, + "kl": 0.070068359375, + "learning_rate": 4.8625e-07, + "loss": 0.0007, + "reward": 3.955284357070923, + "reward_std": 0.02536593284457922, + "rewards/answer_entity_reward": 0.9871794581413269, + "rewards/answer_wer_reward": 0.968104898929596, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 412 + }, + { + "completion_length": 205.125, + "epoch": 1.32, + "grad_norm": 1.0453362464904785, + "kl": 0.04473876953125, + "learning_rate": 4.85e-07, + "loss": 0.0005, + "reward": 3.9507482051849365, + "reward_std": 0.005348393111489713, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9646830558776855, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9860649704933167, + "step": 413 + }, + { + "completion_length": 197.71875, + "epoch": 1.3232, + "grad_norm": 10.967116355895996, + "kl": 0.4443359375, + "learning_rate": 4.8375e-07, + "loss": 0.0044, + "reward": 3.958775758743286, + "reward_std": 0.01469768793322146, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9608590006828308, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 414 + }, + { + "completion_length": 240.75, + "epoch": 1.3264, + "grad_norm": 1.771857738494873, + "kl": 0.056884765625, + "learning_rate": 4.824999999999999e-07, + "loss": 0.0006, + "reward": 3.9307100772857666, + "reward_std": 0.01262786379083991, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9445989429950714, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 415 + }, + { + "completion_length": 184.9375, + "epoch": 1.3296000000000001, + "grad_norm": 0.5742409825325012, + "kl": 0.081787109375, + "learning_rate": 4.812499999999999e-07, + "loss": 0.0008, + "reward": 3.965754270553589, + "reward_std": 0.003614649409428239, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9657542705535889, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 416 + }, + { + "completion_length": 173.90625, + "epoch": 1.3328, + "grad_norm": 1.4033151865005493, + "kl": 0.074462890625, + "learning_rate": 4.8e-07, + "loss": 0.0007, + "reward": 3.9543731212615967, + "reward_std": 0.006403392762877047, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9728915691375732, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9814814925193787, + "step": 417 + }, + { + "completion_length": 224.0625, + "epoch": 1.336, + "grad_norm": 1.0427494049072266, + "kl": 0.0576171875, + "learning_rate": 4.7875e-07, + "loss": 0.0006, + "reward": 3.965309262275696, + "reward_std": 0.011804148089140654, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9667502641677856, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985590577125549, + "step": 418 + }, + { + "completion_length": 228.53125, + "epoch": 1.3392, + "grad_norm": 1.1613246202468872, + "kl": 0.06591796875, + "learning_rate": 4.775e-07, + "loss": 0.0007, + "reward": 3.948023200035095, + "reward_std": 0.012544674333184958, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9482711553573608, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997519850730896, + "step": 419 + }, + { + "completion_length": 197.34375, + "epoch": 1.3424, + "grad_norm": 0.8760451674461365, + "kl": 0.072265625, + "learning_rate": 4.7625e-07, + "loss": 0.0007, + "reward": 3.938261866569519, + "reward_std": 0.004269103752449155, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.9496253132820129, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 420 + }, + { + "completion_length": 225.5, + "epoch": 1.3456000000000001, + "grad_norm": 2.4799275398254395, + "kl": 0.1290283203125, + "learning_rate": 4.7499999999999995e-07, + "loss": 0.0013, + "reward": 3.9379055500030518, + "reward_std": 0.008256121072918177, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9677021205425262, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9702034592628479, + "step": 421 + }, + { + "completion_length": 209.3125, + "epoch": 1.3488, + "grad_norm": 0.6864319443702698, + "kl": 0.0604248046875, + "learning_rate": 4.7374999999999996e-07, + "loss": 0.0006, + "reward": 3.9712308645248413, + "reward_std": 0.0032088530133478343, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9722216725349426, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990091919898987, + "step": 422 + }, + { + "completion_length": 187.5625, + "epoch": 1.3519999999999999, + "grad_norm": 1.9412598609924316, + "kl": 0.06787109375, + "learning_rate": 4.725e-07, + "loss": 0.0007, + "reward": 3.947052240371704, + "reward_std": 0.014190569054335356, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9569187760353088, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9925373196601868, + "step": 423 + }, + { + "completion_length": 225.59375, + "epoch": 1.3552, + "grad_norm": 1.4452259540557861, + "kl": 0.09619140625, + "learning_rate": 4.7125e-07, + "loss": 0.001, + "reward": 3.939266562461853, + "reward_std": 0.012853712774813175, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9556125402450562, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9860578179359436, + "step": 424 + }, + { + "completion_length": 261.0, + "epoch": 1.3584, + "grad_norm": 0.9420474171638489, + "kl": 0.054931640625, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.0006, + "reward": 3.939144253730774, + "reward_std": 0.00785708031617105, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9474774897098541, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 425 + }, + { + "completion_length": 243.1875, + "epoch": 1.3616, + "grad_norm": 1.1776657104492188, + "kl": 0.078369140625, + "learning_rate": 4.6874999999999996e-07, + "loss": 0.0008, + "reward": 3.928247570991516, + "reward_std": 0.02044426929205656, + "rewards/answer_entity_reward": 0.995192289352417, + "rewards/answer_wer_reward": 0.9401307106018066, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9929245114326477, + "step": 426 + }, + { + "completion_length": 204.4375, + "epoch": 1.3648, + "grad_norm": 1.6268881559371948, + "kl": 0.073974609375, + "learning_rate": 4.675e-07, + "loss": 0.0007, + "reward": 3.9266600608825684, + "reward_std": 0.006853222264908254, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9440751671791077, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9825847446918488, + "step": 427 + }, + { + "completion_length": 232.0625, + "epoch": 1.3679999999999999, + "grad_norm": 34.5067138671875, + "kl": 0.755859375, + "learning_rate": 4.6625e-07, + "loss": 0.0076, + "reward": 3.844196319580078, + "reward_std": 0.04641831433400512, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9399954378604889, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9042008221149445, + "step": 428 + }, + { + "completion_length": 253.21875, + "epoch": 1.3712, + "grad_norm": 1.4444057941436768, + "kl": 0.0673828125, + "learning_rate": 4.65e-07, + "loss": 0.0007, + "reward": 3.963658928871155, + "reward_std": 0.009957378264516592, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9636587798595428, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 429 + }, + { + "completion_length": 241.875, + "epoch": 1.3744, + "grad_norm": 0.9258720278739929, + "kl": 0.0687255859375, + "learning_rate": 4.6374999999999995e-07, + "loss": 0.0007, + "reward": 3.9617748260498047, + "reward_std": 0.013449362479150295, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9652469456195831, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 430 + }, + { + "completion_length": 204.96875, + "epoch": 1.3776, + "grad_norm": 1.6328847408294678, + "kl": 0.0863037109375, + "learning_rate": 4.625e-07, + "loss": 0.0009, + "reward": 3.8922348022460938, + "reward_std": 0.007920752046629786, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9477903544902802, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9444444477558136, + "step": 431 + }, + { + "completion_length": 222.375, + "epoch": 1.3808, + "grad_norm": 2.479295492172241, + "kl": 0.0732421875, + "learning_rate": 4.6125e-07, + "loss": 0.0007, + "reward": 3.9312403202056885, + "reward_std": 0.02260798867791891, + "rewards/answer_entity_reward": 0.9941239356994629, + "rewards/answer_wer_reward": 0.937116414308548, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 432 + }, + { + "completion_length": 203.28125, + "epoch": 1.384, + "grad_norm": 2.6669020652770996, + "kl": 0.0631103515625, + "learning_rate": 4.6e-07, + "loss": 0.0006, + "reward": 3.938199043273926, + "reward_std": 0.014480275101959705, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9408722817897797, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997305870056152, + "step": 433 + }, + { + "completion_length": 255.1875, + "epoch": 1.3872, + "grad_norm": 1.4742846488952637, + "kl": 0.057373046875, + "learning_rate": 4.5874999999999995e-07, + "loss": 0.0006, + "reward": 3.9382212162017822, + "reward_std": 0.01696724910289049, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.94236820936203, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9982567131519318, + "step": 434 + }, + { + "completion_length": 211.15625, + "epoch": 1.3904, + "grad_norm": 1.795336365699768, + "kl": 0.0667724609375, + "learning_rate": 4.575e-07, + "loss": 0.0007, + "reward": 3.919999361038208, + "reward_std": 0.028288409113883972, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9725300371646881, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9474693238735199, + "step": 435 + }, + { + "completion_length": 208.65625, + "epoch": 1.3936, + "grad_norm": 2.1704065799713135, + "kl": 0.095947265625, + "learning_rate": 4.5624999999999997e-07, + "loss": 0.001, + "reward": 3.857280731201172, + "reward_std": 0.2144411588087678, + "rewards/answer_entity_reward": 0.9618055820465088, + "rewards/answer_wer_reward": 0.949828714132309, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9768964946269989, + "step": 436 + }, + { + "completion_length": 194.9375, + "epoch": 1.3968, + "grad_norm": 3.8814220428466797, + "kl": 0.082275390625, + "learning_rate": 4.55e-07, + "loss": 0.0008, + "reward": 3.941987633705139, + "reward_std": 0.015088737476617098, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.94545978307724, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 437 + }, + { + "completion_length": 217.5, + "epoch": 1.4, + "grad_norm": 1.3024876117706299, + "kl": 0.0389404296875, + "learning_rate": 4.5374999999999994e-07, + "loss": 0.0004, + "reward": 3.950901508331299, + "reward_std": 0.008365771966055036, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9589883685112, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9919130802154541, + "step": 438 + }, + { + "completion_length": 159.03125, + "epoch": 1.4032, + "grad_norm": 0.272270530462265, + "kl": 0.0396728515625, + "learning_rate": 4.525e-07, + "loss": 0.0004, + "reward": 3.9221452474594116, + "reward_std": 0.0014547138416673988, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9875754117965698, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9429032206535339, + "step": 439 + }, + { + "completion_length": 200.28125, + "epoch": 1.4064, + "grad_norm": 5.4578399658203125, + "kl": 0.0828857421875, + "learning_rate": 4.5124999999999997e-07, + "loss": 0.0008, + "reward": 3.9259976148605347, + "reward_std": 0.014895747415721416, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9536634683609009, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9758064448833466, + "step": 440 + }, + { + "completion_length": 229.1875, + "epoch": 1.4096, + "grad_norm": 0.6568198800086975, + "kl": 0.067138671875, + "learning_rate": 4.5e-07, + "loss": 0.0007, + "reward": 3.9455034732818604, + "reward_std": 0.011267438880167902, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9479073286056519, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 441 + }, + { + "completion_length": 199.3125, + "epoch": 1.4128, + "grad_norm": 1.0056089162826538, + "kl": 0.0567626953125, + "learning_rate": 4.4874999999999994e-07, + "loss": 0.0006, + "reward": 3.9622955322265625, + "reward_std": 0.008431105175986886, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9622955024242401, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 442 + }, + { + "completion_length": 212.375, + "epoch": 1.416, + "grad_norm": 0.7950085997581482, + "kl": 0.051025390625, + "learning_rate": 4.475e-07, + "loss": 0.0005, + "reward": 3.9517738819122314, + "reward_std": 0.03710572328418493, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9708344638347626, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9809393286705017, + "step": 443 + }, + { + "completion_length": 227.71875, + "epoch": 1.4192, + "grad_norm": 0.8971355557441711, + "kl": 0.0460205078125, + "learning_rate": 4.4624999999999996e-07, + "loss": 0.0005, + "reward": 3.980188012123108, + "reward_std": 0.00624943315051496, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9801879525184631, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 444 + }, + { + "completion_length": 226.78125, + "epoch": 1.4224, + "grad_norm": 2.114032745361328, + "kl": 0.0791015625, + "learning_rate": 4.45e-07, + "loss": 0.0008, + "reward": 3.879195213317871, + "reward_std": 0.03936337144114077, + "rewards/answer_entity_reward": 0.9981617629528046, + "rewards/answer_wer_reward": 0.9502902626991272, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9307432472705841, + "step": 445 + }, + { + "completion_length": 227.875, + "epoch": 1.4256, + "grad_norm": 1.0065126419067383, + "kl": 0.083984375, + "learning_rate": 4.4374999999999993e-07, + "loss": 0.0009, + "reward": 3.939829707145691, + "reward_std": 0.013783617876470089, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9398296475410461, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 446 + }, + { + "completion_length": 202.6875, + "epoch": 1.4288, + "grad_norm": 1.7568168640136719, + "kl": 0.0418701171875, + "learning_rate": 4.425e-07, + "loss": 0.0004, + "reward": 3.943518042564392, + "reward_std": 0.016201740596443415, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9520406126976013, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 447 + }, + { + "completion_length": 172.8125, + "epoch": 1.432, + "grad_norm": 1.0688170194625854, + "kl": 0.0494384765625, + "learning_rate": 4.4124999999999996e-07, + "loss": 0.0005, + "reward": 3.7196162939071655, + "reward_std": 0.006592530757188797, + "rewards/answer_entity_reward": 0.8677884340286255, + "rewards/answer_wer_reward": 0.8742637634277344, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9775640964508057, + "step": 448 + }, + { + "completion_length": 168.59375, + "epoch": 1.4352, + "grad_norm": 1.7712996006011963, + "kl": 0.0435791015625, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.0004, + "reward": 3.8386131525039673, + "reward_std": 0.011066187638789415, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.8386130630970001, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 449 + }, + { + "completion_length": 197.40625, + "epoch": 1.4384000000000001, + "grad_norm": 0.8872710466384888, + "kl": 0.058349609375, + "learning_rate": 4.3874999999999993e-07, + "loss": 0.0006, + "reward": 3.7988067865371704, + "reward_std": 0.03104257071390748, + "rewards/answer_entity_reward": 0.9734432399272919, + "rewards/answer_wer_reward": 0.8270545899868011, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983089566230774, + "step": 450 + }, + { + "completion_length": 178.40625, + "epoch": 1.4416, + "grad_norm": 6.044506072998047, + "kl": 0.0657958984375, + "learning_rate": 4.375e-07, + "loss": 0.0007, + "reward": 3.9419833421707153, + "reward_std": 0.021156481467187405, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9676234424114227, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9778319895267487, + "step": 451 + }, + { + "completion_length": 201.8125, + "epoch": 1.4447999999999999, + "grad_norm": 0.7943681478500366, + "kl": 0.0511474609375, + "learning_rate": 4.3625e-07, + "loss": 0.0005, + "reward": 3.956661581993103, + "reward_std": 0.007463611662387848, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9675310552120209, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.989130437374115, + "step": 452 + }, + { + "completion_length": 219.03125, + "epoch": 1.448, + "grad_norm": 1.069403052330017, + "kl": 0.0570068359375, + "learning_rate": 4.3499999999999996e-07, + "loss": 0.0006, + "reward": 3.9562065601348877, + "reward_std": 0.011006501503288746, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9564736187458038, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997329115867615, + "step": 453 + }, + { + "completion_length": 206.8125, + "epoch": 1.4512, + "grad_norm": 1.0987451076507568, + "kl": 0.0611572265625, + "learning_rate": 4.3375000000000003e-07, + "loss": 0.0006, + "reward": 3.9423000812530518, + "reward_std": 0.01284673297777772, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9693345129489899, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9758064448833466, + "step": 454 + }, + { + "completion_length": 211.375, + "epoch": 1.4544000000000001, + "grad_norm": 3.5896220207214355, + "kl": 0.065673828125, + "learning_rate": 4.325e-07, + "loss": 0.0007, + "reward": 3.961179494857788, + "reward_std": 0.012267218437045813, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9640858769416809, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.99709352850914, + "step": 455 + }, + { + "completion_length": 238.8125, + "epoch": 1.4576, + "grad_norm": 0.625076174736023, + "kl": 0.0399169921875, + "learning_rate": 4.3125e-07, + "loss": 0.0004, + "reward": 3.9661307334899902, + "reward_std": 0.013454007916152477, + "rewards/answer_entity_reward": 0.9958333373069763, + "rewards/answer_wer_reward": 0.9702973961830139, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 456 + }, + { + "completion_length": 206.9375, + "epoch": 1.4607999999999999, + "grad_norm": 0.6369054317474365, + "kl": 0.059814453125, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.0006, + "reward": 3.9704521894454956, + "reward_std": 0.006653362594079226, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9733729660511017, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9970792829990387, + "step": 457 + }, + { + "completion_length": 199.625, + "epoch": 1.464, + "grad_norm": 1.2201271057128906, + "kl": 0.083251953125, + "learning_rate": 4.2875e-07, + "loss": 0.0008, + "reward": 3.967539429664612, + "reward_std": 0.012669337913393974, + "rewards/answer_entity_reward": 0.9927884340286255, + "rewards/answer_wer_reward": 0.9747509360313416, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 458 + }, + { + "completion_length": 220.0, + "epoch": 1.4672, + "grad_norm": 11.574130058288574, + "kl": 0.2125244140625, + "learning_rate": 4.275e-07, + "loss": 0.0021, + "reward": 3.9735381603240967, + "reward_std": 0.0033322512172162533, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9737901091575623, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997479915618896, + "step": 459 + }, + { + "completion_length": 181.0625, + "epoch": 1.4704, + "grad_norm": 1.050900936126709, + "kl": 0.0736083984375, + "learning_rate": 4.2625e-07, + "loss": 0.0007, + "reward": 3.9467893838882446, + "reward_std": 0.00827464903704822, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9717220067977905, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9750673770904541, + "step": 460 + }, + { + "completion_length": 207.4375, + "epoch": 1.4736, + "grad_norm": 1.25560462474823, + "kl": 0.07861328125, + "learning_rate": 4.2499999999999995e-07, + "loss": 0.0008, + "reward": 3.885838508605957, + "reward_std": 0.012714273296296597, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9541967213153839, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9316417276859283, + "step": 461 + }, + { + "completion_length": 205.125, + "epoch": 1.4768, + "grad_norm": 2.1235697269439697, + "kl": 0.064208984375, + "learning_rate": 4.2375e-07, + "loss": 0.0006, + "reward": 3.952380895614624, + "reward_std": 0.013835938647389412, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9538231492042542, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985576868057251, + "step": 462 + }, + { + "completion_length": 229.25, + "epoch": 1.48, + "grad_norm": 3.838672399520874, + "kl": 0.09619140625, + "learning_rate": 4.225e-07, + "loss": 0.001, + "reward": 3.9537363052368164, + "reward_std": 0.014287983998656273, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9542993903160095, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994369447231293, + "step": 463 + }, + { + "completion_length": 224.71875, + "epoch": 1.4832, + "grad_norm": 0.7103460431098938, + "kl": 0.058837890625, + "learning_rate": 4.2125e-07, + "loss": 0.0006, + "reward": 3.9675354957580566, + "reward_std": 0.013558031525462866, + "rewards/answer_entity_reward": 0.9958333373069763, + "rewards/answer_wer_reward": 0.9719286262989044, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997735619544983, + "step": 464 + }, + { + "completion_length": 147.625, + "epoch": 1.4864, + "grad_norm": 2.865051031112671, + "kl": 0.099853515625, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.001, + "reward": 3.958040475845337, + "reward_std": 0.00422883324790746, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9780724942684174, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9799679517745972, + "step": 465 + }, + { + "completion_length": 250.625, + "epoch": 1.4896, + "grad_norm": 1.115330696105957, + "kl": 0.062744140625, + "learning_rate": 4.1875e-07, + "loss": 0.0006, + "reward": 3.925747871398926, + "reward_std": 0.01510471198707819, + "rewards/answer_entity_reward": 0.9895833134651184, + "rewards/answer_wer_reward": 0.9361644089221954, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 466 + }, + { + "completion_length": 181.28125, + "epoch": 1.4928, + "grad_norm": 0.8615334033966064, + "kl": 0.095703125, + "learning_rate": 4.1749999999999997e-07, + "loss": 0.001, + "reward": 3.9389272928237915, + "reward_std": 0.009215079713612795, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.947648286819458, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9912790656089783, + "step": 467 + }, + { + "completion_length": 201.1875, + "epoch": 1.496, + "grad_norm": 0.8399393558502197, + "kl": 0.067138671875, + "learning_rate": 4.1625e-07, + "loss": 0.0007, + "reward": 3.9645369052886963, + "reward_std": 0.005296911578625441, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9660760462284088, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984607994556427, + "step": 468 + }, + { + "completion_length": 181.53125, + "epoch": 1.4992, + "grad_norm": 1.692581057548523, + "kl": 0.116455078125, + "learning_rate": 4.1499999999999994e-07, + "loss": 0.0012, + "reward": 3.91774320602417, + "reward_std": 0.007862454745918512, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9589084982872009, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.958834707736969, + "step": 469 + }, + { + "completion_length": 208.375, + "epoch": 1.5024, + "grad_norm": 1.0280638933181763, + "kl": 0.0733642578125, + "learning_rate": 4.1375e-07, + "loss": 0.0007, + "reward": 3.963421940803528, + "reward_std": 0.010574808926321566, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9634219110012054, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 470 + }, + { + "completion_length": 194.375, + "epoch": 1.5056, + "grad_norm": 0.9556618332862854, + "kl": 0.04541015625, + "learning_rate": 4.1249999999999997e-07, + "loss": 0.0005, + "reward": 3.9483964443206787, + "reward_std": 0.0071337176486849785, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9483965635299683, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 471 + }, + { + "completion_length": 219.90625, + "epoch": 1.5088, + "grad_norm": 8.583925247192383, + "kl": 0.057373046875, + "learning_rate": 4.1125e-07, + "loss": 0.0006, + "reward": 3.9298593997955322, + "reward_std": 0.010127428220584989, + "rewards/answer_entity_reward": 0.9764957129955292, + "rewards/answer_wer_reward": 0.9549680352210999, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983957409858704, + "step": 472 + }, + { + "completion_length": 169.71875, + "epoch": 1.512, + "grad_norm": 1.0506740808486938, + "kl": 0.0703125, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.0007, + "reward": 3.9712518453598022, + "reward_std": 0.004299861378967762, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9712517857551575, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 473 + }, + { + "completion_length": 254.0, + "epoch": 1.5152, + "grad_norm": 1.2391588687896729, + "kl": 0.055419921875, + "learning_rate": 4.0875e-07, + "loss": 0.0006, + "reward": 3.9443717002868652, + "reward_std": 0.007719833869487047, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9459867179393768, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983848929405212, + "step": 474 + }, + { + "completion_length": 173.15625, + "epoch": 1.5184, + "grad_norm": 21.967166900634766, + "kl": 0.0810546875, + "learning_rate": 4.0749999999999996e-07, + "loss": 0.0008, + "reward": 3.892626404762268, + "reward_std": 0.03193977475166321, + "rewards/answer_entity_reward": 0.9926470518112183, + "rewards/answer_wer_reward": 0.9627694487571716, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9372097849845886, + "step": 475 + }, + { + "completion_length": 177.96875, + "epoch": 1.5215999999999998, + "grad_norm": 2.125126838684082, + "kl": 0.0814208984375, + "learning_rate": 4.0625e-07, + "loss": 0.0008, + "reward": 3.957445502281189, + "reward_std": 0.016827338375151157, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9618943929672241, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990234375, + "step": 476 + }, + { + "completion_length": 259.34375, + "epoch": 1.5248, + "grad_norm": 1.144234538078308, + "kl": 0.0545654296875, + "learning_rate": 4.05e-07, + "loss": 0.0005, + "reward": 3.9333302974700928, + "reward_std": 0.015490441583096981, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9336776435375214, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999652773141861, + "step": 477 + }, + { + "completion_length": 223.84375, + "epoch": 1.528, + "grad_norm": 0.8379483222961426, + "kl": 0.0653076171875, + "learning_rate": 4.0375e-07, + "loss": 0.0007, + "reward": 3.9397594928741455, + "reward_std": 0.006189712788909674, + "rewards/answer_entity_reward": 0.9926470518112183, + "rewards/answer_wer_reward": 0.9652985334396362, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.981813907623291, + "step": 478 + }, + { + "completion_length": 195.15625, + "epoch": 1.5312000000000001, + "grad_norm": 1.9627622365951538, + "kl": 0.0709228515625, + "learning_rate": 4.025e-07, + "loss": 0.0007, + "reward": 3.90268337726593, + "reward_std": 0.022933244705200195, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9422430694103241, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9632812440395355, + "step": 479 + }, + { + "completion_length": 212.03125, + "epoch": 1.5344, + "grad_norm": 1.4353668689727783, + "kl": 0.0572509765625, + "learning_rate": 4.0124999999999997e-07, + "loss": 0.0006, + "reward": 3.955712080001831, + "reward_std": 0.004905138397589326, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9653275012969971, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 480 + }, + { + "completion_length": 238.125, + "epoch": 1.5375999999999999, + "grad_norm": 0.9400500059127808, + "kl": 0.0516357421875, + "learning_rate": 4e-07, + "loss": 0.0005, + "reward": 3.9561740159988403, + "reward_std": 0.004761199816130102, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9657893478870392, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 481 + }, + { + "completion_length": 197.125, + "epoch": 1.5408, + "grad_norm": 1.7909142971038818, + "kl": 0.044677734375, + "learning_rate": 3.9875e-07, + "loss": 0.0004, + "reward": 3.9649877548217773, + "reward_std": 0.008824507240206003, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9712709188461304, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.993716835975647, + "step": 482 + }, + { + "completion_length": 247.28125, + "epoch": 1.544, + "grad_norm": 1.305432915687561, + "kl": 0.0885009765625, + "learning_rate": 3.975e-07, + "loss": 0.0009, + "reward": 3.9271016120910645, + "reward_std": 0.010741112288087606, + "rewards/answer_entity_reward": 0.9867424368858337, + "rewards/answer_wer_reward": 0.9422920942306519, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980670213699341, + "step": 483 + }, + { + "completion_length": 183.71875, + "epoch": 1.5472000000000001, + "grad_norm": 1.2143511772155762, + "kl": 0.083251953125, + "learning_rate": 3.9624999999999996e-07, + "loss": 0.0008, + "reward": 3.961517810821533, + "reward_std": 0.015109732514247298, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9659819006919861, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 484 + }, + { + "completion_length": 190.96875, + "epoch": 1.5504, + "grad_norm": 1.3901034593582153, + "kl": 0.0478515625, + "learning_rate": 3.95e-07, + "loss": 0.0005, + "reward": 3.9620405435562134, + "reward_std": 0.007438812637701631, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.962040513753891, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 485 + }, + { + "completion_length": 236.71875, + "epoch": 1.5535999999999999, + "grad_norm": 1.005139946937561, + "kl": 0.064697265625, + "learning_rate": 3.9375e-07, + "loss": 0.0007, + "reward": 3.9681735038757324, + "reward_std": 0.007598390802741051, + "rewards/answer_entity_reward": 0.9981617629528046, + "rewards/answer_wer_reward": 0.9703975021839142, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996141791343689, + "step": 486 + }, + { + "completion_length": 167.71875, + "epoch": 1.5568, + "grad_norm": 14.769695281982422, + "kl": 0.088623046875, + "learning_rate": 3.925e-07, + "loss": 0.0009, + "reward": 3.9402579069137573, + "reward_std": 0.01711948262527585, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9504852592945099, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9897727370262146, + "step": 487 + }, + { + "completion_length": 245.59375, + "epoch": 1.56, + "grad_norm": 2.1311302185058594, + "kl": 0.0643310546875, + "learning_rate": 3.9124999999999996e-07, + "loss": 0.0006, + "reward": 3.965644121170044, + "reward_std": 0.006802293471992016, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9664610624313354, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991829991340637, + "step": 488 + }, + { + "completion_length": 228.90625, + "epoch": 1.5632000000000001, + "grad_norm": 2.194638967514038, + "kl": 0.07861328125, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.0008, + "reward": 3.940732479095459, + "reward_std": 0.00845141801983118, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9496362805366516, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994294047355652, + "step": 489 + }, + { + "completion_length": 229.09375, + "epoch": 1.5664, + "grad_norm": 1.4338947534561157, + "kl": 0.067138671875, + "learning_rate": 3.8875e-07, + "loss": 0.0007, + "reward": 3.974826216697693, + "reward_std": 0.008368036011233926, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9759277105331421, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988985061645508, + "step": 490 + }, + { + "completion_length": 147.1875, + "epoch": 1.5695999999999999, + "grad_norm": 0.9500789046287537, + "kl": 0.055908203125, + "learning_rate": 3.875e-07, + "loss": 0.0006, + "reward": 3.900749683380127, + "reward_std": 0.004976645112037659, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.981389045715332, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9307242631912231, + "step": 491 + }, + { + "completion_length": 207.1875, + "epoch": 1.5728, + "grad_norm": 18.29888916015625, + "kl": 0.0787353515625, + "learning_rate": 3.8624999999999995e-07, + "loss": 0.0008, + "reward": 3.9231996536254883, + "reward_std": 0.01712162047624588, + "rewards/answer_entity_reward": 0.9963235259056091, + "rewards/answer_wer_reward": 0.9278469979763031, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990289807319641, + "step": 492 + }, + { + "completion_length": 215.3125, + "epoch": 1.576, + "grad_norm": 2.524644613265991, + "kl": 0.0682373046875, + "learning_rate": 3.8499999999999997e-07, + "loss": 0.0007, + "reward": 3.9182220697402954, + "reward_std": 0.028343133628368378, + "rewards/answer_entity_reward": 0.9899839758872986, + "rewards/answer_wer_reward": 0.9533904790878296, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9748475551605225, + "step": 493 + }, + { + "completion_length": 205.21875, + "epoch": 1.5792000000000002, + "grad_norm": 0.8041574954986572, + "kl": 0.0572509765625, + "learning_rate": 3.8375e-07, + "loss": 0.0006, + "reward": 3.9712276458740234, + "reward_std": 0.006993145681917667, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9721719622612, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990555644035339, + "step": 494 + }, + { + "completion_length": 245.84375, + "epoch": 1.5824, + "grad_norm": 1.4723294973373413, + "kl": 0.0518798828125, + "learning_rate": 3.825e-07, + "loss": 0.0005, + "reward": 3.9171528816223145, + "reward_std": 0.007540189428254962, + "rewards/answer_entity_reward": 0.9707792401313782, + "rewards/answer_wer_reward": 0.9463737607002258, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 495 + }, + { + "completion_length": 191.1875, + "epoch": 1.5856, + "grad_norm": 5.778710842132568, + "kl": 0.095703125, + "learning_rate": 3.8124999999999995e-07, + "loss": 0.001, + "reward": 3.7989085912704468, + "reward_std": 0.02309321239590645, + "rewards/answer_entity_reward": 0.9837072491645813, + "rewards/answer_wer_reward": 0.9482426345348358, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.866958737373352, + "step": 496 + }, + { + "completion_length": 164.375, + "epoch": 1.5888, + "grad_norm": 3.773331880569458, + "kl": 0.0452880859375, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.0005, + "reward": 3.957179307937622, + "reward_std": 0.03012340608984232, + "rewards/answer_entity_reward": 0.995192289352417, + "rewards/answer_wer_reward": 0.9724558889865875, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9895310997962952, + "step": 497 + }, + { + "completion_length": 190.34375, + "epoch": 1.592, + "grad_norm": 1.7698373794555664, + "kl": 0.0579833984375, + "learning_rate": 3.7875e-07, + "loss": 0.0006, + "reward": 3.9473685026168823, + "reward_std": 0.009419793263077736, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9480363428592682, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993322789669037, + "step": 498 + }, + { + "completion_length": 223.03125, + "epoch": 1.5952, + "grad_norm": 1.197536587715149, + "kl": 0.074462890625, + "learning_rate": 3.775e-07, + "loss": 0.0007, + "reward": 3.9201695919036865, + "reward_std": 0.012398123741149902, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9409077167510986, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9931506812572479, + "step": 499 + }, + { + "completion_length": 204.46875, + "epoch": 1.5984, + "grad_norm": 1.5246530771255493, + "kl": 0.0849609375, + "learning_rate": 3.7624999999999994e-07, + "loss": 0.0008, + "reward": 3.9556870460510254, + "reward_std": 0.010473677422851324, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9580392241477966, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9976478517055511, + "step": 500 + }, + { + "completion_length": 230.0625, + "epoch": 1.6016, + "grad_norm": 1.1340093612670898, + "kl": 0.10595703125, + "learning_rate": 3.75e-07, + "loss": 0.0011, + "reward": 3.9659206867218018, + "reward_std": 0.008191006258130074, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9659207165241241, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 501 + }, + { + "completion_length": 185.15625, + "epoch": 1.6048, + "grad_norm": 1.2874914407730103, + "kl": 0.045654296875, + "learning_rate": 3.7375e-07, + "loss": 0.0005, + "reward": 3.9568817615509033, + "reward_std": 0.011238863109610975, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9603540003299713, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 502 + }, + { + "completion_length": 241.78125, + "epoch": 1.608, + "grad_norm": 0.9499295353889465, + "kl": 0.0531005859375, + "learning_rate": 3.725e-07, + "loss": 0.0005, + "reward": 3.9388747215270996, + "reward_std": 0.008348907809704542, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.9510295391082764, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9992088675498962, + "step": 503 + }, + { + "completion_length": 233.25, + "epoch": 1.6112, + "grad_norm": 1.0857101678848267, + "kl": 0.062744140625, + "learning_rate": 3.7125e-07, + "loss": 0.0006, + "reward": 3.958517551422119, + "reward_std": 0.0058578201569616795, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.958990752696991, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995267689228058, + "step": 504 + }, + { + "completion_length": 251.78125, + "epoch": 1.6143999999999998, + "grad_norm": 28.171039581298828, + "kl": 0.114013671875, + "learning_rate": 3.7e-07, + "loss": 0.0011, + "reward": 3.866329312324524, + "reward_std": 0.01942992489784956, + "rewards/answer_entity_reward": 0.9720904231071472, + "rewards/answer_wer_reward": 0.8955735862255096, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986652433872223, + "step": 505 + }, + { + "completion_length": 186.46875, + "epoch": 1.6176, + "grad_norm": 6.638906955718994, + "kl": 0.06884765625, + "learning_rate": 3.6875e-07, + "loss": 0.0007, + "reward": 3.7806142568588257, + "reward_std": 0.013823950197547674, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.945627748966217, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8378273248672485, + "step": 506 + }, + { + "completion_length": 225.375, + "epoch": 1.6208, + "grad_norm": 2.12021803855896, + "kl": 0.07177734375, + "learning_rate": 3.675e-07, + "loss": 0.0007, + "reward": 3.9451769590377808, + "reward_std": 0.013169697020202875, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9672558605670929, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.977921187877655, + "step": 507 + }, + { + "completion_length": 219.125, + "epoch": 1.624, + "grad_norm": 1.5153933763504028, + "kl": 0.053955078125, + "learning_rate": 3.6625e-07, + "loss": 0.0005, + "reward": 3.959490180015564, + "reward_std": 0.010949777672067285, + "rewards/answer_entity_reward": 0.9958333373069763, + "rewards/answer_wer_reward": 0.9636567533016205, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 508 + }, + { + "completion_length": 228.4375, + "epoch": 1.6272, + "grad_norm": 3.832310676574707, + "kl": 0.0521240234375, + "learning_rate": 3.65e-07, + "loss": 0.0005, + "reward": 3.953840732574463, + "reward_std": 0.017153040505945683, + "rewards/answer_entity_reward": 0.9936868846416473, + "rewards/answer_wer_reward": 0.9603707194328308, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997829794883728, + "step": 509 + }, + { + "completion_length": 243.46875, + "epoch": 1.6303999999999998, + "grad_norm": 1.285962462425232, + "kl": 0.0673828125, + "learning_rate": 3.6375e-07, + "loss": 0.0007, + "reward": 3.960462808609009, + "reward_std": 0.0062334975227713585, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9608500599861145, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996127486228943, + "step": 510 + }, + { + "completion_length": 262.65625, + "epoch": 1.6336, + "grad_norm": 1.124130368232727, + "kl": 0.0596923828125, + "learning_rate": 3.6249999999999997e-07, + "loss": 0.0006, + "reward": 3.941042900085449, + "reward_std": 0.01204587472602725, + "rewards/answer_entity_reward": 0.9970238208770752, + "rewards/answer_wer_reward": 0.9446144104003906, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994047582149506, + "step": 511 + }, + { + "completion_length": 182.28125, + "epoch": 1.6368, + "grad_norm": 1.9966425895690918, + "kl": 0.061279296875, + "learning_rate": 3.6125e-07, + "loss": 0.0006, + "reward": 3.9531023502349854, + "reward_std": 0.02773769712075591, + "rewards/answer_entity_reward": 0.9917200803756714, + "rewards/answer_wer_reward": 0.9697157144546509, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9916666746139526, + "step": 512 + }, + { + "completion_length": 218.125, + "epoch": 1.6400000000000001, + "grad_norm": 3.2862062454223633, + "kl": 0.04736328125, + "learning_rate": 3.6e-07, + "loss": 0.0005, + "reward": 3.858319878578186, + "reward_std": 0.07778534758836031, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9565341770648956, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.90625, + "step": 513 + }, + { + "completion_length": 235.03125, + "epoch": 1.6432, + "grad_norm": 1.14111328125, + "kl": 0.054443359375, + "learning_rate": 3.5875e-07, + "loss": 0.0005, + "reward": 3.967674970626831, + "reward_std": 0.0044005257077515125, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9691169261932373, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985580444335938, + "step": 514 + }, + { + "completion_length": 233.90625, + "epoch": 1.6463999999999999, + "grad_norm": 1.2006644010543823, + "kl": 0.06103515625, + "learning_rate": 3.5749999999999997e-07, + "loss": 0.0006, + "reward": 3.959411859512329, + "reward_std": 0.005820953520014882, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9596619009971619, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999750018119812, + "step": 515 + }, + { + "completion_length": 252.5625, + "epoch": 1.6496, + "grad_norm": 0.7272346615791321, + "kl": 0.0428466796875, + "learning_rate": 3.5625e-07, + "loss": 0.0004, + "reward": 3.963356375694275, + "reward_std": 0.0036240214249119163, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.964261919260025, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990943968296051, + "step": 516 + }, + { + "completion_length": 240.6875, + "epoch": 1.6528, + "grad_norm": 1.0241456031799316, + "kl": 0.0665283203125, + "learning_rate": 3.55e-07, + "loss": 0.0007, + "reward": 3.953768730163574, + "reward_std": 0.012724505737423897, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9555812776088715, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9981874525547028, + "step": 517 + }, + { + "completion_length": 221.5625, + "epoch": 1.6560000000000001, + "grad_norm": 0.9653159379959106, + "kl": 0.0732421875, + "learning_rate": 3.5375e-07, + "loss": 0.0007, + "reward": 3.928879141807556, + "reward_std": 0.03069964610040188, + "rewards/answer_entity_reward": 0.9769324958324432, + "rewards/answer_wer_reward": 0.9525844156742096, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993622303009033, + "step": 518 + }, + { + "completion_length": 186.53125, + "epoch": 1.6592, + "grad_norm": 1.616326928138733, + "kl": 0.0673828125, + "learning_rate": 3.5249999999999996e-07, + "loss": 0.0007, + "reward": 3.963484525680542, + "reward_std": 0.0024420777335762978, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9634844958782196, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 519 + }, + { + "completion_length": 197.53125, + "epoch": 1.6623999999999999, + "grad_norm": 1.1605949401855469, + "kl": 0.066162109375, + "learning_rate": 3.5124999999999997e-07, + "loss": 0.0007, + "reward": 3.871947407722473, + "reward_std": 0.008121895836666226, + "rewards/answer_entity_reward": 0.9832701981067657, + "rewards/answer_wer_reward": 0.9628296792507172, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9258474707603455, + "step": 520 + }, + { + "completion_length": 199.9375, + "epoch": 1.6656, + "grad_norm": 2.1799464225769043, + "kl": 0.098876953125, + "learning_rate": 3.5e-07, + "loss": 0.001, + "reward": 3.914597272872925, + "reward_std": 0.046278308145701885, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9440673291683197, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9705299139022827, + "step": 521 + }, + { + "completion_length": 213.78125, + "epoch": 1.6688, + "grad_norm": 1.8315109014511108, + "kl": 0.0609130859375, + "learning_rate": 3.4875e-07, + "loss": 0.0006, + "reward": 3.934143304824829, + "reward_std": 0.005300799617543817, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9627971351146698, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9713463485240936, + "step": 522 + }, + { + "completion_length": 233.21875, + "epoch": 1.6720000000000002, + "grad_norm": 2.7353854179382324, + "kl": 0.0634765625, + "learning_rate": 3.4749999999999996e-07, + "loss": 0.0006, + "reward": 3.940351963043213, + "reward_std": 0.012048345990478992, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9584531188011169, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9818988144397736, + "step": 523 + }, + { + "completion_length": 226.8125, + "epoch": 1.6752, + "grad_norm": 1.2798601388931274, + "kl": 0.0517578125, + "learning_rate": 3.4624999999999997e-07, + "loss": 0.0005, + "reward": 3.94057559967041, + "reward_std": 0.016422050073742867, + "rewards/answer_entity_reward": 0.9859203100204468, + "rewards/answer_wer_reward": 0.9546553492546082, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 524 + }, + { + "completion_length": 225.375, + "epoch": 1.6784, + "grad_norm": 2.434398651123047, + "kl": 0.0570068359375, + "learning_rate": 3.45e-07, + "loss": 0.0006, + "reward": 3.9358779191970825, + "reward_std": 0.02181497309356928, + "rewards/answer_entity_reward": 0.9961080551147461, + "rewards/answer_wer_reward": 0.9410910904407501, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986788034439087, + "step": 525 + }, + { + "completion_length": 181.21875, + "epoch": 1.6816, + "grad_norm": 1.322139859199524, + "kl": 0.116943359375, + "learning_rate": 3.4375e-07, + "loss": 0.0012, + "reward": 3.946447730064392, + "reward_std": 0.007033249130472541, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9464477598667145, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 526 + }, + { + "completion_length": 195.8125, + "epoch": 1.6848, + "grad_norm": 1.412061333656311, + "kl": 0.06640625, + "learning_rate": 3.425e-07, + "loss": 0.0007, + "reward": 3.936468005180359, + "reward_std": 0.00922114565037191, + "rewards/answer_entity_reward": 0.9841346144676208, + "rewards/answer_wer_reward": 0.952333390712738, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 527 + }, + { + "completion_length": 210.84375, + "epoch": 1.688, + "grad_norm": 3.695819139480591, + "kl": 0.056640625, + "learning_rate": 3.4124999999999996e-07, + "loss": 0.0006, + "reward": 3.894517421722412, + "reward_std": 0.015210594050586224, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9634661674499512, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9367331266403198, + "step": 528 + }, + { + "completion_length": 220.09375, + "epoch": 1.6912, + "grad_norm": 1.6299357414245605, + "kl": 0.0711669921875, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.0007, + "reward": 3.9391125440597534, + "reward_std": 0.014290765568148345, + "rewards/answer_entity_reward": 0.9847222566604614, + "rewards/answer_wer_reward": 0.954390287399292, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 529 + }, + { + "completion_length": 195.65625, + "epoch": 1.6944, + "grad_norm": 4.491413116455078, + "kl": 0.064453125, + "learning_rate": 3.3875e-07, + "loss": 0.0007, + "reward": 3.971281409263611, + "reward_std": 0.017785906326025724, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9796920418739319, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9944303929805756, + "step": 530 + }, + { + "completion_length": 208.34375, + "epoch": 1.6976, + "grad_norm": 4.832588195800781, + "kl": 0.0972900390625, + "learning_rate": 3.375e-07, + "loss": 0.001, + "reward": 3.9011433124542236, + "reward_std": 0.010198547039180994, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9640267491340637, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9371165633201599, + "step": 531 + }, + { + "completion_length": 203.40625, + "epoch": 1.7008, + "grad_norm": 3.4038021564483643, + "kl": 0.071044921875, + "learning_rate": 3.3624999999999996e-07, + "loss": 0.0007, + "reward": 3.9605783224105835, + "reward_std": 0.0076046837493777275, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9607688188552856, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9998094439506531, + "step": 532 + }, + { + "completion_length": 241.8125, + "epoch": 1.704, + "grad_norm": 1.0362496376037598, + "kl": 0.063232421875, + "learning_rate": 3.35e-07, + "loss": 0.0006, + "reward": 3.9339258670806885, + "reward_std": 0.018858356634154916, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9387494027614594, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996408224105835, + "step": 533 + }, + { + "completion_length": 235.90625, + "epoch": 1.7072, + "grad_norm": 3.604599714279175, + "kl": 0.0853271484375, + "learning_rate": 3.3375e-07, + "loss": 0.0009, + "reward": 3.861118197441101, + "reward_std": 0.011326078558340669, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9576848149299622, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9034333229064941, + "step": 534 + }, + { + "completion_length": 229.0, + "epoch": 1.7104, + "grad_norm": 2.319185256958008, + "kl": 0.052001953125, + "learning_rate": 3.325e-07, + "loss": 0.0005, + "reward": 3.9228227138519287, + "reward_std": 0.03856424614787102, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9560109972953796, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9753345847129822, + "step": 535 + }, + { + "completion_length": 224.71875, + "epoch": 1.7136, + "grad_norm": 2.444124460220337, + "kl": 0.080810546875, + "learning_rate": 3.3124999999999995e-07, + "loss": 0.0008, + "reward": 3.9688942432403564, + "reward_std": 0.003912239335477352, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9688942730426788, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 536 + }, + { + "completion_length": 224.3125, + "epoch": 1.7168, + "grad_norm": 6.20790958404541, + "kl": 0.064697265625, + "learning_rate": 3.3e-07, + "loss": 0.0006, + "reward": 3.8677161931991577, + "reward_std": 0.02981195878237486, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9471929371356964, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9233641624450684, + "step": 537 + }, + { + "completion_length": 150.34375, + "epoch": 1.72, + "grad_norm": 1.6208490133285522, + "kl": 0.03924560546875, + "learning_rate": 3.2875e-07, + "loss": 0.0004, + "reward": 3.9733328819274902, + "reward_std": 0.002679725643247366, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9733329117298126, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 538 + }, + { + "completion_length": 183.0, + "epoch": 1.7231999999999998, + "grad_norm": 1.2286797761917114, + "kl": 0.057861328125, + "learning_rate": 3.275e-07, + "loss": 0.0006, + "reward": 3.935777187347412, + "reward_std": 0.003249647794291377, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9795266687870026, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9562505781650543, + "step": 539 + }, + { + "completion_length": 234.625, + "epoch": 1.7264, + "grad_norm": 1.304764747619629, + "kl": 0.054931640625, + "learning_rate": 3.2624999999999995e-07, + "loss": 0.0005, + "reward": 3.950987696647644, + "reward_std": 0.00898568145930767, + "rewards/answer_entity_reward": 0.9958333373069763, + "rewards/answer_wer_reward": 0.9557509124279022, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994034171104431, + "step": 540 + }, + { + "completion_length": 183.96875, + "epoch": 1.7296, + "grad_norm": 1.3975461721420288, + "kl": 0.07421875, + "learning_rate": 3.25e-07, + "loss": 0.0007, + "reward": 3.918307065963745, + "reward_std": 0.01607332704588771, + "rewards/answer_entity_reward": 0.9720314145088196, + "rewards/answer_wer_reward": 0.9547825455665588, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9914930462837219, + "step": 541 + }, + { + "completion_length": 204.0625, + "epoch": 1.7328000000000001, + "grad_norm": 2.0030770301818848, + "kl": 0.070068359375, + "learning_rate": 3.2374999999999997e-07, + "loss": 0.0007, + "reward": 3.9624624252319336, + "reward_std": 0.011391833890229464, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9645456969738007, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 542 + }, + { + "completion_length": 236.8125, + "epoch": 1.736, + "grad_norm": 1.0529872179031372, + "kl": 0.06396484375, + "learning_rate": 3.225e-07, + "loss": 0.0006, + "reward": 3.9355998039245605, + "reward_std": 0.011712775565683842, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.946576714515686, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9890230894088745, + "step": 543 + }, + { + "completion_length": 171.03125, + "epoch": 1.7391999999999999, + "grad_norm": 1.4777579307556152, + "kl": 0.07861328125, + "learning_rate": 3.2124999999999994e-07, + "loss": 0.0008, + "reward": 3.959132194519043, + "reward_std": 0.007866068510338664, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9591321349143982, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 544 + }, + { + "completion_length": 199.03125, + "epoch": 1.7424, + "grad_norm": 1.5819900035858154, + "kl": 0.07666015625, + "learning_rate": 3.2e-07, + "loss": 0.0008, + "reward": 3.9456801414489746, + "reward_std": 0.01446144049987197, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9492515921592712, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9985119104385376, + "step": 545 + }, + { + "completion_length": 243.53125, + "epoch": 1.7456, + "grad_norm": 6.461181640625, + "kl": 0.1029052734375, + "learning_rate": 3.1874999999999997e-07, + "loss": 0.001, + "reward": 3.9253257513046265, + "reward_std": 0.013943355064839125, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9411455988883972, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9980691075325012, + "step": 546 + }, + { + "completion_length": 190.21875, + "epoch": 1.7488000000000001, + "grad_norm": 1.5046278238296509, + "kl": 0.0430908203125, + "learning_rate": 3.175e-07, + "loss": 0.0004, + "reward": 3.946847081184387, + "reward_std": 0.006090850802138448, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9579125344753265, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9889345765113831, + "step": 547 + }, + { + "completion_length": 199.5625, + "epoch": 1.752, + "grad_norm": 2.7514781951904297, + "kl": 0.054931640625, + "learning_rate": 3.1624999999999994e-07, + "loss": 0.0006, + "reward": 3.9198288917541504, + "reward_std": 0.008053636411204934, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9198288321495056, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 548 + }, + { + "completion_length": 244.625, + "epoch": 1.7551999999999999, + "grad_norm": 1.0448155403137207, + "kl": 0.0426025390625, + "learning_rate": 3.15e-07, + "loss": 0.0004, + "reward": 3.958520531654358, + "reward_std": 0.008235724177211523, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9585205316543579, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 549 + }, + { + "completion_length": 249.0, + "epoch": 1.7584, + "grad_norm": 128.38499450683594, + "kl": 17.28076171875, + "learning_rate": 3.1374999999999996e-07, + "loss": 0.172, + "reward": 3.932722330093384, + "reward_std": 0.012139817699790001, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9340447783470154, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998677521944046, + "step": 550 + }, + { + "completion_length": 202.25, + "epoch": 1.7616, + "grad_norm": 1.6289058923721313, + "kl": 0.0709228515625, + "learning_rate": 3.1249999999999997e-07, + "loss": 0.0007, + "reward": 3.931633234024048, + "reward_std": 0.015017563942819834, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9620243012905121, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9752906560897827, + "step": 551 + }, + { + "completion_length": 223.65625, + "epoch": 1.7648000000000001, + "grad_norm": 0.650069534778595, + "kl": 0.0467529296875, + "learning_rate": 3.1125000000000004e-07, + "loss": 0.0005, + "reward": 3.9622879028320312, + "reward_std": 0.004962240578606725, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9622879028320312, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 552 + }, + { + "completion_length": 238.65625, + "epoch": 1.768, + "grad_norm": 9.516084671020508, + "kl": 0.0474853515625, + "learning_rate": 3.1e-07, + "loss": 0.0005, + "reward": 3.9525749683380127, + "reward_std": 0.012759724631905556, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9610438644886017, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977810680866241, + "step": 553 + }, + { + "completion_length": 224.65625, + "epoch": 1.7711999999999999, + "grad_norm": 1.8886899948120117, + "kl": 0.044189453125, + "learning_rate": 3.0875e-07, + "loss": 0.0004, + "reward": 3.9586617946624756, + "reward_std": 0.01200480293482542, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9675752222537994, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996093809604645, + "step": 554 + }, + { + "completion_length": 220.9375, + "epoch": 1.7744, + "grad_norm": 5.122376918792725, + "kl": 0.048828125, + "learning_rate": 3.0749999999999997e-07, + "loss": 0.0005, + "reward": 3.9466060400009155, + "reward_std": 0.016119306907057762, + "rewards/answer_entity_reward": 0.9965170323848724, + "rewards/answer_wer_reward": 0.9567474722862244, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.993341475725174, + "step": 555 + }, + { + "completion_length": 198.8125, + "epoch": 1.7776, + "grad_norm": 4.916889667510986, + "kl": 0.068115234375, + "learning_rate": 3.0625000000000003e-07, + "loss": 0.0007, + "reward": 3.949711561203003, + "reward_std": 0.0163404387421906, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9576182961463928, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9920931458473206, + "step": 556 + }, + { + "completion_length": 180.875, + "epoch": 1.7808000000000002, + "grad_norm": 10.021855354309082, + "kl": 0.072021484375, + "learning_rate": 3.05e-07, + "loss": 0.0007, + "reward": 3.867478370666504, + "reward_std": 0.047242360189557076, + "rewards/answer_entity_reward": 0.9821428656578064, + "rewards/answer_wer_reward": 0.9576010704040527, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.927734375, + "step": 557 + }, + { + "completion_length": 227.8125, + "epoch": 1.784, + "grad_norm": 1.7502044439315796, + "kl": 0.04443359375, + "learning_rate": 3.0375e-07, + "loss": 0.0004, + "reward": 3.9525381326675415, + "reward_std": 0.013325697276741266, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9532942175865173, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999243974685669, + "step": 558 + }, + { + "completion_length": 204.15625, + "epoch": 1.7872, + "grad_norm": 5.304961681365967, + "kl": 0.0496826171875, + "learning_rate": 3.0249999999999996e-07, + "loss": 0.0005, + "reward": 3.957284450531006, + "reward_std": 0.005683758878149092, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9572845101356506, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 559 + }, + { + "completion_length": 228.34375, + "epoch": 1.7904, + "grad_norm": 1.2513984441757202, + "kl": 0.0577392578125, + "learning_rate": 3.0125000000000003e-07, + "loss": 0.0006, + "reward": 3.94599187374115, + "reward_std": 0.00800859834998846, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.957431435585022, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9885604083538055, + "step": 560 + }, + { + "completion_length": 211.03125, + "epoch": 1.7936, + "grad_norm": 5.97805118560791, + "kl": 0.1036376953125, + "learning_rate": 3e-07, + "loss": 0.001, + "reward": 3.9404828548431396, + "reward_std": 0.01265423372387886, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9433237612247467, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 561 + }, + { + "completion_length": 205.4375, + "epoch": 1.7968, + "grad_norm": 3.833575487136841, + "kl": 0.22998046875, + "learning_rate": 2.9875e-07, + "loss": 0.0023, + "reward": 3.909332752227783, + "reward_std": 0.007294894196093082, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9648370146751404, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9444957971572876, + "step": 562 + }, + { + "completion_length": 207.4375, + "epoch": 1.8, + "grad_norm": 0.8627040982246399, + "kl": 0.0611572265625, + "learning_rate": 2.9749999999999996e-07, + "loss": 0.0006, + "reward": 3.9548414945602417, + "reward_std": 0.006908831186592579, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9550975561141968, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997438490390778, + "step": 563 + }, + { + "completion_length": 198.15625, + "epoch": 1.8032, + "grad_norm": 0.9193502068519592, + "kl": 0.0518798828125, + "learning_rate": 2.9625e-07, + "loss": 0.0005, + "reward": 3.9462149143218994, + "reward_std": 0.007913234177976847, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9465437531471252, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996710419654846, + "step": 564 + }, + { + "completion_length": 198.15625, + "epoch": 1.8064, + "grad_norm": 1.9635776281356812, + "kl": 0.059814453125, + "learning_rate": 2.95e-07, + "loss": 0.0006, + "reward": 3.896806240081787, + "reward_std": 0.012922112364321947, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9503778219223022, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9464285671710968, + "step": 565 + }, + { + "completion_length": 164.90625, + "epoch": 1.8096, + "grad_norm": 1.2068322896957397, + "kl": 0.09375, + "learning_rate": 2.9375e-07, + "loss": 0.0009, + "reward": 3.8490008115768433, + "reward_std": 0.1467541428282857, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9502907395362854, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9328009486198425, + "step": 566 + }, + { + "completion_length": 206.34375, + "epoch": 1.8128, + "grad_norm": 2.1644375324249268, + "kl": 0.08251953125, + "learning_rate": 2.9249999999999995e-07, + "loss": 0.0008, + "reward": 3.970282793045044, + "reward_std": 0.0077400594018399715, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9728601574897766, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9974226951599121, + "step": 567 + }, + { + "completion_length": 233.09375, + "epoch": 1.8159999999999998, + "grad_norm": 1.106130599975586, + "kl": 0.0552978515625, + "learning_rate": 2.9125e-07, + "loss": 0.0005, + "reward": 3.9414994716644287, + "reward_std": 0.011295767035335302, + "rewards/answer_entity_reward": 0.9848698973655701, + "rewards/answer_wer_reward": 0.9577165246009827, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998913049697876, + "step": 568 + }, + { + "completion_length": 206.46875, + "epoch": 1.8192, + "grad_norm": 1.2371478080749512, + "kl": 0.0599365234375, + "learning_rate": 2.9e-07, + "loss": 0.0006, + "reward": 3.9829952716827393, + "reward_std": 0.007155058206990361, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9829952716827393, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 569 + }, + { + "completion_length": 227.875, + "epoch": 1.8224, + "grad_norm": 0.9648468494415283, + "kl": 0.0587158203125, + "learning_rate": 2.8875e-07, + "loss": 0.0006, + "reward": 3.875002384185791, + "reward_std": 0.007613388821482658, + "rewards/answer_entity_reward": 0.9604166746139526, + "rewards/answer_wer_reward": 0.9299702048301697, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9846153855323792, + "step": 570 + }, + { + "completion_length": 242.1875, + "epoch": 1.8256000000000001, + "grad_norm": 3.7682442665100098, + "kl": 0.0732421875, + "learning_rate": 2.8749999999999995e-07, + "loss": 0.0007, + "reward": 3.790624737739563, + "reward_std": 0.14343099505640566, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9464230239391327, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.8754517436027527, + "step": 571 + }, + { + "completion_length": 248.28125, + "epoch": 1.8288, + "grad_norm": 0.7550325393676758, + "kl": 0.039794921875, + "learning_rate": 2.8625e-07, + "loss": 0.0004, + "reward": 3.9295032024383545, + "reward_std": 0.004920503590255976, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9295033514499664, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 572 + }, + { + "completion_length": 222.4375, + "epoch": 1.8319999999999999, + "grad_norm": 1.055333137512207, + "kl": 0.0567626953125, + "learning_rate": 2.8499999999999997e-07, + "loss": 0.0006, + "reward": 3.929059386253357, + "reward_std": 0.014613255392760038, + "rewards/answer_entity_reward": 0.9819711446762085, + "rewards/answer_wer_reward": 0.9496394395828247, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.997448742389679, + "step": 573 + }, + { + "completion_length": 217.40625, + "epoch": 1.8352, + "grad_norm": 1.640468716621399, + "kl": 0.0443115234375, + "learning_rate": 2.8375e-07, + "loss": 0.0004, + "reward": 3.9705777168273926, + "reward_std": 0.013166352873668075, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9736025929450989, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9998161792755127, + "step": 574 + }, + { + "completion_length": 229.1875, + "epoch": 1.8384, + "grad_norm": 3.271684169769287, + "kl": 0.0567626953125, + "learning_rate": 2.8249999999999994e-07, + "loss": 0.0006, + "reward": 3.9389246702194214, + "reward_std": 0.007664299104362726, + "rewards/answer_entity_reward": 0.9833333492279053, + "rewards/answer_wer_reward": 0.9555914402008057, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 575 + }, + { + "completion_length": 203.28125, + "epoch": 1.8416000000000001, + "grad_norm": 1.6847234964370728, + "kl": 0.063232421875, + "learning_rate": 2.8125e-07, + "loss": 0.0006, + "reward": 3.9692747592926025, + "reward_std": 0.006263851770199835, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9701676964759827, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999107152223587, + "step": 576 + }, + { + "completion_length": 251.0625, + "epoch": 1.8448, + "grad_norm": 4.737148761749268, + "kl": 0.128173828125, + "learning_rate": 2.8e-07, + "loss": 0.0013, + "reward": 3.935584545135498, + "reward_std": 0.016471964307129383, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9418345093727112, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 577 + }, + { + "completion_length": 199.5, + "epoch": 1.8479999999999999, + "grad_norm": 1.7424699068069458, + "kl": 0.0618896484375, + "learning_rate": 2.7875e-07, + "loss": 0.0006, + "reward": 3.966155171394348, + "reward_std": 0.012047166470438242, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9755966663360596, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9905584752559662, + "step": 578 + }, + { + "completion_length": 192.96875, + "epoch": 1.8512, + "grad_norm": 0.8571773171424866, + "kl": 0.0526123046875, + "learning_rate": 2.775e-07, + "loss": 0.0005, + "reward": 3.977761387825012, + "reward_std": 0.0047087406273931265, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9777614176273346, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 579 + }, + { + "completion_length": 223.6875, + "epoch": 1.8544, + "grad_norm": 1.3312608003616333, + "kl": 0.050537109375, + "learning_rate": 2.7625e-07, + "loss": 0.0005, + "reward": 3.9508321285247803, + "reward_std": 0.00891483761370182, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9508320689201355, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 580 + }, + { + "completion_length": 241.96875, + "epoch": 1.8576000000000001, + "grad_norm": 4.553063869476318, + "kl": 0.19140625, + "learning_rate": 2.75e-07, + "loss": 0.0019, + "reward": 3.925418257713318, + "reward_std": 0.016543671488761902, + "rewards/answer_entity_reward": 0.9963235259056091, + "rewards/answer_wer_reward": 0.9290946125984192, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 581 + }, + { + "completion_length": 241.90625, + "epoch": 1.8608, + "grad_norm": 0.8970361948013306, + "kl": 0.065185546875, + "learning_rate": 2.7374999999999997e-07, + "loss": 0.0007, + "reward": 3.9467151165008545, + "reward_std": 0.007796656806021929, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9470826387405396, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996323585510254, + "step": 582 + }, + { + "completion_length": 246.96875, + "epoch": 1.8639999999999999, + "grad_norm": 1.9463343620300293, + "kl": 0.04547119140625, + "learning_rate": 2.725e-07, + "loss": 0.0005, + "reward": 3.940864324569702, + "reward_std": 0.011073273373767734, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9416800141334534, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991843402385712, + "step": 583 + }, + { + "completion_length": 206.625, + "epoch": 1.8672, + "grad_norm": 4.5208892822265625, + "kl": 0.092529296875, + "learning_rate": 2.7125e-07, + "loss": 0.0009, + "reward": 3.8930487632751465, + "reward_std": 0.032747200690209866, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9660382270812988, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9326923191547394, + "step": 584 + }, + { + "completion_length": 255.25, + "epoch": 1.8704, + "grad_norm": 2.1606805324554443, + "kl": 0.04736328125, + "learning_rate": 2.7e-07, + "loss": 0.0005, + "reward": 3.936957836151123, + "reward_std": 0.013339729979634285, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9393823444843292, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.997575432062149, + "step": 585 + }, + { + "completion_length": 226.3125, + "epoch": 1.8736000000000002, + "grad_norm": 0.7422674298286438, + "kl": 0.048095703125, + "learning_rate": 2.6874999999999997e-07, + "loss": 0.0005, + "reward": 3.9866139888763428, + "reward_std": 0.0038484669639728963, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.987176924943924, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994369745254517, + "step": 586 + }, + { + "completion_length": 214.59375, + "epoch": 1.8768, + "grad_norm": 1.313864827156067, + "kl": 0.0684814453125, + "learning_rate": 2.675e-07, + "loss": 0.0007, + "reward": 3.9567151069641113, + "reward_std": 0.012406408437527716, + "rewards/answer_entity_reward": 0.9832702279090881, + "rewards/answer_wer_reward": 0.9734448790550232, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 587 + }, + { + "completion_length": 256.46875, + "epoch": 1.88, + "grad_norm": 1.4952497482299805, + "kl": 0.1278076171875, + "learning_rate": 2.6625e-07, + "loss": 0.0013, + "reward": 3.8717525005340576, + "reward_std": 0.13869436737149954, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9397719204425812, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9660714268684387, + "step": 588 + }, + { + "completion_length": 217.09375, + "epoch": 1.8832, + "grad_norm": 1.3716284036636353, + "kl": 0.054931640625, + "learning_rate": 2.65e-07, + "loss": 0.0006, + "reward": 3.962627410888672, + "reward_std": 0.006240109680220485, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9626273214817047, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 589 + }, + { + "completion_length": 253.0, + "epoch": 1.8864, + "grad_norm": 1.4284135103225708, + "kl": 0.07080078125, + "learning_rate": 2.6374999999999996e-07, + "loss": 0.0007, + "reward": 3.9501919746398926, + "reward_std": 0.012296234723180532, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9531300067901611, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9970619678497314, + "step": 590 + }, + { + "completion_length": 204.5, + "epoch": 1.8896, + "grad_norm": 3.8569161891937256, + "kl": 0.07421875, + "learning_rate": 2.625e-07, + "loss": 0.0007, + "reward": 3.9426995515823364, + "reward_std": 0.027584614232182503, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9779268503189087, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9647727012634277, + "step": 591 + }, + { + "completion_length": 229.0625, + "epoch": 1.8928, + "grad_norm": 2.589956760406494, + "kl": 0.08203125, + "learning_rate": 2.6125e-07, + "loss": 0.0008, + "reward": 3.9178069829940796, + "reward_std": 0.007971604820340872, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.95549076795578, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9623160660266876, + "step": 592 + }, + { + "completion_length": 170.65625, + "epoch": 1.896, + "grad_norm": 3.586792469024658, + "kl": 0.0423583984375, + "learning_rate": 2.6e-07, + "loss": 0.0004, + "reward": 3.9206513166427612, + "reward_std": 0.023992381058633327, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9824000000953674, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9382513165473938, + "step": 593 + }, + { + "completion_length": 229.34375, + "epoch": 1.8992, + "grad_norm": 4.520889759063721, + "kl": 0.07421875, + "learning_rate": 2.5874999999999996e-07, + "loss": 0.0007, + "reward": 3.942514419555664, + "reward_std": 0.038696477888152, + "rewards/answer_entity_reward": 0.984275609254837, + "rewards/answer_wer_reward": 0.9582389295101166, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 594 + }, + { + "completion_length": 223.4375, + "epoch": 1.9024, + "grad_norm": 1.3104579448699951, + "kl": 0.0565185546875, + "learning_rate": 2.5749999999999997e-07, + "loss": 0.0006, + "reward": 3.976773500442505, + "reward_std": 0.0044562743860296905, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9767734706401825, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 595 + }, + { + "completion_length": 254.09375, + "epoch": 1.9056, + "grad_norm": 1.03975510597229, + "kl": 0.05322265625, + "learning_rate": 2.5625e-07, + "loss": 0.0005, + "reward": 3.943529725074768, + "reward_std": 0.009816794656217098, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9451378583908081, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9983919560909271, + "step": 596 + }, + { + "completion_length": 243.03125, + "epoch": 1.9088, + "grad_norm": 1.0213077068328857, + "kl": 0.0506591796875, + "learning_rate": 2.55e-07, + "loss": 0.0005, + "reward": 3.9278059005737305, + "reward_std": 0.00602961634285748, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9420903027057648, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996044337749481, + "step": 597 + }, + { + "completion_length": 182.46875, + "epoch": 1.912, + "grad_norm": 1.8683794736862183, + "kl": 0.065185546875, + "learning_rate": 2.5374999999999995e-07, + "loss": 0.0007, + "reward": 3.9624691009521484, + "reward_std": 0.012565109878778458, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9729967415332794, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9939365684986115, + "step": 598 + }, + { + "completion_length": 166.25, + "epoch": 1.9152, + "grad_norm": 1.716305136680603, + "kl": 0.0968017578125, + "learning_rate": 2.5249999999999996e-07, + "loss": 0.001, + "reward": 3.896498918533325, + "reward_std": 0.11676233587786555, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9749563038349152, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9527925550937653, + "step": 599 + }, + { + "completion_length": 199.59375, + "epoch": 1.9184, + "grad_norm": 1.2319942712783813, + "kl": 0.0775146484375, + "learning_rate": 2.5125e-07, + "loss": 0.0008, + "reward": 3.9489831924438477, + "reward_std": 0.010235858615487814, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9580873548984528, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9908958971500397, + "step": 600 + }, + { + "completion_length": 210.53125, + "epoch": 1.9216, + "grad_norm": 1.0385370254516602, + "kl": 0.0650634765625, + "learning_rate": 2.5e-07, + "loss": 0.0007, + "reward": 3.966851830482483, + "reward_std": 0.005628936691209674, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9668518006801605, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 601 + }, + { + "completion_length": 186.9375, + "epoch": 1.9247999999999998, + "grad_norm": 2.1772327423095703, + "kl": 0.11279296875, + "learning_rate": 2.4875e-07, + "loss": 0.0011, + "reward": 3.9322038888931274, + "reward_std": 0.01743672974407673, + "rewards/answer_entity_reward": 0.9880681931972504, + "rewards/answer_wer_reward": 0.9574334919452667, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9867021441459656, + "step": 602 + }, + { + "completion_length": 209.65625, + "epoch": 1.928, + "grad_norm": 0.9661850929260254, + "kl": 0.072998046875, + "learning_rate": 2.475e-07, + "loss": 0.0007, + "reward": 3.9598844051361084, + "reward_std": 0.009228286100551486, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.966718465089798, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994158744812012, + "step": 603 + }, + { + "completion_length": 193.65625, + "epoch": 1.9312, + "grad_norm": 2.6254851818084717, + "kl": 0.102294921875, + "learning_rate": 2.4624999999999997e-07, + "loss": 0.001, + "reward": 3.957027792930603, + "reward_std": 0.008546661585569382, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9570277333259583, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 604 + }, + { + "completion_length": 219.34375, + "epoch": 1.9344000000000001, + "grad_norm": 1.0413298606872559, + "kl": 0.104736328125, + "learning_rate": 2.45e-07, + "loss": 0.0011, + "reward": 3.9702824354171753, + "reward_std": 0.007483657216653228, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9702823162078857, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 605 + }, + { + "completion_length": 157.46875, + "epoch": 1.9376, + "grad_norm": 2.432849645614624, + "kl": 0.14453125, + "learning_rate": 2.4375e-07, + "loss": 0.0014, + "reward": 3.957343101501465, + "reward_std": 0.005332180997356772, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.957624614238739, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997184872627258, + "step": 606 + }, + { + "completion_length": 248.40625, + "epoch": 1.9407999999999999, + "grad_norm": 0.8216654062271118, + "kl": 0.071044921875, + "learning_rate": 2.425e-07, + "loss": 0.0007, + "reward": 3.9644582271575928, + "reward_std": 0.01216787239536643, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9688305556774139, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998031497001648, + "step": 607 + }, + { + "completion_length": 218.625, + "epoch": 1.944, + "grad_norm": 0.9195014834403992, + "kl": 0.0545654296875, + "learning_rate": 2.4124999999999997e-07, + "loss": 0.0005, + "reward": 3.972040057182312, + "reward_std": 0.004315207479521632, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9726911783218384, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993489682674408, + "step": 608 + }, + { + "completion_length": 231.84375, + "epoch": 1.9472, + "grad_norm": 1.3564932346343994, + "kl": 0.06103515625, + "learning_rate": 2.4e-07, + "loss": 0.0006, + "reward": 3.951057553291321, + "reward_std": 0.013061597011983395, + "rewards/answer_entity_reward": 0.9963235259056091, + "rewards/answer_wer_reward": 0.9553851187229156, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993489682674408, + "step": 609 + }, + { + "completion_length": 241.4375, + "epoch": 1.9504000000000001, + "grad_norm": 0.9419238567352295, + "kl": 0.051513671875, + "learning_rate": 2.3875e-07, + "loss": 0.0005, + "reward": 3.971252202987671, + "reward_std": 0.006067809648811817, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9715149104595184, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997373819351196, + "step": 610 + }, + { + "completion_length": 222.09375, + "epoch": 1.9536, + "grad_norm": 1.4854899644851685, + "kl": 0.166748046875, + "learning_rate": 2.3749999999999998e-07, + "loss": 0.0017, + "reward": 3.9489357471466064, + "reward_std": 0.012118924409151077, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.948935866355896, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 611 + }, + { + "completion_length": 259.8125, + "epoch": 1.9567999999999999, + "grad_norm": 2.2286458015441895, + "kl": 0.0426025390625, + "learning_rate": 2.3625e-07, + "loss": 0.0004, + "reward": 3.96254563331604, + "reward_std": 0.005056597990915179, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9625457525253296, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 612 + }, + { + "completion_length": 209.4375, + "epoch": 1.96, + "grad_norm": 4.077661514282227, + "kl": 0.05615234375, + "learning_rate": 2.3499999999999997e-07, + "loss": 0.0006, + "reward": 3.941632628440857, + "reward_std": 0.01233140891417861, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9416325688362122, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 613 + }, + { + "completion_length": 221.71875, + "epoch": 1.9632, + "grad_norm": 0.7665371298789978, + "kl": 0.0555419921875, + "learning_rate": 2.3375e-07, + "loss": 0.0005, + "reward": 3.9698644876480103, + "reward_std": 0.009979546128306538, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.973064661026001, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996408224105835, + "step": 614 + }, + { + "completion_length": 219.875, + "epoch": 1.9664000000000001, + "grad_norm": 2.4666738510131836, + "kl": 0.0546875, + "learning_rate": 2.325e-07, + "loss": 0.0005, + "reward": 3.9548712968826294, + "reward_std": 0.011192699894309044, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9553521871566772, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995192289352417, + "step": 615 + }, + { + "completion_length": 235.0, + "epoch": 1.9696, + "grad_norm": 1.5382620096206665, + "kl": 0.044921875, + "learning_rate": 2.3125e-07, + "loss": 0.0005, + "reward": 3.9565550088882446, + "reward_std": 0.008881408954039216, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9740456640720367, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9825093150138855, + "step": 616 + }, + { + "completion_length": 141.09375, + "epoch": 1.9727999999999999, + "grad_norm": 2.0756258964538574, + "kl": 0.0631103515625, + "learning_rate": 2.3e-07, + "loss": 0.0006, + "reward": 3.9571491479873657, + "reward_std": 0.005044124089181423, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.980070561170578, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9770786762237549, + "step": 617 + }, + { + "completion_length": 222.46875, + "epoch": 1.976, + "grad_norm": 5.071360111236572, + "kl": 0.075927734375, + "learning_rate": 2.2875e-07, + "loss": 0.0008, + "reward": 3.8557703495025635, + "reward_std": 0.06493359804153442, + "rewards/answer_entity_reward": 0.9847027957439423, + "rewards/answer_wer_reward": 0.9706770181655884, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.900390625, + "step": 618 + }, + { + "completion_length": 231.125, + "epoch": 1.9792, + "grad_norm": 1.0749843120574951, + "kl": 0.050537109375, + "learning_rate": 2.275e-07, + "loss": 0.0005, + "reward": 3.9660208225250244, + "reward_std": 0.0037171735893934965, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9660208523273468, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 619 + }, + { + "completion_length": 252.625, + "epoch": 1.9824000000000002, + "grad_norm": 1.5367364883422852, + "kl": 0.070068359375, + "learning_rate": 2.2625e-07, + "loss": 0.0007, + "reward": 3.946213126182556, + "reward_std": 0.01816728012636304, + "rewards/answer_entity_reward": 0.9867424070835114, + "rewards/answer_wer_reward": 0.9616928696632385, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977777898311615, + "step": 620 + }, + { + "completion_length": 239.34375, + "epoch": 1.9856, + "grad_norm": 2.541694164276123, + "kl": 0.142578125, + "learning_rate": 2.25e-07, + "loss": 0.0014, + "reward": 3.947938561439514, + "reward_std": 0.009988004341721535, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9479385614395142, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 621 + }, + { + "completion_length": 224.65625, + "epoch": 1.9888, + "grad_norm": 1.3821133375167847, + "kl": 0.075927734375, + "learning_rate": 2.2375e-07, + "loss": 0.0007, + "reward": 3.953581690788269, + "reward_std": 0.006479294504970312, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.953581839799881, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 622 + }, + { + "completion_length": 206.3125, + "epoch": 1.992, + "grad_norm": 1.0023412704467773, + "kl": 0.13232421875, + "learning_rate": 2.225e-07, + "loss": 0.0013, + "reward": 3.8949310779571533, + "reward_std": 0.006026371265761554, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9634793996810913, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9314516186714172, + "step": 623 + }, + { + "completion_length": 179.96875, + "epoch": 1.9952, + "grad_norm": 1.534476637840271, + "kl": 0.078125, + "learning_rate": 2.2125e-07, + "loss": 0.0008, + "reward": 3.966533660888672, + "reward_std": 0.008991609327495098, + "rewards/answer_entity_reward": 0.9950658082962036, + "rewards/answer_wer_reward": 0.9756669104099274, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9958009123802185, + "step": 624 + }, + { + "completion_length": 232.75, + "epoch": 1.9984, + "grad_norm": 0.7324752807617188, + "kl": 0.0499267578125, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.0005, + "reward": 3.946596384048462, + "reward_std": 0.011123172473162413, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9492979049682617, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997023940086365, + "step": 625 + }, + { + "completion_length": 176.0625, + "epoch": 2.0, + "grad_norm": 0.33141908049583435, + "kl": 0.06005859375, + "learning_rate": 2.1875e-07, + "loss": 0.0003, + "reward": 3.9717535972595215, + "reward_std": 0.012056672014296055, + "rewards/answer_entity_reward": 0.9963235259056091, + "rewards/answer_wer_reward": 0.975429892539978, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 626 + }, + { + "completion_length": 232.21875, + "epoch": 2.0032, + "grad_norm": 0.8334391117095947, + "kl": 0.0457763671875, + "learning_rate": 2.1749999999999998e-07, + "loss": 0.0004, + "reward": 3.970544457435608, + "reward_std": 0.003736199578270316, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9705445766448975, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 627 + }, + { + "completion_length": 173.375, + "epoch": 2.0064, + "grad_norm": 0.965114951133728, + "kl": 0.067626953125, + "learning_rate": 2.1625e-07, + "loss": 0.0007, + "reward": 3.974756956100464, + "reward_std": 0.004756669281050563, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9788074791431427, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9959495067596436, + "step": 628 + }, + { + "completion_length": 222.15625, + "epoch": 2.0096, + "grad_norm": 2.102520227432251, + "kl": 0.0474853515625, + "learning_rate": 2.1499999999999998e-07, + "loss": 0.0005, + "reward": 3.938779830932617, + "reward_std": 0.01813220279291272, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9791045486927032, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9596752524375916, + "step": 629 + }, + { + "completion_length": 206.40625, + "epoch": 2.0128, + "grad_norm": 1.3867822885513306, + "kl": 0.095458984375, + "learning_rate": 2.1375e-07, + "loss": 0.001, + "reward": 3.977003812789917, + "reward_std": 0.003467106493189931, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9772301912307739, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997735619544983, + "step": 630 + }, + { + "completion_length": 237.625, + "epoch": 2.016, + "grad_norm": 1.2721437215805054, + "kl": 0.0576171875, + "learning_rate": 2.1249999999999998e-07, + "loss": 0.0006, + "reward": 3.96044921875, + "reward_std": 0.007887857500463724, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9609974026679993, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999451756477356, + "step": 631 + }, + { + "completion_length": 190.65625, + "epoch": 2.0192, + "grad_norm": 1.6940927505493164, + "kl": 0.170166015625, + "learning_rate": 2.1125e-07, + "loss": 0.0017, + "reward": 3.92085862159729, + "reward_std": 0.012093114666640759, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9635953307151794, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9572633504867554, + "step": 632 + }, + { + "completion_length": 213.75, + "epoch": 2.0224, + "grad_norm": 1.3798060417175293, + "kl": 0.0552978515625, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.0006, + "reward": 3.9467806816101074, + "reward_std": 0.00452708825469017, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9470699727535248, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997106492519379, + "step": 633 + }, + { + "completion_length": 193.5625, + "epoch": 2.0256, + "grad_norm": 1.5375889539718628, + "kl": 0.046875, + "learning_rate": 2.0874999999999999e-07, + "loss": 0.0005, + "reward": 3.9730241298675537, + "reward_std": 0.006102013634517789, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9743154048919678, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987086653709412, + "step": 634 + }, + { + "completion_length": 204.09375, + "epoch": 2.0288, + "grad_norm": 1.0933163166046143, + "kl": 0.09228515625, + "learning_rate": 2.0749999999999997e-07, + "loss": 0.0009, + "reward": 3.9593019485473633, + "reward_std": 0.008372287498787045, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9602685272693634, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999033510684967, + "step": 635 + }, + { + "completion_length": 186.875, + "epoch": 2.032, + "grad_norm": 3.5551085472106934, + "kl": 0.085205078125, + "learning_rate": 2.0624999999999998e-07, + "loss": 0.0008, + "reward": 3.937085270881653, + "reward_std": 0.028064538724720478, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9683353006839752, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9687500298023224, + "step": 636 + }, + { + "completion_length": 228.875, + "epoch": 2.0352, + "grad_norm": 0.9865986108779907, + "kl": 0.0728759765625, + "learning_rate": 2.0499999999999997e-07, + "loss": 0.0007, + "reward": 3.9492111206054688, + "reward_std": 0.007756081875413656, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9575444757938385, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 637 + }, + { + "completion_length": 212.28125, + "epoch": 2.0384, + "grad_norm": 3.542672872543335, + "kl": 0.110107421875, + "learning_rate": 2.0374999999999998e-07, + "loss": 0.0011, + "reward": 3.9374581575393677, + "reward_std": 0.009235690347850323, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9742424190044403, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9652990996837616, + "step": 638 + }, + { + "completion_length": 232.0, + "epoch": 2.0416, + "grad_norm": 1.4940472841262817, + "kl": 0.0565185546875, + "learning_rate": 2.025e-07, + "loss": 0.0006, + "reward": 3.947740077972412, + "reward_std": 0.006069941911846399, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9616289734840393, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 639 + }, + { + "completion_length": 214.46875, + "epoch": 2.0448, + "grad_norm": 1.0322229862213135, + "kl": 0.0865478515625, + "learning_rate": 2.0125e-07, + "loss": 0.0009, + "reward": 3.973870038986206, + "reward_std": 0.005974382860586047, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9738699197769165, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 640 + }, + { + "completion_length": 175.71875, + "epoch": 2.048, + "grad_norm": 2.1991164684295654, + "kl": 0.0986328125, + "learning_rate": 2e-07, + "loss": 0.001, + "reward": 3.9478849172592163, + "reward_std": 0.012253349646925926, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9485794901847839, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993055462837219, + "step": 641 + }, + { + "completion_length": 202.3125, + "epoch": 2.0512, + "grad_norm": 2.254936456680298, + "kl": 0.0758056640625, + "learning_rate": 1.9875e-07, + "loss": 0.0008, + "reward": 3.9462071657180786, + "reward_std": 0.007457165978848934, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9462071061134338, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 642 + }, + { + "completion_length": 205.03125, + "epoch": 2.0544, + "grad_norm": 2.473928928375244, + "kl": 0.079345703125, + "learning_rate": 1.975e-07, + "loss": 0.0008, + "reward": 3.92992103099823, + "reward_std": 0.014722079504281282, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9436539113521576, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9919487535953522, + "step": 643 + }, + { + "completion_length": 202.3125, + "epoch": 2.0576, + "grad_norm": 1.5329126119613647, + "kl": 0.03643798828125, + "learning_rate": 1.9625e-07, + "loss": 0.0004, + "reward": 3.944863796234131, + "reward_std": 0.006489667110145092, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9667904078960419, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9780733287334442, + "step": 644 + }, + { + "completion_length": 202.53125, + "epoch": 2.0608, + "grad_norm": 0.6484522223472595, + "kl": 0.04443359375, + "learning_rate": 1.9499999999999999e-07, + "loss": 0.0004, + "reward": 3.975989580154419, + "reward_std": 0.0032934267073869705, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9759896695613861, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 645 + }, + { + "completion_length": 248.65625, + "epoch": 2.064, + "grad_norm": 3.43375301361084, + "kl": 0.0609130859375, + "learning_rate": 1.9375e-07, + "loss": 0.0006, + "reward": 3.952019691467285, + "reward_std": 0.010596145410090685, + "rewards/answer_entity_reward": 0.9983552694320679, + "rewards/answer_wer_reward": 0.9558849632740021, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9977796375751495, + "step": 646 + }, + { + "completion_length": 209.40625, + "epoch": 2.0672, + "grad_norm": 1.1015528440475464, + "kl": 0.057373046875, + "learning_rate": 1.9249999999999998e-07, + "loss": 0.0006, + "reward": 3.9535114765167236, + "reward_std": 0.0073295624461025, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9535112977027893, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 647 + }, + { + "completion_length": 247.15625, + "epoch": 2.0704, + "grad_norm": 5.493063449859619, + "kl": 0.052490234375, + "learning_rate": 1.9125e-07, + "loss": 0.0005, + "reward": 3.959768056869507, + "reward_std": 0.009880491998046637, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9597680270671844, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 648 + }, + { + "completion_length": 190.3125, + "epoch": 2.0736, + "grad_norm": 3.042928457260132, + "kl": 0.070556640625, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.0007, + "reward": 3.935302972793579, + "reward_std": 0.008418679004535079, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9707636535167694, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9784283638000488, + "step": 649 + }, + { + "completion_length": 240.1875, + "epoch": 2.0768, + "grad_norm": 1.1801666021347046, + "kl": 0.068359375, + "learning_rate": 1.8875e-07, + "loss": 0.0007, + "reward": 3.944392442703247, + "reward_std": 0.008859490510076284, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9443924725055695, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 650 + }, + { + "completion_length": 212.0625, + "epoch": 2.08, + "grad_norm": 1.1967086791992188, + "kl": 0.072021484375, + "learning_rate": 1.875e-07, + "loss": 0.0007, + "reward": 3.96494197845459, + "reward_std": 0.011900570709258318, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9674758613109589, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9974662065505981, + "step": 651 + }, + { + "completion_length": 179.90625, + "epoch": 2.0832, + "grad_norm": 2.0556278228759766, + "kl": 0.056640625, + "learning_rate": 1.8625e-07, + "loss": 0.0006, + "reward": 3.925339102745056, + "reward_std": 0.005963671952486038, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9453259110450745, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9800131618976593, + "step": 652 + }, + { + "completion_length": 232.1875, + "epoch": 2.0864, + "grad_norm": 1.1875349283218384, + "kl": 0.076171875, + "learning_rate": 1.85e-07, + "loss": 0.0008, + "reward": 3.9718481302261353, + "reward_std": 0.01158686971757561, + "rewards/answer_entity_reward": 0.9955128133296967, + "rewards/answer_wer_reward": 0.9763352572917938, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 653 + }, + { + "completion_length": 222.65625, + "epoch": 2.0896, + "grad_norm": 2.1682872772216797, + "kl": 0.09423828125, + "learning_rate": 1.8375e-07, + "loss": 0.0009, + "reward": 3.94124174118042, + "reward_std": 0.008590340381488204, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9508572518825531, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 654 + }, + { + "completion_length": 173.03125, + "epoch": 2.0928, + "grad_norm": 2.1240601539611816, + "kl": 0.066162109375, + "learning_rate": 1.825e-07, + "loss": 0.0007, + "reward": 3.9930202960968018, + "reward_std": 0.0026576630771160126, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9934512376785278, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9995689690113068, + "step": 655 + }, + { + "completion_length": 177.09375, + "epoch": 2.096, + "grad_norm": 4.589439868927002, + "kl": 0.083984375, + "learning_rate": 1.8124999999999999e-07, + "loss": 0.0008, + "reward": 3.7905973196029663, + "reward_std": 0.05029802396893501, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9605589509010315, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8300382792949677, + "step": 656 + }, + { + "completion_length": 182.5, + "epoch": 2.0992, + "grad_norm": 2.9955060482025146, + "kl": 0.0601806640625, + "learning_rate": 1.8e-07, + "loss": 0.0006, + "reward": 3.959343194961548, + "reward_std": 0.010165283223614097, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9634606242179871, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.995882511138916, + "step": 657 + }, + { + "completion_length": 247.53125, + "epoch": 2.1024, + "grad_norm": 6.366602897644043, + "kl": 0.2166748046875, + "learning_rate": 1.7874999999999998e-07, + "loss": 0.0022, + "reward": 3.95376193523407, + "reward_std": 0.007726241368800402, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9633772671222687, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 658 + }, + { + "completion_length": 212.8125, + "epoch": 2.1056, + "grad_norm": 1.1973211765289307, + "kl": 0.0445556640625, + "learning_rate": 1.775e-07, + "loss": 0.0004, + "reward": 3.979708194732666, + "reward_std": 0.007615833543241024, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9800336956977844, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996744692325592, + "step": 659 + }, + { + "completion_length": 244.65625, + "epoch": 2.1088, + "grad_norm": 1.237342357635498, + "kl": 0.063232421875, + "learning_rate": 1.7624999999999998e-07, + "loss": 0.0006, + "reward": 3.9267531633377075, + "reward_std": 0.01262162160128355, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.937911719083786, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984567761421204, + "step": 660 + }, + { + "completion_length": 211.46875, + "epoch": 2.112, + "grad_norm": 1.6842882633209229, + "kl": 0.0623779296875, + "learning_rate": 1.75e-07, + "loss": 0.0006, + "reward": 3.9610049724578857, + "reward_std": 0.008832846768200397, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9619665145874023, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990384578704834, + "step": 661 + }, + { + "completion_length": 208.6875, + "epoch": 2.1152, + "grad_norm": 1.8498320579528809, + "kl": 0.0687255859375, + "learning_rate": 1.7374999999999998e-07, + "loss": 0.0007, + "reward": 3.908181667327881, + "reward_std": 0.05270358338020742, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9462520182132721, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9654017686843872, + "step": 662 + }, + { + "completion_length": 220.15625, + "epoch": 2.1184, + "grad_norm": 1.3248109817504883, + "kl": 0.0576171875, + "learning_rate": 1.725e-07, + "loss": 0.0006, + "reward": 3.977890729904175, + "reward_std": 0.0048680840991437435, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9778908789157867, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 663 + }, + { + "completion_length": 203.125, + "epoch": 2.1216, + "grad_norm": 1.2837951183319092, + "kl": 0.0660400390625, + "learning_rate": 1.7125e-07, + "loss": 0.0007, + "reward": 3.951757311820984, + "reward_std": 0.01306973909959197, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9517573118209839, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 664 + }, + { + "completion_length": 234.71875, + "epoch": 2.1248, + "grad_norm": 1.2517513036727905, + "kl": 0.072265625, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.0007, + "reward": 3.932037830352783, + "reward_std": 0.018653371836990118, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9320378601551056, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 665 + }, + { + "completion_length": 154.4375, + "epoch": 2.128, + "grad_norm": 1.6812143325805664, + "kl": 0.057373046875, + "learning_rate": 1.6875e-07, + "loss": 0.0006, + "reward": 3.933722972869873, + "reward_std": 0.004374760144855827, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9603091180324554, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9817472994327545, + "step": 666 + }, + { + "completion_length": 194.375, + "epoch": 2.1312, + "grad_norm": 1.1369833946228027, + "kl": 0.10205078125, + "learning_rate": 1.675e-07, + "loss": 0.001, + "reward": 3.948467254638672, + "reward_std": 0.013669541105628014, + "rewards/answer_entity_reward": 0.9895833134651184, + "rewards/answer_wer_reward": 0.9588838517665863, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 667 + }, + { + "completion_length": 222.40625, + "epoch": 2.1344, + "grad_norm": 1.289441466331482, + "kl": 0.09716796875, + "learning_rate": 1.6625e-07, + "loss": 0.001, + "reward": 3.938557267189026, + "reward_std": 0.005478785838931799, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9577881693840027, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9807692170143127, + "step": 668 + }, + { + "completion_length": 185.71875, + "epoch": 2.1376, + "grad_norm": 1.9890272617340088, + "kl": 0.084716796875, + "learning_rate": 1.65e-07, + "loss": 0.0008, + "reward": 3.967849016189575, + "reward_std": 0.008760316297411919, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.967848926782608, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 669 + }, + { + "completion_length": 248.46875, + "epoch": 2.1408, + "grad_norm": 1.1813039779663086, + "kl": 0.074462890625, + "learning_rate": 1.6375e-07, + "loss": 0.0007, + "reward": 3.8907772302627563, + "reward_std": 0.07307082694023848, + "rewards/answer_entity_reward": 0.9749999940395355, + "rewards/answer_wer_reward": 0.915777176618576, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 670 + }, + { + "completion_length": 204.09375, + "epoch": 2.144, + "grad_norm": 1.4091624021530151, + "kl": 0.079833984375, + "learning_rate": 1.625e-07, + "loss": 0.0008, + "reward": 3.9357553720474243, + "reward_std": 0.018585966899991035, + "rewards/answer_entity_reward": 0.9924799501895905, + "rewards/answer_wer_reward": 0.9553823173046112, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9878930449485779, + "step": 671 + }, + { + "completion_length": 204.15625, + "epoch": 2.1471999999999998, + "grad_norm": 1.9349714517593384, + "kl": 0.0614013671875, + "learning_rate": 1.6125e-07, + "loss": 0.0006, + "reward": 3.963050127029419, + "reward_std": 0.011341096367686987, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9657188355922699, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997351765632629, + "step": 672 + }, + { + "completion_length": 183.1875, + "epoch": 2.1504, + "grad_norm": 3.866070508956909, + "kl": 0.1171875, + "learning_rate": 1.6e-07, + "loss": 0.0012, + "reward": 3.778456449508667, + "reward_std": 0.1051805429160595, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9563734233379364, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8244869709014893, + "step": 673 + }, + { + "completion_length": 237.875, + "epoch": 2.1536, + "grad_norm": 1.3984158039093018, + "kl": 0.0478515625, + "learning_rate": 1.5875e-07, + "loss": 0.0005, + "reward": 3.9681609869003296, + "reward_std": 0.007229159120470285, + "rewards/answer_entity_reward": 0.9981617629528046, + "rewards/answer_wer_reward": 0.9706325232982635, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993667006492615, + "step": 674 + }, + { + "completion_length": 201.9375, + "epoch": 2.1568, + "grad_norm": 4.475615501403809, + "kl": 0.06640625, + "learning_rate": 1.575e-07, + "loss": 0.0007, + "reward": 3.8558905124664307, + "reward_std": 0.0662167351692915, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9515935778617859, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.904296875, + "step": 675 + }, + { + "completion_length": 199.59375, + "epoch": 2.16, + "grad_norm": 1.3850592374801636, + "kl": 0.042236328125, + "learning_rate": 1.5624999999999999e-07, + "loss": 0.0004, + "reward": 3.9729303121566772, + "reward_std": 0.01144796540029347, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9750137031078339, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 676 + }, + { + "completion_length": 198.8125, + "epoch": 2.1632, + "grad_norm": 0.8988875150680542, + "kl": 0.0848388671875, + "learning_rate": 1.55e-07, + "loss": 0.0008, + "reward": 3.9634130001068115, + "reward_std": 0.016308533609844744, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.969995379447937, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996675550937653, + "step": 677 + }, + { + "completion_length": 242.0, + "epoch": 2.1664, + "grad_norm": 0.886544406414032, + "kl": 0.057861328125, + "learning_rate": 1.5374999999999998e-07, + "loss": 0.0006, + "reward": 3.9666435718536377, + "reward_std": 0.009206962306052446, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9666436016559601, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 678 + }, + { + "completion_length": 208.09375, + "epoch": 2.1696, + "grad_norm": 1.2104874849319458, + "kl": 0.0665283203125, + "learning_rate": 1.525e-07, + "loss": 0.0007, + "reward": 3.956413745880127, + "reward_std": 0.008385751629248261, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9564136564731598, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 679 + }, + { + "completion_length": 205.65625, + "epoch": 2.1728, + "grad_norm": 1.4340012073516846, + "kl": 0.0653076171875, + "learning_rate": 1.5124999999999998e-07, + "loss": 0.0007, + "reward": 3.9660589694976807, + "reward_std": 0.007518206490203738, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9666839838027954, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993749856948853, + "step": 680 + }, + { + "completion_length": 243.875, + "epoch": 2.176, + "grad_norm": 2.6693804264068604, + "kl": 0.0611572265625, + "learning_rate": 1.5e-07, + "loss": 0.0006, + "reward": 3.9342352151870728, + "reward_std": 0.0278960638679564, + "rewards/answer_entity_reward": 0.9851190745830536, + "rewards/answer_wer_reward": 0.9509375989437103, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9981784820556641, + "step": 681 + }, + { + "completion_length": 247.875, + "epoch": 2.1792, + "grad_norm": 0.978139340877533, + "kl": 0.050537109375, + "learning_rate": 1.4874999999999998e-07, + "loss": 0.0005, + "reward": 3.9769967794418335, + "reward_std": 0.006702936254441738, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9769968390464783, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 682 + }, + { + "completion_length": 222.5625, + "epoch": 2.1824, + "grad_norm": 1.382318139076233, + "kl": 0.065185546875, + "learning_rate": 1.475e-07, + "loss": 0.0007, + "reward": 3.9492597579956055, + "reward_std": 0.008544785436242819, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9507622122764587, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984976053237915, + "step": 683 + }, + { + "completion_length": 219.71875, + "epoch": 2.1856, + "grad_norm": 2.196531057357788, + "kl": 0.0595703125, + "learning_rate": 1.4624999999999998e-07, + "loss": 0.0006, + "reward": 3.9446985721588135, + "reward_std": 0.014558171853423119, + "rewards/answer_entity_reward": 0.9813033938407898, + "rewards/answer_wer_reward": 0.9633950591087341, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 684 + }, + { + "completion_length": 219.5, + "epoch": 2.1888, + "grad_norm": 1.4868621826171875, + "kl": 0.07177734375, + "learning_rate": 1.45e-07, + "loss": 0.0007, + "reward": 3.9446860551834106, + "reward_std": 0.010166772175580263, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9451901018619537, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994959831237793, + "step": 685 + }, + { + "completion_length": 261.59375, + "epoch": 2.192, + "grad_norm": 0.8591821789741516, + "kl": 0.0595703125, + "learning_rate": 1.4374999999999997e-07, + "loss": 0.0006, + "reward": 3.9277877807617188, + "reward_std": 0.010211648885160685, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.930150032043457, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9976378083229065, + "step": 686 + }, + { + "completion_length": 205.0625, + "epoch": 2.1952, + "grad_norm": 0.924826443195343, + "kl": 0.0703125, + "learning_rate": 1.4249999999999999e-07, + "loss": 0.0007, + "reward": 3.9727468490600586, + "reward_std": 0.006501165917143226, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.972746878862381, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 687 + }, + { + "completion_length": 197.625, + "epoch": 2.1984, + "grad_norm": 1.508520483970642, + "kl": 0.092041015625, + "learning_rate": 1.4124999999999997e-07, + "loss": 0.0009, + "reward": 3.9627835750579834, + "reward_std": 0.010947544127702713, + "rewards/answer_entity_reward": 0.9930555522441864, + "rewards/answer_wer_reward": 0.9707047045230865, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9990234375, + "step": 688 + }, + { + "completion_length": 205.09375, + "epoch": 2.2016, + "grad_norm": 2.3478713035583496, + "kl": 0.0712890625, + "learning_rate": 1.4e-07, + "loss": 0.0007, + "reward": 3.933359384536743, + "reward_std": 0.008363787084817886, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9626152515411377, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9846329689025879, + "step": 689 + }, + { + "completion_length": 225.03125, + "epoch": 2.2048, + "grad_norm": 1.3916107416152954, + "kl": 0.058837890625, + "learning_rate": 1.3875e-07, + "loss": 0.0006, + "reward": 3.9732636213302612, + "reward_std": 0.009609260130673647, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9732636511325836, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 690 + }, + { + "completion_length": 152.59375, + "epoch": 2.208, + "grad_norm": 1.322786808013916, + "kl": 0.0557861328125, + "learning_rate": 1.375e-07, + "loss": 0.0006, + "reward": 3.8575568199157715, + "reward_std": 0.011282142717391253, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9596264958381653, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9003343284130096, + "step": 691 + }, + { + "completion_length": 162.71875, + "epoch": 2.2112, + "grad_norm": 0.7846171855926514, + "kl": 0.0657958984375, + "learning_rate": 1.3625e-07, + "loss": 0.0007, + "reward": 3.9684951305389404, + "reward_std": 0.013251218944787979, + "rewards/answer_entity_reward": 0.9910714328289032, + "rewards/answer_wer_reward": 0.9774238169193268, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 692 + }, + { + "completion_length": 207.65625, + "epoch": 2.2144, + "grad_norm": 1.7230638265609741, + "kl": 0.1243896484375, + "learning_rate": 1.35e-07, + "loss": 0.0012, + "reward": 3.9475139379501343, + "reward_std": 0.00949817756190896, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9486435055732727, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998870462179184, + "step": 693 + }, + { + "completion_length": 245.96875, + "epoch": 2.2176, + "grad_norm": 1.5247471332550049, + "kl": 0.061767578125, + "learning_rate": 1.3375e-07, + "loss": 0.0006, + "reward": 3.947926878929138, + "reward_std": 0.014066703617572784, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9513991177082062, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 694 + }, + { + "completion_length": 222.5625, + "epoch": 2.2208, + "grad_norm": 1.5721601247787476, + "kl": 0.0782470703125, + "learning_rate": 1.325e-07, + "loss": 0.0008, + "reward": 3.903387188911438, + "reward_std": 0.005873196758329868, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9640650153160095, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9393221139907837, + "step": 695 + }, + { + "completion_length": 187.03125, + "epoch": 2.224, + "grad_norm": 1.1470870971679688, + "kl": 0.0457763671875, + "learning_rate": 1.3125e-07, + "loss": 0.0005, + "reward": 3.9857735633850098, + "reward_std": 0.003898413386195898, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9857736229896545, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 696 + }, + { + "completion_length": 202.84375, + "epoch": 2.2272, + "grad_norm": 2.00569486618042, + "kl": 0.077392578125, + "learning_rate": 1.3e-07, + "loss": 0.0008, + "reward": 3.9439765214920044, + "reward_std": 0.00677294097840786, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9666953980922699, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9772810935974121, + "step": 697 + }, + { + "completion_length": 200.875, + "epoch": 2.2304, + "grad_norm": 0.5203324556350708, + "kl": 0.0533447265625, + "learning_rate": 1.2874999999999998e-07, + "loss": 0.0005, + "reward": 3.981989622116089, + "reward_std": 0.003249130444601178, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9819895327091217, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 698 + }, + { + "completion_length": 229.5, + "epoch": 2.2336, + "grad_norm": 1.028457760810852, + "kl": 0.0615234375, + "learning_rate": 1.275e-07, + "loss": 0.0006, + "reward": 3.9699747562408447, + "reward_std": 0.007223621942102909, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9699748456478119, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 699 + }, + { + "completion_length": 183.375, + "epoch": 2.2368, + "grad_norm": 1.1010169982910156, + "kl": 0.09619140625, + "learning_rate": 1.2624999999999998e-07, + "loss": 0.001, + "reward": 3.9709969758987427, + "reward_std": 0.013876417418941855, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9763848185539246, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.994612067937851, + "step": 700 + }, + { + "completion_length": 192.6875, + "epoch": 2.24, + "grad_norm": 1.9254510402679443, + "kl": 0.126708984375, + "learning_rate": 1.25e-07, + "loss": 0.0013, + "reward": 3.9508676528930664, + "reward_std": 0.007698251400142908, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9661648571491241, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9943181872367859, + "step": 701 + }, + { + "completion_length": 206.6875, + "epoch": 2.2432, + "grad_norm": 4.035684108734131, + "kl": 0.04833984375, + "learning_rate": 1.2375e-07, + "loss": 0.0005, + "reward": 3.9621732234954834, + "reward_std": 0.007325239945203066, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.977934330701828, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9842387735843658, + "step": 702 + }, + { + "completion_length": 240.78125, + "epoch": 2.2464, + "grad_norm": 1.4605140686035156, + "kl": 0.0582275390625, + "learning_rate": 1.225e-07, + "loss": 0.0006, + "reward": 3.951379179954529, + "reward_std": 0.005893495166674256, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9543100893497467, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9970689713954926, + "step": 703 + }, + { + "completion_length": 190.84375, + "epoch": 2.2496, + "grad_norm": 0.8877372741699219, + "kl": 0.064453125, + "learning_rate": 1.2125e-07, + "loss": 0.0007, + "reward": 3.9827821254730225, + "reward_std": 0.003501511411741376, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.983114629983902, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996675550937653, + "step": 704 + }, + { + "completion_length": 169.875, + "epoch": 2.2528, + "grad_norm": 4.669096946716309, + "kl": 0.0634765625, + "learning_rate": 1.2e-07, + "loss": 0.0006, + "reward": 3.9501044750213623, + "reward_std": 0.00536915916018188, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9683522582054138, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9817522466182709, + "step": 705 + }, + { + "completion_length": 208.34375, + "epoch": 2.2560000000000002, + "grad_norm": 2.4436697959899902, + "kl": 0.072998046875, + "learning_rate": 1.1874999999999999e-07, + "loss": 0.0007, + "reward": 3.95159912109375, + "reward_std": 0.012246299302205443, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9697677791118622, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9818313717842102, + "step": 706 + }, + { + "completion_length": 253.34375, + "epoch": 2.2592, + "grad_norm": 0.6258556842803955, + "kl": 0.0625, + "learning_rate": 1.1749999999999999e-07, + "loss": 0.0006, + "reward": 3.943672776222229, + "reward_std": 0.004726027720607817, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9436727464199066, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 707 + }, + { + "completion_length": 187.96875, + "epoch": 2.2624, + "grad_norm": 2.1608188152313232, + "kl": 0.09521484375, + "learning_rate": 1.1625e-07, + "loss": 0.0009, + "reward": 3.9321788549423218, + "reward_std": 0.021823766641318798, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9405494034290314, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.99609375, + "step": 708 + }, + { + "completion_length": 201.0625, + "epoch": 2.2656, + "grad_norm": 5.012310028076172, + "kl": 0.04071044921875, + "learning_rate": 1.15e-07, + "loss": 0.0004, + "reward": 3.9624879360198975, + "reward_std": 0.01549163879826665, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9760953187942505, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.989864856004715, + "step": 709 + }, + { + "completion_length": 238.71875, + "epoch": 2.2688, + "grad_norm": 1.1021510362625122, + "kl": 0.08154296875, + "learning_rate": 1.1375e-07, + "loss": 0.0008, + "reward": 3.9332664012908936, + "reward_std": 0.015113649424165487, + "rewards/answer_entity_reward": 0.9832701981067657, + "rewards/answer_wer_reward": 0.9499962031841278, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 710 + }, + { + "completion_length": 220.90625, + "epoch": 2.2720000000000002, + "grad_norm": 1.1716574430465698, + "kl": 0.053466796875, + "learning_rate": 1.125e-07, + "loss": 0.0005, + "reward": 3.9751139879226685, + "reward_std": 0.007001735270023346, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9751139879226685, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 711 + }, + { + "completion_length": 241.40625, + "epoch": 2.2752, + "grad_norm": 1.469359278678894, + "kl": 0.07275390625, + "learning_rate": 1.1125e-07, + "loss": 0.0007, + "reward": 3.898247718811035, + "reward_std": 0.039173625875264406, + "rewards/answer_entity_reward": 0.984375, + "rewards/answer_wer_reward": 0.9162905812263489, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9975819885730743, + "step": 712 + }, + { + "completion_length": 205.25, + "epoch": 2.2784, + "grad_norm": 0.7749589085578918, + "kl": 0.0621337890625, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.0006, + "reward": 3.9739962816238403, + "reward_std": 0.0056007420644164085, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9743727445602417, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996234774589539, + "step": 713 + }, + { + "completion_length": 206.125, + "epoch": 2.2816, + "grad_norm": 0.5464848875999451, + "kl": 0.04901123046875, + "learning_rate": 1.0874999999999999e-07, + "loss": 0.0005, + "reward": 3.95177161693573, + "reward_std": 0.004434725036844611, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9615707993507385, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9998161792755127, + "step": 714 + }, + { + "completion_length": 169.09375, + "epoch": 2.2848, + "grad_norm": 3.133605480194092, + "kl": 0.06689453125, + "learning_rate": 1.0749999999999999e-07, + "loss": 0.0007, + "reward": 3.929832339286804, + "reward_std": 0.01732827629894018, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9689165651798248, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.960915744304657, + "step": 715 + }, + { + "completion_length": 204.59375, + "epoch": 2.288, + "grad_norm": 0.7156680822372437, + "kl": 0.06884765625, + "learning_rate": 1.0624999999999999e-07, + "loss": 0.0007, + "reward": 3.976062059402466, + "reward_std": 0.0025083101354539394, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9836839437484741, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9923780560493469, + "step": 716 + }, + { + "completion_length": 210.59375, + "epoch": 2.2912, + "grad_norm": 284.2210998535156, + "kl": 0.1416015625, + "learning_rate": 1.0499999999999999e-07, + "loss": 0.0014, + "reward": 3.9028064012527466, + "reward_std": 0.016830324195325375, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9614686369895935, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9413377344608307, + "step": 717 + }, + { + "completion_length": 232.53125, + "epoch": 2.2944, + "grad_norm": 1.077739953994751, + "kl": 0.08544921875, + "learning_rate": 1.0374999999999999e-07, + "loss": 0.0009, + "reward": 3.9475821256637573, + "reward_std": 0.011592368595302105, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9478915929794312, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999690592288971, + "step": 718 + }, + { + "completion_length": 217.875, + "epoch": 2.2976, + "grad_norm": 2.2114531993865967, + "kl": 0.195068359375, + "learning_rate": 1.0249999999999998e-07, + "loss": 0.002, + "reward": 3.941352367401123, + "reward_std": 0.00652403780259192, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9605833292007446, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9807692170143127, + "step": 719 + }, + { + "completion_length": 241.875, + "epoch": 2.3008, + "grad_norm": 2.330026865005493, + "kl": 0.10693359375, + "learning_rate": 1.0125e-07, + "loss": 0.0011, + "reward": 3.8385108709335327, + "reward_std": 0.0217201872728765, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9317739605903625, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9067369103431702, + "step": 720 + }, + { + "completion_length": 148.15625, + "epoch": 2.304, + "grad_norm": 6.020991802215576, + "kl": 0.0804443359375, + "learning_rate": 1e-07, + "loss": 0.0008, + "reward": 3.9653271436691284, + "reward_std": 0.010471278452314436, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9677309989929199, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 721 + }, + { + "completion_length": 242.34375, + "epoch": 2.3072, + "grad_norm": 1.3827441930770874, + "kl": 0.0606689453125, + "learning_rate": 9.875e-08, + "loss": 0.0006, + "reward": 3.9477760791778564, + "reward_std": 0.017027822323143482, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9524165093898773, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988317787647247, + "step": 722 + }, + { + "completion_length": 182.875, + "epoch": 2.3104, + "grad_norm": 0.6132823824882507, + "kl": 0.0732421875, + "learning_rate": 9.749999999999999e-08, + "loss": 0.0007, + "reward": 3.9824774265289307, + "reward_std": 0.0017756590968929231, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9837089478969574, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9987684786319733, + "step": 723 + }, + { + "completion_length": 259.21875, + "epoch": 2.3136, + "grad_norm": 1.0919182300567627, + "kl": 0.052001953125, + "learning_rate": 9.624999999999999e-08, + "loss": 0.0005, + "reward": 3.9247913360595703, + "reward_std": 0.0157609935849905, + "rewards/answer_entity_reward": 0.9692307412624359, + "rewards/answer_wer_reward": 0.9555604159832001, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 724 + }, + { + "completion_length": 243.21875, + "epoch": 2.3168, + "grad_norm": 1.7886172533035278, + "kl": 0.04718017578125, + "learning_rate": 9.499999999999999e-08, + "loss": 0.0005, + "reward": 3.9662917852401733, + "reward_std": 0.005910404259338975, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9665379524230957, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997539520263672, + "step": 725 + }, + { + "completion_length": 201.65625, + "epoch": 2.32, + "grad_norm": 1.3444185256958008, + "kl": 0.0606689453125, + "learning_rate": 9.375e-08, + "loss": 0.0006, + "reward": 3.9709818363189697, + "reward_std": 0.00892023229971528, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9738226532936096, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 726 + }, + { + "completion_length": 224.1875, + "epoch": 2.3232, + "grad_norm": 4.107091426849365, + "kl": 0.229736328125, + "learning_rate": 9.25e-08, + "loss": 0.0023, + "reward": 3.9483840465545654, + "reward_std": 0.013201091904193163, + "rewards/answer_entity_reward": 0.9927884340286255, + "rewards/answer_wer_reward": 0.955822080373764, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997735619544983, + "step": 727 + }, + { + "completion_length": 189.75, + "epoch": 2.3264, + "grad_norm": 1.512626051902771, + "kl": 0.0589599609375, + "learning_rate": 9.125e-08, + "loss": 0.0006, + "reward": 3.9542768001556396, + "reward_std": 0.008582692593336105, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9702657759189606, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.984011173248291, + "step": 728 + }, + { + "completion_length": 172.8125, + "epoch": 2.3296, + "grad_norm": 4.1475830078125, + "kl": 0.110107421875, + "learning_rate": 9e-08, + "loss": 0.0011, + "reward": 3.9462348222732544, + "reward_std": 0.009323009755462408, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9772224724292755, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9690123498439789, + "step": 729 + }, + { + "completion_length": 198.84375, + "epoch": 2.3327999999999998, + "grad_norm": 1.3541475534439087, + "kl": 0.045166015625, + "learning_rate": 8.875e-08, + "loss": 0.0005, + "reward": 3.9697635173797607, + "reward_std": 0.00771446293219924, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9707715511322021, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989919364452362, + "step": 730 + }, + { + "completion_length": 217.71875, + "epoch": 2.336, + "grad_norm": 1.2064177989959717, + "kl": 0.05908203125, + "learning_rate": 8.75e-08, + "loss": 0.0006, + "reward": 3.9431110620498657, + "reward_std": 0.01243708049878478, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9462102055549622, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9969007968902588, + "step": 731 + }, + { + "completion_length": 208.0, + "epoch": 2.3392, + "grad_norm": 1.1856428384780884, + "kl": 0.048095703125, + "learning_rate": 8.625e-08, + "loss": 0.0005, + "reward": 3.955425500869751, + "reward_std": 0.013023892883211374, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9704216420650482, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9850037693977356, + "step": 732 + }, + { + "completion_length": 230.46875, + "epoch": 2.3424, + "grad_norm": 7.96836519241333, + "kl": 0.0765380859375, + "learning_rate": 8.500000000000001e-08, + "loss": 0.0008, + "reward": 3.8350234031677246, + "reward_std": 0.0071187918074429035, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9640994668006897, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.8709239065647125, + "step": 733 + }, + { + "completion_length": 240.46875, + "epoch": 2.3456, + "grad_norm": 1.9817602634429932, + "kl": 0.067138671875, + "learning_rate": 8.375e-08, + "loss": 0.0007, + "reward": 3.8598886728286743, + "reward_std": 0.009870891459286213, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9313421249389648, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9285465180873871, + "step": 734 + }, + { + "completion_length": 232.625, + "epoch": 2.3487999999999998, + "grad_norm": 1.4039250612258911, + "kl": 0.05126953125, + "learning_rate": 8.25e-08, + "loss": 0.0005, + "reward": 3.9484113454818726, + "reward_std": 0.011133690131828189, + "rewards/answer_entity_reward": 0.9971590936183929, + "rewards/answer_wer_reward": 0.9577626585960388, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9934895932674408, + "step": 735 + }, + { + "completion_length": 168.15625, + "epoch": 2.352, + "grad_norm": 0.8416581153869629, + "kl": 0.068359375, + "learning_rate": 8.125e-08, + "loss": 0.0007, + "reward": 3.9322515726089478, + "reward_std": 0.002792949788272381, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9946084916591644, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9376430511474609, + "step": 736 + }, + { + "completion_length": 232.375, + "epoch": 2.3552, + "grad_norm": 1.3709439039230347, + "kl": 0.068359375, + "learning_rate": 8e-08, + "loss": 0.0007, + "reward": 3.9093856811523438, + "reward_std": 0.0034298759419471025, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.971885621547699, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9375, + "step": 737 + }, + { + "completion_length": 201.15625, + "epoch": 2.3584, + "grad_norm": 0.9587724804878235, + "kl": 0.0657958984375, + "learning_rate": 7.875e-08, + "loss": 0.0007, + "reward": 3.960189461708069, + "reward_std": 0.017379604279994965, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9656778275966644, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9979838728904724, + "step": 738 + }, + { + "completion_length": 204.3125, + "epoch": 2.3616, + "grad_norm": 1.5729237794876099, + "kl": 0.075439453125, + "learning_rate": 7.75e-08, + "loss": 0.0007, + "reward": 3.9626389741897583, + "reward_std": 0.01823890022933483, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9661112725734711, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 739 + }, + { + "completion_length": 239.875, + "epoch": 2.3648, + "grad_norm": 0.9296643733978271, + "kl": 0.064208984375, + "learning_rate": 7.625e-08, + "loss": 0.0006, + "reward": 3.968054413795471, + "reward_std": 0.0051011774921789765, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9680543541908264, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 740 + }, + { + "completion_length": 242.71875, + "epoch": 2.368, + "grad_norm": 0.9536841511726379, + "kl": 0.0606689453125, + "learning_rate": 7.5e-08, + "loss": 0.0006, + "reward": 3.9280422925949097, + "reward_std": 0.005676981760188937, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9430340826511383, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988970458507538, + "step": 741 + }, + { + "completion_length": 239.59375, + "epoch": 2.3712, + "grad_norm": 1.1191787719726562, + "kl": 0.0565185546875, + "learning_rate": 7.375e-08, + "loss": 0.0006, + "reward": 3.9627801179885864, + "reward_std": 0.004723543883301318, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9627801775932312, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 742 + }, + { + "completion_length": 198.125, + "epoch": 2.3744, + "grad_norm": 19.45572280883789, + "kl": 0.0677490234375, + "learning_rate": 7.25e-08, + "loss": 0.0007, + "reward": 3.8835959434509277, + "reward_std": 0.0259452061727643, + "rewards/answer_entity_reward": 0.9975961446762085, + "rewards/answer_wer_reward": 0.9579322040081024, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9280676245689392, + "step": 743 + }, + { + "completion_length": 176.875, + "epoch": 2.3776, + "grad_norm": 2.2377281188964844, + "kl": 0.090087890625, + "learning_rate": 7.124999999999999e-08, + "loss": 0.0009, + "reward": 3.9422539472579956, + "reward_std": 0.039653101935982704, + "rewards/answer_entity_reward": 0.9895833134651184, + "rewards/answer_wer_reward": 0.9664814472198486, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.986189216375351, + "step": 744 + }, + { + "completion_length": 229.1875, + "epoch": 2.3808, + "grad_norm": 1.561314344406128, + "kl": 0.0491943359375, + "learning_rate": 7e-08, + "loss": 0.0005, + "reward": 3.8669506311416626, + "reward_std": 0.19146580225788057, + "rewards/answer_entity_reward": 0.96875, + "rewards/answer_wer_reward": 0.9294506311416626, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 1.0, + "step": 745 + }, + { + "completion_length": 192.875, + "epoch": 2.384, + "grad_norm": 1.9305033683776855, + "kl": 0.078857421875, + "learning_rate": 6.875e-08, + "loss": 0.0008, + "reward": 3.944983959197998, + "reward_std": 0.012190061155706644, + "rewards/answer_entity_reward": 0.9979166686534882, + "rewards/answer_wer_reward": 0.9473004341125488, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997667968273163, + "step": 746 + }, + { + "completion_length": 214.75, + "epoch": 2.3872, + "grad_norm": 13.16278076171875, + "kl": 0.0552978515625, + "learning_rate": 6.75e-08, + "loss": 0.0006, + "reward": 3.981534004211426, + "reward_std": 0.016841471777297556, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.989596426486969, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.991937518119812, + "step": 747 + }, + { + "completion_length": 202.65625, + "epoch": 2.3904, + "grad_norm": 1.269473671913147, + "kl": 0.0595703125, + "learning_rate": 6.625e-08, + "loss": 0.0006, + "reward": 3.9539172649383545, + "reward_std": 0.006352424388751388, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9545792937278748, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993380010128021, + "step": 748 + }, + { + "completion_length": 241.9375, + "epoch": 2.3936, + "grad_norm": 0.799062192440033, + "kl": 0.08447265625, + "learning_rate": 6.5e-08, + "loss": 0.0008, + "reward": 3.968814492225647, + "reward_std": 0.0058513006661087275, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9696769118309021, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9991375803947449, + "step": 749 + }, + { + "completion_length": 175.90625, + "epoch": 2.3968, + "grad_norm": 1.7988041639328003, + "kl": 0.06201171875, + "learning_rate": 6.375e-08, + "loss": 0.0006, + "reward": 3.9838857650756836, + "reward_std": 0.0046576057793572545, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9841121137142181, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997735619544983, + "step": 750 + }, + { + "completion_length": 215.59375, + "epoch": 2.4, + "grad_norm": 2.852858781814575, + "kl": 0.0533447265625, + "learning_rate": 6.25e-08, + "loss": 0.0005, + "reward": 3.943244457244873, + "reward_std": 0.03492546791676432, + "rewards/answer_entity_reward": 0.9943181872367859, + "rewards/answer_wer_reward": 0.9835853576660156, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9653409123420715, + "step": 751 + }, + { + "completion_length": 238.5, + "epoch": 2.4032, + "grad_norm": 12.164900779724121, + "kl": 0.0615234375, + "learning_rate": 6.125e-08, + "loss": 0.0006, + "reward": 3.9755419492721558, + "reward_std": 0.010625506052747369, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9782145917415619, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9973272979259491, + "step": 752 + }, + { + "completion_length": 179.15625, + "epoch": 2.4064, + "grad_norm": 0.9550566077232361, + "kl": 0.0693359375, + "learning_rate": 6e-08, + "loss": 0.0007, + "reward": 3.954240560531616, + "reward_std": 0.011055386741645634, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9720976054668427, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9821428656578064, + "step": 753 + }, + { + "completion_length": 216.0625, + "epoch": 2.4096, + "grad_norm": 1.3647923469543457, + "kl": 0.0582275390625, + "learning_rate": 5.8749999999999993e-08, + "loss": 0.0006, + "reward": 3.962032198905945, + "reward_std": 0.008129856083542109, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9623997509479523, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996323585510254, + "step": 754 + }, + { + "completion_length": 221.8125, + "epoch": 2.4128, + "grad_norm": 1.9497917890548706, + "kl": 0.0604248046875, + "learning_rate": 5.75e-08, + "loss": 0.0006, + "reward": 3.9653851985931396, + "reward_std": 0.02012356440536678, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9722139835357666, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9994212985038757, + "step": 755 + }, + { + "completion_length": 198.4375, + "epoch": 2.416, + "grad_norm": 0.6684221029281616, + "kl": 0.07568359375, + "learning_rate": 5.625e-08, + "loss": 0.0008, + "reward": 3.942944049835205, + "reward_std": 0.008921493077650666, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9674927294254303, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9754513502120972, + "step": 756 + }, + { + "completion_length": 234.875, + "epoch": 2.4192, + "grad_norm": 1.097367525100708, + "kl": 0.1142578125, + "learning_rate": 5.4999999999999996e-08, + "loss": 0.0011, + "reward": 3.9485758543014526, + "reward_std": 0.01669642748311162, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9544399976730347, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9986002445220947, + "step": 757 + }, + { + "completion_length": 150.46875, + "epoch": 2.4224, + "grad_norm": 0.21660760045051575, + "kl": 0.0321044921875, + "learning_rate": 5.3749999999999995e-08, + "loss": 0.0003, + "reward": 3.978167176246643, + "reward_std": 0.0010678768157958984, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9781671762466431, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 758 + }, + { + "completion_length": 231.25, + "epoch": 2.4256, + "grad_norm": 3.330300807952881, + "kl": 0.078857421875, + "learning_rate": 5.2499999999999994e-08, + "loss": 0.0008, + "reward": 3.9418994188308716, + "reward_std": 0.007436740444973111, + "rewards/answer_entity_reward": 0.9861111044883728, + "rewards/answer_wer_reward": 0.9557883143424988, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 759 + }, + { + "completion_length": 212.5, + "epoch": 2.4288, + "grad_norm": 3.427900791168213, + "kl": 0.13525390625, + "learning_rate": 5.124999999999999e-08, + "loss": 0.0014, + "reward": 3.9013478755950928, + "reward_std": 0.030906156171113253, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9625242948532104, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9388234913349152, + "step": 760 + }, + { + "completion_length": 218.90625, + "epoch": 2.432, + "grad_norm": 1.3307231664657593, + "kl": 0.0567626953125, + "learning_rate": 5e-08, + "loss": 0.0006, + "reward": 3.9774084091186523, + "reward_std": 0.0034683155827224255, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9774083495140076, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 761 + }, + { + "completion_length": 207.875, + "epoch": 2.4352, + "grad_norm": 0.7475162148475647, + "kl": 0.057373046875, + "learning_rate": 4.8749999999999996e-08, + "loss": 0.0006, + "reward": 3.9419760704040527, + "reward_std": 0.004616708727553487, + "rewards/answer_entity_reward": 0.9788995385169983, + "rewards/answer_wer_reward": 0.9634398818016052, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996366202831268, + "step": 762 + }, + { + "completion_length": 195.28125, + "epoch": 2.4384, + "grad_norm": 2.0728979110717773, + "kl": 0.0966796875, + "learning_rate": 4.7499999999999995e-08, + "loss": 0.001, + "reward": 3.944322109222412, + "reward_std": 0.017246471252292395, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9587452709674835, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9855769276618958, + "step": 763 + }, + { + "completion_length": 224.9375, + "epoch": 2.4416, + "grad_norm": 1.2122951745986938, + "kl": 0.1226806640625, + "learning_rate": 4.625e-08, + "loss": 0.0012, + "reward": 3.9620940685272217, + "reward_std": 0.007986569311469793, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9652903079986572, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.996803879737854, + "step": 764 + }, + { + "completion_length": 249.03125, + "epoch": 2.4448, + "grad_norm": 1.21713125705719, + "kl": 0.065673828125, + "learning_rate": 4.5e-08, + "loss": 0.0007, + "reward": 3.9346585273742676, + "reward_std": 0.006481441203504801, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9346585869789124, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 765 + }, + { + "completion_length": 239.875, + "epoch": 2.448, + "grad_norm": 5.105895519256592, + "kl": 0.0665283203125, + "learning_rate": 4.375e-08, + "loss": 0.0007, + "reward": 3.916127324104309, + "reward_std": 0.02047336893156171, + "rewards/answer_entity_reward": 0.9910714626312256, + "rewards/answer_wer_reward": 0.9319192171096802, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9931367635726929, + "step": 766 + }, + { + "completion_length": 216.84375, + "epoch": 2.4512, + "grad_norm": 3.230001449584961, + "kl": 0.0445556640625, + "learning_rate": 4.2500000000000003e-08, + "loss": 0.0004, + "reward": 3.9800050258636475, + "reward_std": 0.004955247277393937, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9800049960613251, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 767 + }, + { + "completion_length": 229.8125, + "epoch": 2.4544, + "grad_norm": 1.2354313135147095, + "kl": 0.0478515625, + "learning_rate": 4.125e-08, + "loss": 0.0005, + "reward": 3.9553003311157227, + "reward_std": 0.013880819431506097, + "rewards/answer_entity_reward": 0.9826388955116272, + "rewards/answer_wer_reward": 0.9739912152290344, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.998670220375061, + "step": 768 + }, + { + "completion_length": 248.34375, + "epoch": 2.4576000000000002, + "grad_norm": 0.8089145421981812, + "kl": 0.06005859375, + "learning_rate": 4e-08, + "loss": 0.0006, + "reward": 3.9643748998641968, + "reward_std": 0.007618119474500418, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9654783606529236, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9988966286182404, + "step": 769 + }, + { + "completion_length": 233.53125, + "epoch": 2.4608, + "grad_norm": 1.2253531217575073, + "kl": 0.0540771484375, + "learning_rate": 3.875e-08, + "loss": 0.0005, + "reward": 3.955801010131836, + "reward_std": 0.007193901808932424, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9558009505271912, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 770 + }, + { + "completion_length": 246.25, + "epoch": 2.464, + "grad_norm": 0.8907082080841064, + "kl": 0.0740966796875, + "learning_rate": 3.75e-08, + "loss": 0.0007, + "reward": 3.9567649364471436, + "reward_std": 0.007558103417977691, + "rewards/answer_entity_reward": 0.9926470518112183, + "rewards/answer_wer_reward": 0.9644212424755096, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996966123580933, + "step": 771 + }, + { + "completion_length": 157.84375, + "epoch": 2.4672, + "grad_norm": 0.6787045001983643, + "kl": 0.080078125, + "learning_rate": 3.625e-08, + "loss": 0.0008, + "reward": 3.989119529724121, + "reward_std": 0.0026377947069704533, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9893985092639923, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997209906578064, + "step": 772 + }, + { + "completion_length": 228.75, + "epoch": 2.4704, + "grad_norm": 0.6448482275009155, + "kl": 0.0562744140625, + "learning_rate": 3.5e-08, + "loss": 0.0006, + "reward": 3.960241913795471, + "reward_std": 0.005235916236415505, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.960241824388504, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 773 + }, + { + "completion_length": 226.5, + "epoch": 2.4736000000000002, + "grad_norm": 0.9646191596984863, + "kl": 0.05224609375, + "learning_rate": 3.375e-08, + "loss": 0.0005, + "reward": 3.9351943731307983, + "reward_std": 0.015792422462254763, + "rewards/answer_entity_reward": 0.9866071343421936, + "rewards/answer_wer_reward": 0.9691915214061737, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9793955981731415, + "step": 774 + }, + { + "completion_length": 249.53125, + "epoch": 2.4768, + "grad_norm": 2.9048826694488525, + "kl": 0.0540771484375, + "learning_rate": 3.25e-08, + "loss": 0.0005, + "reward": 3.952099561691284, + "reward_std": 0.00629690324421972, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9520994424819946, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 775 + }, + { + "completion_length": 222.21875, + "epoch": 2.48, + "grad_norm": 1.1555320024490356, + "kl": 0.0548095703125, + "learning_rate": 3.125e-08, + "loss": 0.0005, + "reward": 3.9609912633895874, + "reward_std": 0.017560790292918682, + "rewards/answer_entity_reward": 0.9927884340286255, + "rewards/answer_wer_reward": 0.9682029485702515, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 776 + }, + { + "completion_length": 245.46875, + "epoch": 2.4832, + "grad_norm": 2.5107345581054688, + "kl": 0.098388671875, + "learning_rate": 3e-08, + "loss": 0.001, + "reward": 3.9294867515563965, + "reward_std": 0.009384696371853352, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.932422935962677, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9970638751983643, + "step": 777 + }, + { + "completion_length": 175.1875, + "epoch": 2.4864, + "grad_norm": 3.319678783416748, + "kl": 0.06640625, + "learning_rate": 2.875e-08, + "loss": 0.0007, + "reward": 3.9766006469726562, + "reward_std": 0.005283091915771365, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9770888686180115, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.99951171875, + "step": 778 + }, + { + "completion_length": 215.8125, + "epoch": 2.4896, + "grad_norm": 1.7188315391540527, + "kl": 0.058837890625, + "learning_rate": 2.7499999999999998e-08, + "loss": 0.0006, + "reward": 3.945501208305359, + "reward_std": 0.006351021584123373, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9457343518733978, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9997667968273163, + "step": 779 + }, + { + "completion_length": 204.28125, + "epoch": 2.4928, + "grad_norm": 1.284071683883667, + "kl": 0.0640869140625, + "learning_rate": 2.6249999999999997e-08, + "loss": 0.0006, + "reward": 3.9768584966659546, + "reward_std": 0.003316762624308467, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9779550433158875, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989035129547119, + "step": 780 + }, + { + "completion_length": 216.71875, + "epoch": 2.496, + "grad_norm": 1.442418098449707, + "kl": 0.067138671875, + "learning_rate": 2.5e-08, + "loss": 0.0007, + "reward": 3.945361614227295, + "reward_std": 0.03020885493606329, + "rewards/answer_entity_reward": 0.9867424070835114, + "rewards/answer_wer_reward": 0.9586191177368164, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 781 + }, + { + "completion_length": 199.28125, + "epoch": 2.4992, + "grad_norm": 2.220127582550049, + "kl": 0.071533203125, + "learning_rate": 2.3749999999999998e-08, + "loss": 0.0007, + "reward": 3.945390462875366, + "reward_std": 0.012143698055297136, + "rewards/answer_entity_reward": 0.9965277910232544, + "rewards/answer_wer_reward": 0.9488627314567566, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 782 + }, + { + "completion_length": 220.6875, + "epoch": 2.5023999999999997, + "grad_norm": 2.2362775802612305, + "kl": 0.0634765625, + "learning_rate": 2.25e-08, + "loss": 0.0006, + "reward": 3.960192322731018, + "reward_std": 0.006831311853602529, + "rewards/answer_entity_reward": 0.9916666746139526, + "rewards/answer_wer_reward": 0.9691977500915527, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9993279576301575, + "step": 783 + }, + { + "completion_length": 235.5, + "epoch": 2.5056000000000003, + "grad_norm": 0.9817630052566528, + "kl": 0.05224609375, + "learning_rate": 2.1250000000000002e-08, + "loss": 0.0005, + "reward": 3.9702308177948, + "reward_std": 0.006825624033808708, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9809376895427704, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9892931282520294, + "step": 784 + }, + { + "completion_length": 203.78125, + "epoch": 2.5088, + "grad_norm": 2.859792947769165, + "kl": 0.053955078125, + "learning_rate": 2e-08, + "loss": 0.0005, + "reward": 3.9142426252365112, + "reward_std": 0.01792304962873459, + "rewards/answer_entity_reward": 0.9944852888584137, + "rewards/answer_wer_reward": 0.9792338609695435, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9405233263969421, + "step": 785 + }, + { + "completion_length": 224.28125, + "epoch": 2.512, + "grad_norm": 3.7338051795959473, + "kl": 0.060791015625, + "learning_rate": 1.875e-08, + "loss": 0.0006, + "reward": 3.948864221572876, + "reward_std": 0.01559874601662159, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9604960083961487, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9883682429790497, + "step": 786 + }, + { + "completion_length": 175.1875, + "epoch": 2.5152, + "grad_norm": 4.41845703125, + "kl": 0.083740234375, + "learning_rate": 1.75e-08, + "loss": 0.0008, + "reward": 3.949966311454773, + "reward_std": 0.01157908933237195, + "rewards/answer_entity_reward": 0.9914772808551788, + "rewards/answer_wer_reward": 0.9717868566513062, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9867021441459656, + "step": 787 + }, + { + "completion_length": 259.25, + "epoch": 2.5183999999999997, + "grad_norm": 0.9571487903594971, + "kl": 0.0584716796875, + "learning_rate": 1.625e-08, + "loss": 0.0006, + "reward": 3.853899836540222, + "reward_std": 0.1917457883246243, + "rewards/answer_entity_reward": 0.9654605388641357, + "rewards/answer_wer_reward": 0.9225141406059265, + "rewards/format_reward": 0.96875, + "rewards/think_ocr_reward": 0.9971751868724823, + "step": 788 + }, + { + "completion_length": 249.375, + "epoch": 2.5216, + "grad_norm": 2.86120867729187, + "kl": 0.1368408203125, + "learning_rate": 1.5e-08, + "loss": 0.0014, + "reward": 3.9423060417175293, + "reward_std": 0.01874951831996441, + "rewards/answer_entity_reward": 0.9955357313156128, + "rewards/answer_wer_reward": 0.9478386044502258, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9989316165447235, + "step": 789 + }, + { + "completion_length": 198.78125, + "epoch": 2.5248, + "grad_norm": 4.95521879196167, + "kl": 0.0611572265625, + "learning_rate": 1.3749999999999999e-08, + "loss": 0.0006, + "reward": 3.915849447250366, + "reward_std": 0.016107629984617233, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9824348092079163, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9334145784378052, + "step": 790 + }, + { + "completion_length": 184.1875, + "epoch": 2.528, + "grad_norm": 0.8447386622428894, + "kl": 0.0634765625, + "learning_rate": 1.25e-08, + "loss": 0.0006, + "reward": 3.929018259048462, + "reward_std": 0.009709671430755407, + "rewards/answer_entity_reward": 0.9886363744735718, + "rewards/answer_wer_reward": 0.940733015537262, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9996488690376282, + "step": 791 + }, + { + "completion_length": 185.59375, + "epoch": 2.5312, + "grad_norm": 2.6198718547821045, + "kl": 0.0439453125, + "learning_rate": 1.125e-08, + "loss": 0.0004, + "reward": 3.9582111835479736, + "reward_std": 0.007002702914178371, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9582110941410065, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 792 + }, + { + "completion_length": 197.8125, + "epoch": 2.5343999999999998, + "grad_norm": 1.3550831079483032, + "kl": 0.065185546875, + "learning_rate": 1e-08, + "loss": 0.0007, + "reward": 3.8907723426818848, + "reward_std": 0.005525397136807442, + "rewards/answer_entity_reward": 0.9903846085071564, + "rewards/answer_wer_reward": 0.9687470197677612, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.931640625, + "step": 793 + }, + { + "completion_length": 187.375, + "epoch": 2.5376, + "grad_norm": 1.0252914428710938, + "kl": 0.086181640625, + "learning_rate": 8.75e-09, + "loss": 0.0009, + "reward": 3.86617374420166, + "reward_std": 0.011230799835175276, + "rewards/answer_entity_reward": 0.9981617629528046, + "rewards/answer_wer_reward": 0.9432033002376556, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9248086512088776, + "step": 794 + }, + { + "completion_length": 220.84375, + "epoch": 2.5408, + "grad_norm": 3.189028739929199, + "kl": 0.05078125, + "learning_rate": 7.5e-09, + "loss": 0.0005, + "reward": 3.9672648906707764, + "reward_std": 0.006707400782033801, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.968046098947525, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.999218761920929, + "step": 795 + }, + { + "completion_length": 148.875, + "epoch": 2.544, + "grad_norm": 0.518578052520752, + "kl": 0.085693359375, + "learning_rate": 6.25e-09, + "loss": 0.0009, + "reward": 3.8482353687286377, + "reward_std": 0.0038536423817276955, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.8497678339481354, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9984675645828247, + "step": 796 + }, + { + "completion_length": 193.40625, + "epoch": 2.5472, + "grad_norm": 0.928065299987793, + "kl": 0.081298828125, + "learning_rate": 5e-09, + "loss": 0.0008, + "reward": 3.9669394493103027, + "reward_std": 0.013519858941435814, + "rewards/answer_entity_reward": 0.9937500059604645, + "rewards/answer_wer_reward": 0.9760889112949371, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9971005320549011, + "step": 797 + }, + { + "completion_length": 220.0625, + "epoch": 2.5504, + "grad_norm": 2.7394306659698486, + "kl": 0.050537109375, + "learning_rate": 3.75e-09, + "loss": 0.0005, + "reward": 3.972287654876709, + "reward_std": 0.0053059973288327456, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9722877740859985, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 798 + }, + { + "completion_length": 221.4375, + "epoch": 2.5536, + "grad_norm": 3.9942383766174316, + "kl": 0.0673828125, + "learning_rate": 2.5e-09, + "loss": 0.0007, + "reward": 3.9344537258148193, + "reward_std": 0.01906409254297614, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9436750113964081, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 0.9907786846160889, + "step": 799 + }, + { + "completion_length": 231.1875, + "epoch": 2.5568, + "grad_norm": 2.3216702938079834, + "kl": 0.0462646484375, + "learning_rate": 1.25e-09, + "loss": 0.0005, + "reward": 3.959131956100464, + "reward_std": 0.005453485995531082, + "rewards/answer_entity_reward": 1.0, + "rewards/answer_wer_reward": 0.9591320157051086, + "rewards/format_reward": 1.0, + "rewards/think_ocr_reward": 1.0, + "step": 800 + } + ], + "logging_steps": 1, + "max_steps": 800, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}