| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.5568, |
| "eval_steps": 500, |
| "global_step": 800, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "completion_length": 175.78125, |
| "epoch": 0.0032, |
| "grad_norm": 5.3713698387146, |
| "kl": 0.0, |
| "learning_rate": 1e-06, |
| "loss": 0.0, |
| "reward": 2.691648483276367, |
| "reward_std": 0.9842272102832794, |
| "rewards/answer_entity_reward": 0.8998827934265137, |
| "rewards/answer_wer_reward": 0.6144023239612579, |
| "rewards/format_reward": 0.65625, |
| "rewards/think_ocr_reward": 0.5211134254932404, |
| "step": 1 |
| }, |
| { |
| "completion_length": 205.1875, |
| "epoch": 0.0064, |
| "grad_norm": 12.984394073486328, |
| "kl": 0.000339508056640625, |
| "learning_rate": 9.9875e-07, |
| "loss": 0.0, |
| "reward": 2.8287014961242676, |
| "reward_std": 1.0050830841064453, |
| "rewards/answer_entity_reward": 0.7303222715854645, |
| "rewards/answer_wer_reward": 0.47497838735580444, |
| "rewards/format_reward": 0.875, |
| "rewards/think_ocr_reward": 0.7484009563922882, |
| "step": 2 |
| }, |
| { |
| "completion_length": 203.09375, |
| "epoch": 0.0096, |
| "grad_norm": 5.166553497314453, |
| "kl": 0.00044536590576171875, |
| "learning_rate": 9.975e-07, |
| "loss": 0.0, |
| "reward": 3.498788595199585, |
| "reward_std": 0.2545953020453453, |
| "rewards/answer_entity_reward": 0.9527146220207214, |
| "rewards/answer_wer_reward": 0.7393675744533539, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8067062795162201, |
| "step": 3 |
| }, |
| { |
| "completion_length": 206.1875, |
| "epoch": 0.0128, |
| "grad_norm": 2.356685161590576, |
| "kl": 0.0009002685546875, |
| "learning_rate": 9.9625e-07, |
| "loss": 0.0, |
| "reward": 3.299022078514099, |
| "reward_std": 0.5456227362155914, |
| "rewards/answer_entity_reward": 0.8519714176654816, |
| "rewards/answer_wer_reward": 0.6592651903629303, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8190353512763977, |
| "step": 4 |
| }, |
| { |
| "completion_length": 223.28125, |
| "epoch": 0.016, |
| "grad_norm": 3.5642409324645996, |
| "kl": 0.001827239990234375, |
| "learning_rate": 9.95e-07, |
| "loss": 0.0, |
| "reward": 2.8498330116271973, |
| "reward_std": 0.6001743674278259, |
| "rewards/answer_entity_reward": 0.8803278803825378, |
| "rewards/answer_wer_reward": 0.45287495851516724, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.5478802025318146, |
| "step": 5 |
| }, |
| { |
| "completion_length": 210.28125, |
| "epoch": 0.0192, |
| "grad_norm": 2.062991142272949, |
| "kl": 0.004608154296875, |
| "learning_rate": 9.9375e-07, |
| "loss": 0.0, |
| "reward": 3.345002055168152, |
| "reward_std": 0.5891430526971817, |
| "rewards/answer_entity_reward": 0.8334160447120667, |
| "rewards/answer_wer_reward": 0.7313504219055176, |
| "rewards/format_reward": 0.875, |
| "rewards/think_ocr_reward": 0.9052354693412781, |
| "step": 6 |
| }, |
| { |
| "completion_length": 204.9375, |
| "epoch": 0.0224, |
| "grad_norm": 2.77138090133667, |
| "kl": 0.01922607421875, |
| "learning_rate": 9.925e-07, |
| "loss": 0.0002, |
| "reward": 3.3531779050827026, |
| "reward_std": 0.7286678552627563, |
| "rewards/answer_entity_reward": 0.8474657833576202, |
| "rewards/answer_wer_reward": 0.7306987345218658, |
| "rewards/format_reward": 0.90625, |
| "rewards/think_ocr_reward": 0.8687634468078613, |
| "step": 7 |
| }, |
| { |
| "completion_length": 242.0, |
| "epoch": 0.0256, |
| "grad_norm": 1.9377678632736206, |
| "kl": 0.00897216796875, |
| "learning_rate": 9.912499999999998e-07, |
| "loss": 0.0001, |
| "reward": 3.538244366645813, |
| "reward_std": 0.26357416808605194, |
| "rewards/answer_entity_reward": 0.8956374526023865, |
| "rewards/answer_wer_reward": 0.795194149017334, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8786628246307373, |
| "step": 8 |
| }, |
| { |
| "completion_length": 181.28125, |
| "epoch": 0.0288, |
| "grad_norm": 2.9018149375915527, |
| "kl": 0.0250244140625, |
| "learning_rate": 9.9e-07, |
| "loss": 0.0002, |
| "reward": 3.6827263832092285, |
| "reward_std": 0.21120695769786835, |
| "rewards/answer_entity_reward": 0.9178647994995117, |
| "rewards/answer_wer_reward": 0.8329994082450867, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9318622648715973, |
| "step": 9 |
| }, |
| { |
| "completion_length": 211.1875, |
| "epoch": 0.032, |
| "grad_norm": 3.4354376792907715, |
| "kl": 0.02166748046875, |
| "learning_rate": 9.8875e-07, |
| "loss": 0.0002, |
| "reward": 3.6928374767303467, |
| "reward_std": 0.21010804921388626, |
| "rewards/answer_entity_reward": 0.8995116055011749, |
| "rewards/answer_wer_reward": 0.8549435138702393, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9383824467658997, |
| "step": 10 |
| }, |
| { |
| "completion_length": 165.40625, |
| "epoch": 0.0352, |
| "grad_norm": 5.1537652015686035, |
| "kl": 0.0521240234375, |
| "learning_rate": 9.875e-07, |
| "loss": 0.0005, |
| "reward": 3.500484824180603, |
| "reward_std": 0.5196337550878525, |
| "rewards/answer_entity_reward": 0.9380581974983215, |
| "rewards/answer_wer_reward": 0.7917109727859497, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.833215594291687, |
| "step": 11 |
| }, |
| { |
| "completion_length": 223.8125, |
| "epoch": 0.0384, |
| "grad_norm": 3.7026002407073975, |
| "kl": 0.02813720703125, |
| "learning_rate": 9.862499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.7366983890533447, |
| "reward_std": 0.19402557611465454, |
| "rewards/answer_entity_reward": 0.9315968751907349, |
| "rewards/answer_wer_reward": 0.836162269115448, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9689393639564514, |
| "step": 12 |
| }, |
| { |
| "completion_length": 201.34375, |
| "epoch": 0.0416, |
| "grad_norm": 4.624758243560791, |
| "kl": 0.0487060546875, |
| "learning_rate": 9.849999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.6485583782196045, |
| "reward_std": 0.19490989297628403, |
| "rewards/answer_entity_reward": 0.9538419842720032, |
| "rewards/answer_wer_reward": 0.8439803719520569, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8819859325885773, |
| "step": 13 |
| }, |
| { |
| "completion_length": 197.53125, |
| "epoch": 0.0448, |
| "grad_norm": 5.349609375, |
| "kl": 0.03363037109375, |
| "learning_rate": 9.8375e-07, |
| "loss": 0.0003, |
| "reward": 3.579698920249939, |
| "reward_std": 0.12941206991672516, |
| "rewards/answer_entity_reward": 0.9086007177829742, |
| "rewards/answer_wer_reward": 0.8474478721618652, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8236501812934875, |
| "step": 14 |
| }, |
| { |
| "completion_length": 180.5625, |
| "epoch": 0.048, |
| "grad_norm": 5.51423454284668, |
| "kl": 0.0633544921875, |
| "learning_rate": 9.825e-07, |
| "loss": 0.0006, |
| "reward": 3.6973917484283447, |
| "reward_std": 0.15208109095692635, |
| "rewards/answer_entity_reward": 0.9153402149677277, |
| "rewards/answer_wer_reward": 0.8323444426059723, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9497069418430328, |
| "step": 15 |
| }, |
| { |
| "completion_length": 205.03125, |
| "epoch": 0.0512, |
| "grad_norm": 3.2830357551574707, |
| "kl": 0.059326171875, |
| "learning_rate": 9.8125e-07, |
| "loss": 0.0006, |
| "reward": 3.477460026741028, |
| "reward_std": 0.43340209126472473, |
| "rewards/answer_entity_reward": 0.8780590891838074, |
| "rewards/answer_wer_reward": 0.7556597292423248, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9062411189079285, |
| "step": 16 |
| }, |
| { |
| "completion_length": 243.84375, |
| "epoch": 0.0544, |
| "grad_norm": 2.257538080215454, |
| "kl": 0.03240966796875, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0003, |
| "reward": 3.6340386867523193, |
| "reward_std": 0.15337160229682922, |
| "rewards/answer_entity_reward": 0.8995862305164337, |
| "rewards/answer_wer_reward": 0.7731227576732635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9613295793533325, |
| "step": 17 |
| }, |
| { |
| "completion_length": 236.125, |
| "epoch": 0.0576, |
| "grad_norm": 2.133462429046631, |
| "kl": 0.0579833984375, |
| "learning_rate": 9.7875e-07, |
| "loss": 0.0006, |
| "reward": 3.730382204055786, |
| "reward_std": 0.1639438048005104, |
| "rewards/answer_entity_reward": 0.9158936738967896, |
| "rewards/answer_wer_reward": 0.8535431623458862, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9609452486038208, |
| "step": 18 |
| }, |
| { |
| "completion_length": 253.84375, |
| "epoch": 0.0608, |
| "grad_norm": 2.6911232471466064, |
| "kl": 0.042236328125, |
| "learning_rate": 9.775e-07, |
| "loss": 0.0004, |
| "reward": 3.6918214559555054, |
| "reward_std": 0.24240515753626823, |
| "rewards/answer_entity_reward": 0.908495306968689, |
| "rewards/answer_wer_reward": 0.8162411749362946, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9983349442481995, |
| "step": 19 |
| }, |
| { |
| "completion_length": 195.3125, |
| "epoch": 0.064, |
| "grad_norm": 2.856860876083374, |
| "kl": 0.0548095703125, |
| "learning_rate": 9.7625e-07, |
| "loss": 0.0005, |
| "reward": 3.570927858352661, |
| "reward_std": 0.38515634275972843, |
| "rewards/answer_entity_reward": 0.885971337556839, |
| "rewards/answer_wer_reward": 0.7937527894973755, |
| "rewards/format_reward": 0.9375, |
| "rewards/think_ocr_reward": 0.9537037014961243, |
| "step": 20 |
| }, |
| { |
| "completion_length": 200.21875, |
| "epoch": 0.0672, |
| "grad_norm": 2.869398355484009, |
| "kl": 0.059814453125, |
| "learning_rate": 9.75e-07, |
| "loss": 0.0006, |
| "reward": 3.7599644660949707, |
| "reward_std": 0.13445724919438362, |
| "rewards/answer_entity_reward": 0.9744762480258942, |
| "rewards/answer_wer_reward": 0.8406906425952911, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9447975754737854, |
| "step": 21 |
| }, |
| { |
| "completion_length": 228.9375, |
| "epoch": 0.0704, |
| "grad_norm": 2.2584221363067627, |
| "kl": 0.03387451171875, |
| "learning_rate": 9.7375e-07, |
| "loss": 0.0003, |
| "reward": 3.5859320163726807, |
| "reward_std": 0.14986564964056015, |
| "rewards/answer_entity_reward": 0.9357894659042358, |
| "rewards/answer_wer_reward": 0.8099571466445923, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8401854038238525, |
| "step": 22 |
| }, |
| { |
| "completion_length": 219.78125, |
| "epoch": 0.0736, |
| "grad_norm": 2.140197277069092, |
| "kl": 0.0499267578125, |
| "learning_rate": 9.725e-07, |
| "loss": 0.0005, |
| "reward": 3.755205750465393, |
| "reward_std": 0.09474575892090797, |
| "rewards/answer_entity_reward": 0.9487689137458801, |
| "rewards/answer_wer_reward": 0.871625155210495, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9348115921020508, |
| "step": 23 |
| }, |
| { |
| "completion_length": 206.28125, |
| "epoch": 0.0768, |
| "grad_norm": 3.823035478591919, |
| "kl": 0.13916015625, |
| "learning_rate": 9.712499999999998e-07, |
| "loss": 0.0014, |
| "reward": 3.7580984830856323, |
| "reward_std": 0.07033384963870049, |
| "rewards/answer_entity_reward": 0.9635280966758728, |
| "rewards/answer_wer_reward": 0.8670244812965393, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9275458753108978, |
| "step": 24 |
| }, |
| { |
| "completion_length": 141.875, |
| "epoch": 0.08, |
| "grad_norm": 3.9088714122772217, |
| "kl": 0.10791015625, |
| "learning_rate": 9.7e-07, |
| "loss": 0.0011, |
| "reward": 3.7762891054153442, |
| "reward_std": 0.04259665124118328, |
| "rewards/answer_entity_reward": 0.9848519563674927, |
| "rewards/answer_wer_reward": 0.8006402850151062, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9907970130443573, |
| "step": 25 |
| }, |
| { |
| "completion_length": 205.21875, |
| "epoch": 0.0832, |
| "grad_norm": 2.103792905807495, |
| "kl": 0.065185546875, |
| "learning_rate": 9.6875e-07, |
| "loss": 0.0007, |
| "reward": 3.811550498008728, |
| "reward_std": 0.11633584462106228, |
| "rewards/answer_entity_reward": 0.9553370177745819, |
| "rewards/answer_wer_reward": 0.9040265679359436, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9834368824958801, |
| "step": 26 |
| }, |
| { |
| "completion_length": 228.78125, |
| "epoch": 0.0864, |
| "grad_norm": 2.7897403240203857, |
| "kl": 0.0435791015625, |
| "learning_rate": 9.675e-07, |
| "loss": 0.0004, |
| "reward": 3.788088798522949, |
| "reward_std": 0.10910476744174957, |
| "rewards/answer_entity_reward": 0.9546680450439453, |
| "rewards/answer_wer_reward": 0.872740238904953, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9606806039810181, |
| "step": 27 |
| }, |
| { |
| "completion_length": 210.5, |
| "epoch": 0.0896, |
| "grad_norm": 1.2101320028305054, |
| "kl": 0.0552978515625, |
| "learning_rate": 9.6625e-07, |
| "loss": 0.0006, |
| "reward": 3.8938169479370117, |
| "reward_std": 0.04485907219350338, |
| "rewards/answer_entity_reward": 0.974581778049469, |
| "rewards/answer_wer_reward": 0.9207929372787476, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984423518180847, |
| "step": 28 |
| }, |
| { |
| "completion_length": 233.78125, |
| "epoch": 0.0928, |
| "grad_norm": 2.7460684776306152, |
| "kl": 0.035400390625, |
| "learning_rate": 9.649999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.662728428840637, |
| "reward_std": 0.20339616388082504, |
| "rewards/answer_entity_reward": 0.8774791359901428, |
| "rewards/answer_wer_reward": 0.8000176846981049, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9852316677570343, |
| "step": 29 |
| }, |
| { |
| "completion_length": 199.59375, |
| "epoch": 0.096, |
| "grad_norm": 1.8316643238067627, |
| "kl": 0.0596923828125, |
| "learning_rate": 9.637499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.890167713165283, |
| "reward_std": 0.037449197843670845, |
| "rewards/answer_entity_reward": 0.96912881731987, |
| "rewards/answer_wer_reward": 0.9220606982707977, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989781975746155, |
| "step": 30 |
| }, |
| { |
| "completion_length": 226.125, |
| "epoch": 0.0992, |
| "grad_norm": 2.0417702198028564, |
| "kl": 0.0440673828125, |
| "learning_rate": 9.624999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8260613679885864, |
| "reward_std": 0.07994803786277771, |
| "rewards/answer_entity_reward": 0.9577426314353943, |
| "rewards/answer_wer_reward": 0.902205765247345, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9661130309104919, |
| "step": 31 |
| }, |
| { |
| "completion_length": 214.5, |
| "epoch": 0.1024, |
| "grad_norm": 4.027645111083984, |
| "kl": 0.1015625, |
| "learning_rate": 9.6125e-07, |
| "loss": 0.001, |
| "reward": 3.7394936084747314, |
| "reward_std": 0.10389792174100876, |
| "rewards/answer_entity_reward": 0.9218434691429138, |
| "rewards/answer_wer_reward": 0.8621510863304138, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9554989635944366, |
| "step": 32 |
| }, |
| { |
| "completion_length": 255.28125, |
| "epoch": 0.1056, |
| "grad_norm": 1.527213454246521, |
| "kl": 0.046875, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0005, |
| "reward": 3.8307132720947266, |
| "reward_std": 0.0552691500633955, |
| "rewards/answer_entity_reward": 0.9554121494293213, |
| "rewards/answer_wer_reward": 0.8765550553798676, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987460970878601, |
| "step": 33 |
| }, |
| { |
| "completion_length": 226.0, |
| "epoch": 0.1088, |
| "grad_norm": 1.822529673576355, |
| "kl": 0.0372314453125, |
| "learning_rate": 9.5875e-07, |
| "loss": 0.0004, |
| "reward": 3.8188695907592773, |
| "reward_std": 0.07392234448343515, |
| "rewards/answer_entity_reward": 0.9491736888885498, |
| "rewards/answer_wer_reward": 0.8781739175319672, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.991521954536438, |
| "step": 34 |
| }, |
| { |
| "completion_length": 230.71875, |
| "epoch": 0.112, |
| "grad_norm": 1.96689772605896, |
| "kl": 0.05322265625, |
| "learning_rate": 9.575e-07, |
| "loss": 0.0005, |
| "reward": 3.839812397956848, |
| "reward_std": 0.04108080454170704, |
| "rewards/answer_entity_reward": 0.9491481184959412, |
| "rewards/answer_wer_reward": 0.8918017745018005, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988624751567841, |
| "step": 35 |
| }, |
| { |
| "completion_length": 181.75, |
| "epoch": 0.1152, |
| "grad_norm": 25.535808563232422, |
| "kl": 0.100830078125, |
| "learning_rate": 9.5625e-07, |
| "loss": 0.001, |
| "reward": 3.8188287019729614, |
| "reward_std": 0.1601814702153206, |
| "rewards/answer_entity_reward": 0.9457894563674927, |
| "rewards/answer_wer_reward": 0.9093815982341766, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9636577069759369, |
| "step": 36 |
| }, |
| { |
| "completion_length": 165.375, |
| "epoch": 0.1184, |
| "grad_norm": 2.886183738708496, |
| "kl": 0.0692138671875, |
| "learning_rate": 9.55e-07, |
| "loss": 0.0007, |
| "reward": 3.8752315044403076, |
| "reward_std": 0.04815678671002388, |
| "rewards/answer_entity_reward": 0.994689553976059, |
| "rewards/answer_wer_reward": 0.9401271045207977, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9404149055480957, |
| "step": 37 |
| }, |
| { |
| "completion_length": 250.40625, |
| "epoch": 0.1216, |
| "grad_norm": 2.9052975177764893, |
| "kl": 0.0467529296875, |
| "learning_rate": 9.5375e-07, |
| "loss": 0.0005, |
| "reward": 3.8545405864715576, |
| "reward_std": 0.04892056295648217, |
| "rewards/answer_entity_reward": 0.9534467458724976, |
| "rewards/answer_wer_reward": 0.9035276472568512, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975661933422089, |
| "step": 38 |
| }, |
| { |
| "completion_length": 234.125, |
| "epoch": 0.1248, |
| "grad_norm": 1.5214505195617676, |
| "kl": 0.04010009765625, |
| "learning_rate": 9.525e-07, |
| "loss": 0.0004, |
| "reward": 3.7642624378204346, |
| "reward_std": 0.06860890984535217, |
| "rewards/answer_entity_reward": 0.9330369234085083, |
| "rewards/answer_wer_reward": 0.8348780572414398, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963473677635193, |
| "step": 39 |
| }, |
| { |
| "completion_length": 222.5, |
| "epoch": 0.128, |
| "grad_norm": 1.4751359224319458, |
| "kl": 0.0521240234375, |
| "learning_rate": 9.5125e-07, |
| "loss": 0.0005, |
| "reward": 3.8170441389083862, |
| "reward_std": 0.06563596054911613, |
| "rewards/answer_entity_reward": 0.9340721964836121, |
| "rewards/answer_wer_reward": 0.8999682068824768, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9830037355422974, |
| "step": 40 |
| }, |
| { |
| "completion_length": 201.84375, |
| "epoch": 0.1312, |
| "grad_norm": 20.2832088470459, |
| "kl": 0.038818359375, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.7008172273635864, |
| "reward_std": 0.039744822308421135, |
| "rewards/answer_entity_reward": 0.9294143319129944, |
| "rewards/answer_wer_reward": 0.890234112739563, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8811687231063843, |
| "step": 41 |
| }, |
| { |
| "completion_length": 192.09375, |
| "epoch": 0.1344, |
| "grad_norm": 3.430189609527588, |
| "kl": 0.0523681640625, |
| "learning_rate": 9.487499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8015908002853394, |
| "reward_std": 0.057819752022624016, |
| "rewards/answer_entity_reward": 0.9672390222549438, |
| "rewards/answer_wer_reward": 0.8474858105182648, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9868659377098083, |
| "step": 42 |
| }, |
| { |
| "completion_length": 215.53125, |
| "epoch": 0.1376, |
| "grad_norm": 16.041494369506836, |
| "kl": 0.0418701171875, |
| "learning_rate": 9.474999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.730579137802124, |
| "reward_std": 0.11731705069541931, |
| "rewards/answer_entity_reward": 0.9560448527336121, |
| "rewards/answer_wer_reward": 0.8699329495429993, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9046012759208679, |
| "step": 43 |
| }, |
| { |
| "completion_length": 236.78125, |
| "epoch": 0.1408, |
| "grad_norm": 1.6949574947357178, |
| "kl": 0.0352783203125, |
| "learning_rate": 9.462499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.899806261062622, |
| "reward_std": 0.018219145480543375, |
| "rewards/answer_entity_reward": 0.9738267660140991, |
| "rewards/answer_wer_reward": 0.9316939115524292, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9942856431007385, |
| "step": 44 |
| }, |
| { |
| "completion_length": 246.4375, |
| "epoch": 0.144, |
| "grad_norm": 1.3507007360458374, |
| "kl": 0.0330810546875, |
| "learning_rate": 9.45e-07, |
| "loss": 0.0003, |
| "reward": 3.8328453302383423, |
| "reward_std": 0.06314087565988302, |
| "rewards/answer_entity_reward": 0.9711392819881439, |
| "rewards/answer_wer_reward": 0.8670938909053802, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.994612067937851, |
| "step": 45 |
| }, |
| { |
| "completion_length": 170.28125, |
| "epoch": 0.1472, |
| "grad_norm": 2.2585864067077637, |
| "kl": 0.077392578125, |
| "learning_rate": 9.4375e-07, |
| "loss": 0.0008, |
| "reward": 3.902386784553528, |
| "reward_std": 0.035709235817193985, |
| "rewards/answer_entity_reward": 0.9873873591423035, |
| "rewards/answer_wer_reward": 0.9353838264942169, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9796155691146851, |
| "step": 46 |
| }, |
| { |
| "completion_length": 149.8125, |
| "epoch": 0.1504, |
| "grad_norm": 4.581851005554199, |
| "kl": 0.0452880859375, |
| "learning_rate": 9.425e-07, |
| "loss": 0.0005, |
| "reward": 3.6548960208892822, |
| "reward_std": 0.06261088512837887, |
| "rewards/answer_entity_reward": 0.9477430880069733, |
| "rewards/answer_wer_reward": 0.8129006922245026, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8942522406578064, |
| "step": 47 |
| }, |
| { |
| "completion_length": 216.75, |
| "epoch": 0.1536, |
| "grad_norm": 47.897464752197266, |
| "kl": 0.3621826171875, |
| "learning_rate": 9.4125e-07, |
| "loss": 0.0036, |
| "reward": 3.906231164932251, |
| "reward_std": 0.034966002218425274, |
| "rewards/answer_entity_reward": 0.9823353588581085, |
| "rewards/answer_wer_reward": 0.9293725490570068, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9945231974124908, |
| "step": 48 |
| }, |
| { |
| "completion_length": 196.9375, |
| "epoch": 0.1568, |
| "grad_norm": 2.257028579711914, |
| "kl": 0.0465087890625, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8652477264404297, |
| "reward_std": 0.03087481390684843, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9092975854873657, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.958791047334671, |
| "step": 49 |
| }, |
| { |
| "completion_length": 196.9375, |
| "epoch": 0.16, |
| "grad_norm": 4.950622081756592, |
| "kl": 0.0345458984375, |
| "learning_rate": 9.387499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.824746251106262, |
| "reward_std": 0.058218397200107574, |
| "rewards/answer_entity_reward": 0.9825757443904877, |
| "rewards/answer_wer_reward": 0.9601459503173828, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8820245563983917, |
| "step": 50 |
| }, |
| { |
| "completion_length": 174.8125, |
| "epoch": 0.1632, |
| "grad_norm": 7.211401462554932, |
| "kl": 0.0582275390625, |
| "learning_rate": 9.374999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8160147666931152, |
| "reward_std": 0.04299969598650932, |
| "rewards/answer_entity_reward": 0.9790209829807281, |
| "rewards/answer_wer_reward": 0.9350173771381378, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.901976466178894, |
| "step": 51 |
| }, |
| { |
| "completion_length": 248.125, |
| "epoch": 0.1664, |
| "grad_norm": 0.9922041893005371, |
| "kl": 0.0201416015625, |
| "learning_rate": 9.3625e-07, |
| "loss": 0.0002, |
| "reward": 3.8918874263763428, |
| "reward_std": 0.029974642675369978, |
| "rewards/answer_entity_reward": 0.9869123697280884, |
| "rewards/answer_wer_reward": 0.9067046940326691, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982704818248749, |
| "step": 52 |
| }, |
| { |
| "completion_length": 251.59375, |
| "epoch": 0.1696, |
| "grad_norm": 0.9144994020462036, |
| "kl": 0.02001953125, |
| "learning_rate": 9.35e-07, |
| "loss": 0.0002, |
| "reward": 3.782878875732422, |
| "reward_std": 0.04338405467569828, |
| "rewards/answer_entity_reward": 0.9685876965522766, |
| "rewards/answer_wer_reward": 0.8232664167881012, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9910247623920441, |
| "step": 53 |
| }, |
| { |
| "completion_length": 224.5625, |
| "epoch": 0.1728, |
| "grad_norm": 0.8014624118804932, |
| "kl": 0.01904296875, |
| "learning_rate": 9.3375e-07, |
| "loss": 0.0002, |
| "reward": 3.804163098335266, |
| "reward_std": 0.02029208466410637, |
| "rewards/answer_entity_reward": 0.9539299309253693, |
| "rewards/answer_wer_reward": 0.8539278209209442, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9963052868843079, |
| "step": 54 |
| }, |
| { |
| "completion_length": 174.59375, |
| "epoch": 0.176, |
| "grad_norm": 2.5315935611724854, |
| "kl": 0.02862548828125, |
| "learning_rate": 9.325e-07, |
| "loss": 0.0003, |
| "reward": 3.8737215995788574, |
| "reward_std": 0.06625958904623985, |
| "rewards/answer_entity_reward": 0.9887503385543823, |
| "rewards/answer_wer_reward": 0.9215180277824402, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9634531438350677, |
| "step": 55 |
| }, |
| { |
| "completion_length": 239.4375, |
| "epoch": 0.1792, |
| "grad_norm": 1.3654975891113281, |
| "kl": 0.0283203125, |
| "learning_rate": 9.3125e-07, |
| "loss": 0.0003, |
| "reward": 3.8753963708877563, |
| "reward_std": 0.04764867387712002, |
| "rewards/answer_entity_reward": 0.9810132682323456, |
| "rewards/answer_wer_reward": 0.8943831324577332, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 56 |
| }, |
| { |
| "completion_length": 214.75, |
| "epoch": 0.1824, |
| "grad_norm": 1.4159584045410156, |
| "kl": 0.02081298828125, |
| "learning_rate": 9.3e-07, |
| "loss": 0.0002, |
| "reward": 3.8986427783966064, |
| "reward_std": 0.031265249475836754, |
| "rewards/answer_entity_reward": 0.9880681931972504, |
| "rewards/answer_wer_reward": 0.9130412340164185, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99753338098526, |
| "step": 57 |
| }, |
| { |
| "completion_length": 240.46875, |
| "epoch": 0.1856, |
| "grad_norm": 1.1824144124984741, |
| "kl": 0.015960693359375, |
| "learning_rate": 9.287499999999999e-07, |
| "loss": 0.0002, |
| "reward": 3.90795361995697, |
| "reward_std": 0.02096135076135397, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9095984101295471, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 58 |
| }, |
| { |
| "completion_length": 238.09375, |
| "epoch": 0.1888, |
| "grad_norm": 1.165099024772644, |
| "kl": 0.026123046875, |
| "learning_rate": 9.274999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9033310413360596, |
| "reward_std": 0.03423699922859669, |
| "rewards/answer_entity_reward": 0.9810605943202972, |
| "rewards/answer_wer_reward": 0.9234386384487152, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988317787647247, |
| "step": 59 |
| }, |
| { |
| "completion_length": 221.84375, |
| "epoch": 0.192, |
| "grad_norm": 2.964642286300659, |
| "kl": 0.02587890625, |
| "learning_rate": 9.2625e-07, |
| "loss": 0.0003, |
| "reward": 3.9065024852752686, |
| "reward_std": 0.022342820651829243, |
| "rewards/answer_entity_reward": 0.978426069021225, |
| "rewards/answer_wer_reward": 0.9289742708206177, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991020858287811, |
| "step": 60 |
| }, |
| { |
| "completion_length": 211.1875, |
| "epoch": 0.1952, |
| "grad_norm": 2.225137233734131, |
| "kl": 0.0374755859375, |
| "learning_rate": 9.25e-07, |
| "loss": 0.0004, |
| "reward": 3.6701877117156982, |
| "reward_std": 0.03641202859580517, |
| "rewards/answer_entity_reward": 0.9796620309352875, |
| "rewards/answer_wer_reward": 0.7723922729492188, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9181334376335144, |
| "step": 61 |
| }, |
| { |
| "completion_length": 150.0, |
| "epoch": 0.1984, |
| "grad_norm": 4.289616584777832, |
| "kl": 0.062744140625, |
| "learning_rate": 9.237499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.769058585166931, |
| "reward_std": 0.060237159952521324, |
| "rewards/answer_entity_reward": 0.842234879732132, |
| "rewards/answer_wer_reward": 0.9324747323989868, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9943490326404572, |
| "step": 62 |
| }, |
| { |
| "completion_length": 172.59375, |
| "epoch": 0.2016, |
| "grad_norm": 0.9226670861244202, |
| "kl": 0.04541015625, |
| "learning_rate": 9.225e-07, |
| "loss": 0.0005, |
| "reward": 3.9475854635238647, |
| "reward_std": 0.009972278494387865, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9488748908042908, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987106323242188, |
| "step": 63 |
| }, |
| { |
| "completion_length": 186.28125, |
| "epoch": 0.2048, |
| "grad_norm": 2.8787524700164795, |
| "kl": 0.02923583984375, |
| "learning_rate": 9.2125e-07, |
| "loss": 0.0003, |
| "reward": 3.8407578468322754, |
| "reward_std": 0.04633911233395338, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9414158165454865, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8993421196937561, |
| "step": 64 |
| }, |
| { |
| "completion_length": 235.21875, |
| "epoch": 0.208, |
| "grad_norm": 3.289802074432373, |
| "kl": 0.02203369140625, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0002, |
| "reward": 3.8516111373901367, |
| "reward_std": 0.05013709142804146, |
| "rewards/answer_entity_reward": 0.9782106876373291, |
| "rewards/answer_wer_reward": 0.8967941999435425, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9766062498092651, |
| "step": 65 |
| }, |
| { |
| "completion_length": 182.9375, |
| "epoch": 0.2112, |
| "grad_norm": 15.17410659790039, |
| "kl": 0.079833984375, |
| "learning_rate": 9.187499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.7952799797058105, |
| "reward_std": 0.08191402442753315, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9461319446563721, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8543927371501923, |
| "step": 66 |
| }, |
| { |
| "completion_length": 195.3125, |
| "epoch": 0.2144, |
| "grad_norm": 1.6663379669189453, |
| "kl": 0.0638427734375, |
| "learning_rate": 9.174999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.916337490081787, |
| "reward_std": 0.018936872947961092, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9398471117019653, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.980977475643158, |
| "step": 67 |
| }, |
| { |
| "completion_length": 211.84375, |
| "epoch": 0.2176, |
| "grad_norm": 2.6255111694335938, |
| "kl": 0.05126953125, |
| "learning_rate": 9.1625e-07, |
| "loss": 0.0005, |
| "reward": 3.9224915504455566, |
| "reward_std": 0.01644316827878356, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9280897378921509, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997242659330368, |
| "step": 68 |
| }, |
| { |
| "completion_length": 170.65625, |
| "epoch": 0.2208, |
| "grad_norm": 3.3114447593688965, |
| "kl": 0.0849609375, |
| "learning_rate": 9.15e-07, |
| "loss": 0.0009, |
| "reward": 3.801788806915283, |
| "reward_std": 0.07587217539548874, |
| "rewards/answer_entity_reward": 0.9663097262382507, |
| "rewards/answer_wer_reward": 0.9007239937782288, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9347550868988037, |
| "step": 69 |
| }, |
| { |
| "completion_length": 194.0, |
| "epoch": 0.224, |
| "grad_norm": 0.908227264881134, |
| "kl": 0.0428466796875, |
| "learning_rate": 9.137499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.908014178276062, |
| "reward_std": 0.015611772891134024, |
| "rewards/answer_entity_reward": 0.9866071343421936, |
| "rewards/answer_wer_reward": 0.9214071035385132, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 70 |
| }, |
| { |
| "completion_length": 235.15625, |
| "epoch": 0.2272, |
| "grad_norm": 6.288023471832275, |
| "kl": 0.0377197265625, |
| "learning_rate": 9.124999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8232322931289673, |
| "reward_std": 0.019494441337883472, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9413564205169678, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8853481709957123, |
| "step": 71 |
| }, |
| { |
| "completion_length": 202.84375, |
| "epoch": 0.2304, |
| "grad_norm": 3.666252374649048, |
| "kl": 0.02703857421875, |
| "learning_rate": 9.1125e-07, |
| "loss": 0.0003, |
| "reward": 3.8724911212921143, |
| "reward_std": 0.036418632604181767, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9379763305187225, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.934514731168747, |
| "step": 72 |
| }, |
| { |
| "completion_length": 192.59375, |
| "epoch": 0.2336, |
| "grad_norm": 2.5703845024108887, |
| "kl": 0.04815673828125, |
| "learning_rate": 9.1e-07, |
| "loss": 0.0005, |
| "reward": 3.819400668144226, |
| "reward_std": 0.09702013805508614, |
| "rewards/answer_entity_reward": 0.9749708473682404, |
| "rewards/answer_wer_reward": 0.8958881497383118, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9485417604446411, |
| "step": 73 |
| }, |
| { |
| "completion_length": 233.96875, |
| "epoch": 0.2368, |
| "grad_norm": 5.079833030700684, |
| "kl": 0.03594970703125, |
| "learning_rate": 9.087499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.87298047542572, |
| "reward_std": 0.04117333237081766, |
| "rewards/answer_entity_reward": 0.979208379983902, |
| "rewards/answer_wer_reward": 0.8985798060894012, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.995192289352417, |
| "step": 74 |
| }, |
| { |
| "completion_length": 232.09375, |
| "epoch": 0.24, |
| "grad_norm": 1.3709529638290405, |
| "kl": 0.0469970703125, |
| "learning_rate": 9.074999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8842471837997437, |
| "reward_std": 0.02406489010900259, |
| "rewards/answer_entity_reward": 0.976262629032135, |
| "rewards/answer_wer_reward": 0.9083134233951569, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996710419654846, |
| "step": 75 |
| }, |
| { |
| "completion_length": 134.46875, |
| "epoch": 0.2432, |
| "grad_norm": 1.7917073965072632, |
| "kl": 0.04345703125, |
| "learning_rate": 9.0625e-07, |
| "loss": 0.0004, |
| "reward": 3.9434739351272583, |
| "reward_std": 0.03165043890476227, |
| "rewards/answer_entity_reward": 0.9853896200656891, |
| "rewards/answer_wer_reward": 0.960752934217453, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973313212394714, |
| "step": 76 |
| }, |
| { |
| "completion_length": 260.75, |
| "epoch": 0.2464, |
| "grad_norm": 2.487206220626831, |
| "kl": 0.02789306640625, |
| "learning_rate": 9.05e-07, |
| "loss": 0.0003, |
| "reward": 3.8149930238723755, |
| "reward_std": 0.04638839513063431, |
| "rewards/answer_entity_reward": 0.9494674503803253, |
| "rewards/answer_wer_reward": 0.8663396835327148, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991858303546906, |
| "step": 77 |
| }, |
| { |
| "completion_length": 221.3125, |
| "epoch": 0.2496, |
| "grad_norm": 1.8767852783203125, |
| "kl": 0.017425537109375, |
| "learning_rate": 9.0375e-07, |
| "loss": 0.0002, |
| "reward": 3.8600170612335205, |
| "reward_std": 0.04895954905077815, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.8933806419372559, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9805253744125366, |
| "step": 78 |
| }, |
| { |
| "completion_length": 230.71875, |
| "epoch": 0.2528, |
| "grad_norm": 3.712688684463501, |
| "kl": 0.054931640625, |
| "learning_rate": 9.024999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8847248554229736, |
| "reward_std": 0.012873267754912376, |
| "rewards/answer_entity_reward": 0.9855768978595734, |
| "rewards/answer_wer_reward": 0.9019420742988586, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9972057938575745, |
| "step": 79 |
| }, |
| { |
| "completion_length": 199.3125, |
| "epoch": 0.256, |
| "grad_norm": 1.9246958494186401, |
| "kl": 0.054931640625, |
| "learning_rate": 9.0125e-07, |
| "loss": 0.0005, |
| "reward": 3.8006842136383057, |
| "reward_std": 0.052133604884147644, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9017785787582397, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9033928513526917, |
| "step": 80 |
| }, |
| { |
| "completion_length": 250.21875, |
| "epoch": 0.2592, |
| "grad_norm": 1.160876989364624, |
| "kl": 0.0220947265625, |
| "learning_rate": 9e-07, |
| "loss": 0.0002, |
| "reward": 3.8708144426345825, |
| "reward_std": 0.030466954689472914, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.8790038824081421, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980604648590088, |
| "step": 81 |
| }, |
| { |
| "completion_length": 237.125, |
| "epoch": 0.2624, |
| "grad_norm": 5.024093151092529, |
| "kl": 0.0382080078125, |
| "learning_rate": 8.9875e-07, |
| "loss": 0.0004, |
| "reward": 3.9048351049423218, |
| "reward_std": 0.03107828088104725, |
| "rewards/answer_entity_reward": 0.9851398468017578, |
| "rewards/answer_wer_reward": 0.9344828426837921, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9852123558521271, |
| "step": 82 |
| }, |
| { |
| "completion_length": 222.5, |
| "epoch": 0.2656, |
| "grad_norm": 1.6519030332565308, |
| "kl": 0.0380859375, |
| "learning_rate": 8.974999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.863801956176758, |
| "reward_std": 0.030243747401982546, |
| "rewards/answer_entity_reward": 0.9727078676223755, |
| "rewards/answer_wer_reward": 0.9002127051353455, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9908813536167145, |
| "step": 83 |
| }, |
| { |
| "completion_length": 225.53125, |
| "epoch": 0.2688, |
| "grad_norm": 1.4793689250946045, |
| "kl": 0.0517578125, |
| "learning_rate": 8.9625e-07, |
| "loss": 0.0005, |
| "reward": 3.8814769983291626, |
| "reward_std": 0.029270809143781662, |
| "rewards/answer_entity_reward": 0.9880681931972504, |
| "rewards/answer_wer_reward": 0.8934087753295898, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 84 |
| }, |
| { |
| "completion_length": 235.9375, |
| "epoch": 0.272, |
| "grad_norm": 1.597517728805542, |
| "kl": 0.1016845703125, |
| "learning_rate": 8.95e-07, |
| "loss": 0.001, |
| "reward": 3.8768863677978516, |
| "reward_std": 0.03502520266920328, |
| "rewards/answer_entity_reward": 0.9798878133296967, |
| "rewards/answer_wer_reward": 0.8985857367515564, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984128177165985, |
| "step": 85 |
| }, |
| { |
| "completion_length": 214.4375, |
| "epoch": 0.2752, |
| "grad_norm": 4.483051300048828, |
| "kl": 0.04150390625, |
| "learning_rate": 8.9375e-07, |
| "loss": 0.0004, |
| "reward": 3.903320074081421, |
| "reward_std": 0.019831405603326857, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9384645223617554, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9648554623126984, |
| "step": 86 |
| }, |
| { |
| "completion_length": 217.3125, |
| "epoch": 0.2784, |
| "grad_norm": 2.5979843139648438, |
| "kl": 0.0279541015625, |
| "learning_rate": 8.924999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.8643628358840942, |
| "reward_std": 0.07706086616963148, |
| "rewards/answer_entity_reward": 0.9751845002174377, |
| "rewards/answer_wer_reward": 0.9189748764038086, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9702034890651703, |
| "step": 87 |
| }, |
| { |
| "completion_length": 209.0625, |
| "epoch": 0.2816, |
| "grad_norm": 2.134483575820923, |
| "kl": 0.0654296875, |
| "learning_rate": 8.912499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.829586148262024, |
| "reward_std": 0.11678730137646198, |
| "rewards/answer_entity_reward": 0.9327990114688873, |
| "rewards/answer_wer_reward": 0.9185277223587036, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9782594740390778, |
| "step": 88 |
| }, |
| { |
| "completion_length": 202.5625, |
| "epoch": 0.2848, |
| "grad_norm": 2.750098943710327, |
| "kl": 0.0386962890625, |
| "learning_rate": 8.9e-07, |
| "loss": 0.0004, |
| "reward": 3.813106060028076, |
| "reward_std": 0.013170521473512053, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8155100047588348, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975961446762085, |
| "step": 89 |
| }, |
| { |
| "completion_length": 208.21875, |
| "epoch": 0.288, |
| "grad_norm": 1.0419001579284668, |
| "kl": 0.02874755859375, |
| "learning_rate": 8.8875e-07, |
| "loss": 0.0003, |
| "reward": 3.7984471321105957, |
| "reward_std": 0.046625567600131035, |
| "rewards/answer_entity_reward": 0.9813492298126221, |
| "rewards/answer_wer_reward": 0.908283531665802, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9088144898414612, |
| "step": 90 |
| }, |
| { |
| "completion_length": 240.875, |
| "epoch": 0.2912, |
| "grad_norm": 1.406315565109253, |
| "kl": 0.0322265625, |
| "learning_rate": 8.874999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.917527914047241, |
| "reward_std": 0.018682857509702444, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.919611245393753, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 91 |
| }, |
| { |
| "completion_length": 248.28125, |
| "epoch": 0.2944, |
| "grad_norm": 0.9986963868141174, |
| "kl": 0.034912109375, |
| "learning_rate": 8.8625e-07, |
| "loss": 0.0003, |
| "reward": 3.8824074268341064, |
| "reward_std": 0.027639332227408886, |
| "rewards/answer_entity_reward": 0.9829497039318085, |
| "rewards/answer_wer_reward": 0.8998689651489258, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999588817358017, |
| "step": 92 |
| }, |
| { |
| "completion_length": 166.84375, |
| "epoch": 0.2976, |
| "grad_norm": 1.9086061716079712, |
| "kl": 0.03448486328125, |
| "learning_rate": 8.85e-07, |
| "loss": 0.0003, |
| "reward": 3.9501060247421265, |
| "reward_std": 0.012802016455680132, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9628694355487823, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9872365295886993, |
| "step": 93 |
| }, |
| { |
| "completion_length": 256.1875, |
| "epoch": 0.3008, |
| "grad_norm": 3.4043421745300293, |
| "kl": 0.049072265625, |
| "learning_rate": 8.8375e-07, |
| "loss": 0.0005, |
| "reward": 3.814915657043457, |
| "reward_std": 0.03222915716469288, |
| "rewards/answer_entity_reward": 0.9890734255313873, |
| "rewards/answer_wer_reward": 0.8261894881725311, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999652773141861, |
| "step": 94 |
| }, |
| { |
| "completion_length": 253.4375, |
| "epoch": 0.304, |
| "grad_norm": 0.9184324741363525, |
| "kl": 0.03564453125, |
| "learning_rate": 8.824999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8896020650863647, |
| "reward_std": 0.02269437536597252, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.8971993029117584, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9952436983585358, |
| "step": 95 |
| }, |
| { |
| "completion_length": 202.15625, |
| "epoch": 0.3072, |
| "grad_norm": 12.922323226928711, |
| "kl": 0.05908203125, |
| "learning_rate": 8.812499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9009629487991333, |
| "reward_std": 0.0202713580802083, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9189554452896118, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9820075929164886, |
| "step": 96 |
| }, |
| { |
| "completion_length": 224.53125, |
| "epoch": 0.3104, |
| "grad_norm": 4.217601299285889, |
| "kl": 0.0465087890625, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.8913207054138184, |
| "reward_std": 0.014381649438291788, |
| "rewards/answer_entity_reward": 0.9821428656578064, |
| "rewards/answer_wer_reward": 0.9095685184001923, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996093809604645, |
| "step": 97 |
| }, |
| { |
| "completion_length": 206.40625, |
| "epoch": 0.3136, |
| "grad_norm": 2.168041706085205, |
| "kl": 0.0323486328125, |
| "learning_rate": 8.7875e-07, |
| "loss": 0.0003, |
| "reward": 3.8137295246124268, |
| "reward_std": 0.06389336660504341, |
| "rewards/answer_entity_reward": 0.9776169061660767, |
| "rewards/answer_wer_reward": 0.8989610075950623, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9371516108512878, |
| "step": 98 |
| }, |
| { |
| "completion_length": 209.0625, |
| "epoch": 0.3168, |
| "grad_norm": 1.6052436828613281, |
| "kl": 0.0345458984375, |
| "learning_rate": 8.774999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.828700304031372, |
| "reward_std": 0.019330056384205818, |
| "rewards/answer_entity_reward": 0.9850388169288635, |
| "rewards/answer_wer_reward": 0.846589595079422, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970719814300537, |
| "step": 99 |
| }, |
| { |
| "completion_length": 210.6875, |
| "epoch": 0.32, |
| "grad_norm": 0.9548845887184143, |
| "kl": 0.0341796875, |
| "learning_rate": 8.7625e-07, |
| "loss": 0.0003, |
| "reward": 3.9469358921051025, |
| "reward_std": 0.021241382230073214, |
| "rewards/answer_entity_reward": 0.9851641654968262, |
| "rewards/answer_wer_reward": 0.961771547794342, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 100 |
| }, |
| { |
| "completion_length": 214.28125, |
| "epoch": 0.3232, |
| "grad_norm": 2.8610620498657227, |
| "kl": 0.052734375, |
| "learning_rate": 8.75e-07, |
| "loss": 0.0005, |
| "reward": 3.806527853012085, |
| "reward_std": 0.04471902176737785, |
| "rewards/answer_entity_reward": 0.9853896200656891, |
| "rewards/answer_wer_reward": 0.8547504544258118, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.966387927532196, |
| "step": 101 |
| }, |
| { |
| "completion_length": 223.4375, |
| "epoch": 0.3264, |
| "grad_norm": 0.7780336141586304, |
| "kl": 0.034912109375, |
| "learning_rate": 8.7375e-07, |
| "loss": 0.0003, |
| "reward": 3.880792260169983, |
| "reward_std": 0.022754055447876453, |
| "rewards/answer_entity_reward": 0.989393949508667, |
| "rewards/answer_wer_reward": 0.8913983702659607, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 102 |
| }, |
| { |
| "completion_length": 231.0625, |
| "epoch": 0.3296, |
| "grad_norm": 1.3763070106506348, |
| "kl": 0.024444580078125, |
| "learning_rate": 8.725e-07, |
| "loss": 0.0003, |
| "reward": 3.929618239402771, |
| "reward_std": 0.012849014718085527, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9325020015239716, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987609684467316, |
| "step": 103 |
| }, |
| { |
| "completion_length": 268.96875, |
| "epoch": 0.3328, |
| "grad_norm": 1.7985624074935913, |
| "kl": 0.0289306640625, |
| "learning_rate": 8.712499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.888875961303711, |
| "reward_std": 0.027541114017367363, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.8925231993198395, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999193549156189, |
| "step": 104 |
| }, |
| { |
| "completion_length": 254.1875, |
| "epoch": 0.336, |
| "grad_norm": 18.920978546142578, |
| "kl": 0.027099609375, |
| "learning_rate": 8.699999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.860435366630554, |
| "reward_std": 0.030950906220823526, |
| "rewards/answer_entity_reward": 0.9734883308410645, |
| "rewards/answer_wer_reward": 0.8872724771499634, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996744692325592, |
| "step": 105 |
| }, |
| { |
| "completion_length": 163.53125, |
| "epoch": 0.3392, |
| "grad_norm": 2.867141008377075, |
| "kl": 0.03399658203125, |
| "learning_rate": 8.687499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9226391315460205, |
| "reward_std": 0.023416020907461643, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9473121762275696, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9866906106472015, |
| "step": 106 |
| }, |
| { |
| "completion_length": 230.5625, |
| "epoch": 0.3424, |
| "grad_norm": 1.7444649934768677, |
| "kl": 0.03302001953125, |
| "learning_rate": 8.675000000000001e-07, |
| "loss": 0.0003, |
| "reward": 3.9037901163101196, |
| "reward_std": 0.013123108074069023, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9062368869781494, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996366202831268, |
| "step": 107 |
| }, |
| { |
| "completion_length": 196.96875, |
| "epoch": 0.3456, |
| "grad_norm": 1.4596710205078125, |
| "kl": 0.0565185546875, |
| "learning_rate": 8.6625e-07, |
| "loss": 0.0006, |
| "reward": 3.927306890487671, |
| "reward_std": 0.017726238816976547, |
| "rewards/answer_entity_reward": 0.9847221970558167, |
| "rewards/answer_wer_reward": 0.9435714483261108, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990131855010986, |
| "step": 108 |
| }, |
| { |
| "completion_length": 204.03125, |
| "epoch": 0.3488, |
| "grad_norm": 21.111600875854492, |
| "kl": 0.259765625, |
| "learning_rate": 8.65e-07, |
| "loss": 0.0026, |
| "reward": 3.878751039505005, |
| "reward_std": 0.09589649271219969, |
| "rewards/answer_entity_reward": 0.9957579076290131, |
| "rewards/answer_wer_reward": 0.9333003461360931, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9496928453445435, |
| "step": 109 |
| }, |
| { |
| "completion_length": 213.25, |
| "epoch": 0.352, |
| "grad_norm": 5.349282264709473, |
| "kl": 0.0455322265625, |
| "learning_rate": 8.6375e-07, |
| "loss": 0.0005, |
| "reward": 3.862163782119751, |
| "reward_std": 0.031207362189888954, |
| "rewards/answer_entity_reward": 0.9892857372760773, |
| "rewards/answer_wer_reward": 0.9074709117412567, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9654072225093842, |
| "step": 110 |
| }, |
| { |
| "completion_length": 220.3125, |
| "epoch": 0.3552, |
| "grad_norm": 3.316596746444702, |
| "kl": 0.03369140625, |
| "learning_rate": 8.625e-07, |
| "loss": 0.0003, |
| "reward": 3.8875255584716797, |
| "reward_std": 0.03998068626970053, |
| "rewards/answer_entity_reward": 0.9902909696102142, |
| "rewards/answer_wer_reward": 0.9038136303424835, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9934210479259491, |
| "step": 111 |
| }, |
| { |
| "completion_length": 250.78125, |
| "epoch": 0.3584, |
| "grad_norm": 2.525360107421875, |
| "kl": 0.03515625, |
| "learning_rate": 8.612499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.8880720138549805, |
| "reward_std": 0.025330569595098495, |
| "rewards/answer_entity_reward": 0.9918486475944519, |
| "rewards/answer_wer_reward": 0.8981437385082245, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980796277523041, |
| "step": 112 |
| }, |
| { |
| "completion_length": 220.09375, |
| "epoch": 0.3616, |
| "grad_norm": 5.7261433601379395, |
| "kl": 0.038330078125, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.873054027557373, |
| "reward_std": 0.018459735438227654, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.8848404586315155, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978289306163788, |
| "step": 113 |
| }, |
| { |
| "completion_length": 233.625, |
| "epoch": 0.3648, |
| "grad_norm": 2.1468665599823, |
| "kl": 0.0286865234375, |
| "learning_rate": 8.587499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9267923831939697, |
| "reward_std": 0.026638164184987545, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.9341540634632111, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989514350891113, |
| "step": 114 |
| }, |
| { |
| "completion_length": 237.375, |
| "epoch": 0.368, |
| "grad_norm": 14.322599411010742, |
| "kl": 0.04052734375, |
| "learning_rate": 8.575e-07, |
| "loss": 0.0004, |
| "reward": 3.9121710062026978, |
| "reward_std": 0.02902364358305931, |
| "rewards/answer_entity_reward": 0.9908459782600403, |
| "rewards/answer_wer_reward": 0.922933429479599, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983916878700256, |
| "step": 115 |
| }, |
| { |
| "completion_length": 240.15625, |
| "epoch": 0.3712, |
| "grad_norm": 2.0209200382232666, |
| "kl": 0.06103515625, |
| "learning_rate": 8.5625e-07, |
| "loss": 0.0006, |
| "reward": 3.888006567955017, |
| "reward_std": 0.023146681487560272, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.895849883556366, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.994560569524765, |
| "step": 116 |
| }, |
| { |
| "completion_length": 222.4375, |
| "epoch": 0.3744, |
| "grad_norm": 2.933910608291626, |
| "kl": 0.0419921875, |
| "learning_rate": 8.55e-07, |
| "loss": 0.0004, |
| "reward": 3.8359127044677734, |
| "reward_std": 0.058022117242217064, |
| "rewards/answer_entity_reward": 0.9440500438213348, |
| "rewards/answer_wer_reward": 0.894202709197998, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9976600110530853, |
| "step": 117 |
| }, |
| { |
| "completion_length": 214.5625, |
| "epoch": 0.3776, |
| "grad_norm": 7.493628025054932, |
| "kl": 0.064453125, |
| "learning_rate": 8.5375e-07, |
| "loss": 0.0006, |
| "reward": 3.799570918083191, |
| "reward_std": 0.06657508388161659, |
| "rewards/answer_entity_reward": 0.9727430641651154, |
| "rewards/answer_wer_reward": 0.871229887008667, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9555979073047638, |
| "step": 118 |
| }, |
| { |
| "completion_length": 212.625, |
| "epoch": 0.3808, |
| "grad_norm": 2.1899421215057373, |
| "kl": 0.0570068359375, |
| "learning_rate": 8.525e-07, |
| "loss": 0.0006, |
| "reward": 3.9054840803146362, |
| "reward_std": 0.027329989708960056, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9344967901706696, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9795099198818207, |
| "step": 119 |
| }, |
| { |
| "completion_length": 249.8125, |
| "epoch": 0.384, |
| "grad_norm": 2.4804491996765137, |
| "kl": 0.035888671875, |
| "learning_rate": 8.512499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8948739767074585, |
| "reward_std": 0.028746116440743208, |
| "rewards/answer_entity_reward": 0.9953208565711975, |
| "rewards/answer_wer_reward": 0.9002179205417633, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993351101875305, |
| "step": 120 |
| }, |
| { |
| "completion_length": 185.34375, |
| "epoch": 0.3872, |
| "grad_norm": 2.305140256881714, |
| "kl": 0.102783203125, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.9010980129241943, |
| "reward_std": 0.021339962724596262, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9222235083580017, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9788744449615479, |
| "step": 121 |
| }, |
| { |
| "completion_length": 204.65625, |
| "epoch": 0.3904, |
| "grad_norm": 1.5420470237731934, |
| "kl": 0.0313720703125, |
| "learning_rate": 8.487499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.927214741706848, |
| "reward_std": 0.019817203283309937, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.92842698097229, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987878203392029, |
| "step": 122 |
| }, |
| { |
| "completion_length": 216.90625, |
| "epoch": 0.3936, |
| "grad_norm": 8.852909088134766, |
| "kl": 0.0716552734375, |
| "learning_rate": 8.475e-07, |
| "loss": 0.0007, |
| "reward": 3.811018466949463, |
| "reward_std": 0.010543343145400286, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.938366711139679, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8726518452167511, |
| "step": 123 |
| }, |
| { |
| "completion_length": 257.75, |
| "epoch": 0.3968, |
| "grad_norm": 1.4971685409545898, |
| "kl": 0.0330810546875, |
| "learning_rate": 8.462499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9272462129592896, |
| "reward_std": 0.01983210165053606, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9303403496742249, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989891648292542, |
| "step": 124 |
| }, |
| { |
| "completion_length": 207.4375, |
| "epoch": 0.4, |
| "grad_norm": 1.9963277578353882, |
| "kl": 0.056396484375, |
| "learning_rate": 8.45e-07, |
| "loss": 0.0006, |
| "reward": 3.9006247520446777, |
| "reward_std": 0.030232679098844528, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.9261119067668915, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9803889393806458, |
| "step": 125 |
| }, |
| { |
| "completion_length": 246.875, |
| "epoch": 0.4032, |
| "grad_norm": 1.1950430870056152, |
| "kl": 0.03369140625, |
| "learning_rate": 8.4375e-07, |
| "loss": 0.0003, |
| "reward": 3.881152391433716, |
| "reward_std": 0.03120280895382166, |
| "rewards/answer_entity_reward": 0.9683753550052643, |
| "rewards/answer_wer_reward": 0.9131445586681366, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996323585510254, |
| "step": 126 |
| }, |
| { |
| "completion_length": 212.15625, |
| "epoch": 0.4064, |
| "grad_norm": 4.167364120483398, |
| "kl": 0.257568359375, |
| "learning_rate": 8.425e-07, |
| "loss": 0.0026, |
| "reward": 3.891525626182556, |
| "reward_std": 0.03758985735476017, |
| "rewards/answer_entity_reward": 0.9853896200656891, |
| "rewards/answer_wer_reward": 0.9100889563560486, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9960470795631409, |
| "step": 127 |
| }, |
| { |
| "completion_length": 215.5625, |
| "epoch": 0.4096, |
| "grad_norm": 1.2758169174194336, |
| "kl": 0.059326171875, |
| "learning_rate": 8.4125e-07, |
| "loss": 0.0006, |
| "reward": 3.8984569311141968, |
| "reward_std": 0.02103353524580598, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.9310561716556549, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9799006283283234, |
| "step": 128 |
| }, |
| { |
| "completion_length": 221.0, |
| "epoch": 0.4128, |
| "grad_norm": 1.6011369228363037, |
| "kl": 0.02734375, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.907585859298706, |
| "reward_std": 0.024174046237021685, |
| "rewards/answer_entity_reward": 0.9887152910232544, |
| "rewards/answer_wer_reward": 0.9191110134124756, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997596144676208, |
| "step": 129 |
| }, |
| { |
| "completion_length": 189.375, |
| "epoch": 0.416, |
| "grad_norm": 2.7846839427948, |
| "kl": 0.0413818359375, |
| "learning_rate": 8.387499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8641178607940674, |
| "reward_std": 0.03212345764040947, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9255104064941406, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9438523054122925, |
| "step": 130 |
| }, |
| { |
| "completion_length": 209.5, |
| "epoch": 0.4192, |
| "grad_norm": 4.144553184509277, |
| "kl": 0.0548095703125, |
| "learning_rate": 8.375e-07, |
| "loss": 0.0006, |
| "reward": 3.8618308305740356, |
| "reward_std": 0.07612445950508118, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9306082725524902, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9312225580215454, |
| "step": 131 |
| }, |
| { |
| "completion_length": 198.5, |
| "epoch": 0.4224, |
| "grad_norm": 2.663985013961792, |
| "kl": 0.04052734375, |
| "learning_rate": 8.3625e-07, |
| "loss": 0.0004, |
| "reward": 3.897012948989868, |
| "reward_std": 0.030758653301745653, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9326047897338867, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9782971143722534, |
| "step": 132 |
| }, |
| { |
| "completion_length": 180.78125, |
| "epoch": 0.4256, |
| "grad_norm": 2.2100954055786133, |
| "kl": 0.0439453125, |
| "learning_rate": 8.349999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.923304557800293, |
| "reward_std": 0.025213422253727913, |
| "rewards/answer_entity_reward": 0.9882478713989258, |
| "rewards/answer_wer_reward": 0.9360361397266388, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999020516872406, |
| "step": 133 |
| }, |
| { |
| "completion_length": 219.59375, |
| "epoch": 0.4288, |
| "grad_norm": 15.98015022277832, |
| "kl": 0.0645751953125, |
| "learning_rate": 8.3375e-07, |
| "loss": 0.0006, |
| "reward": 3.8721258640289307, |
| "reward_std": 0.02985560242086649, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9070867002010345, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9650392830371857, |
| "step": 134 |
| }, |
| { |
| "completion_length": 239.90625, |
| "epoch": 0.432, |
| "grad_norm": 3.754002332687378, |
| "kl": 0.0419921875, |
| "learning_rate": 8.325e-07, |
| "loss": 0.0004, |
| "reward": 3.8614091873168945, |
| "reward_std": 0.0724228248000145, |
| "rewards/answer_entity_reward": 0.9794008135795593, |
| "rewards/answer_wer_reward": 0.9043296277523041, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9776785373687744, |
| "step": 135 |
| }, |
| { |
| "completion_length": 228.09375, |
| "epoch": 0.4352, |
| "grad_norm": 2.609844207763672, |
| "kl": 0.037841796875, |
| "learning_rate": 8.3125e-07, |
| "loss": 0.0004, |
| "reward": 3.8617947101593018, |
| "reward_std": 0.021692313253879547, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.8795575797557831, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996126115322113, |
| "step": 136 |
| }, |
| { |
| "completion_length": 158.8125, |
| "epoch": 0.4384, |
| "grad_norm": 1.6180543899536133, |
| "kl": 0.055419921875, |
| "learning_rate": 8.299999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9137951135635376, |
| "reward_std": 0.020158007740974426, |
| "rewards/answer_entity_reward": 0.970695972442627, |
| "rewards/answer_wer_reward": 0.9480262100696564, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9950730204582214, |
| "step": 137 |
| }, |
| { |
| "completion_length": 231.25, |
| "epoch": 0.4416, |
| "grad_norm": 0.9336134195327759, |
| "kl": 0.03399658203125, |
| "learning_rate": 8.287499999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9351539611816406, |
| "reward_std": 0.014509289292618632, |
| "rewards/answer_entity_reward": 0.9934294819831848, |
| "rewards/answer_wer_reward": 0.9442258775234222, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974986016750336, |
| "step": 138 |
| }, |
| { |
| "completion_length": 220.34375, |
| "epoch": 0.4448, |
| "grad_norm": 21.355905532836914, |
| "kl": 0.059814453125, |
| "learning_rate": 8.275e-07, |
| "loss": 0.0006, |
| "reward": 3.863122820854187, |
| "reward_std": 0.060401469469070435, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9233364760875702, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9397862255573273, |
| "step": 139 |
| }, |
| { |
| "completion_length": 214.90625, |
| "epoch": 0.448, |
| "grad_norm": 1.280321478843689, |
| "kl": 0.052490234375, |
| "learning_rate": 8.2625e-07, |
| "loss": 0.0005, |
| "reward": 3.9231661558151245, |
| "reward_std": 0.009715312160551548, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9245247840881348, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998641312122345, |
| "step": 140 |
| }, |
| { |
| "completion_length": 211.375, |
| "epoch": 0.4512, |
| "grad_norm": 1.7492412328720093, |
| "kl": 0.062744140625, |
| "learning_rate": 8.249999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.88791024684906, |
| "reward_std": 0.011862037936225533, |
| "rewards/answer_entity_reward": 0.9832702279090881, |
| "rewards/answer_wer_reward": 0.957579493522644, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9470604658126831, |
| "step": 141 |
| }, |
| { |
| "completion_length": 246.3125, |
| "epoch": 0.4544, |
| "grad_norm": 2.37640118598938, |
| "kl": 0.0369873046875, |
| "learning_rate": 8.2375e-07, |
| "loss": 0.0004, |
| "reward": 3.944279909133911, |
| "reward_std": 0.011443465016782284, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9472803771495819, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969995319843292, |
| "step": 142 |
| }, |
| { |
| "completion_length": 199.90625, |
| "epoch": 0.4576, |
| "grad_norm": 2.8359158039093018, |
| "kl": 0.0540771484375, |
| "learning_rate": 8.225e-07, |
| "loss": 0.0005, |
| "reward": 3.93644380569458, |
| "reward_std": 0.023367811925709248, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9554752707481384, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9893018305301666, |
| "step": 143 |
| }, |
| { |
| "completion_length": 195.28125, |
| "epoch": 0.4608, |
| "grad_norm": 1.723976731300354, |
| "kl": 0.031982421875, |
| "learning_rate": 8.2125e-07, |
| "loss": 0.0003, |
| "reward": 3.9411680698394775, |
| "reward_std": 0.007689078338444233, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.941936582326889, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992315769195557, |
| "step": 144 |
| }, |
| { |
| "completion_length": 223.375, |
| "epoch": 0.464, |
| "grad_norm": 1.08156418800354, |
| "kl": 0.02874755859375, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": 0.0003, |
| "reward": 3.9059054851531982, |
| "reward_std": 0.007867377484217286, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9531411230564117, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9623797535896301, |
| "step": 145 |
| }, |
| { |
| "completion_length": 184.75, |
| "epoch": 0.4672, |
| "grad_norm": 1.7059741020202637, |
| "kl": 0.0400390625, |
| "learning_rate": 8.187499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.939697027206421, |
| "reward_std": 0.0070332614704966545, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9535529613494873, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9861441254615784, |
| "step": 146 |
| }, |
| { |
| "completion_length": 222.84375, |
| "epoch": 0.4704, |
| "grad_norm": 1.5283204317092896, |
| "kl": 0.072998046875, |
| "learning_rate": 8.175e-07, |
| "loss": 0.0007, |
| "reward": 3.843386173248291, |
| "reward_std": 0.02895416272804141, |
| "rewards/answer_entity_reward": 0.9304008483886719, |
| "rewards/answer_wer_reward": 0.9129853844642639, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 147 |
| }, |
| { |
| "completion_length": 165.25, |
| "epoch": 0.4736, |
| "grad_norm": 2.885890245437622, |
| "kl": 0.04193115234375, |
| "learning_rate": 8.1625e-07, |
| "loss": 0.0004, |
| "reward": 3.8639066219329834, |
| "reward_std": 0.01842296402901411, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9352113604545593, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9339398741722107, |
| "step": 148 |
| }, |
| { |
| "completion_length": 225.8125, |
| "epoch": 0.4768, |
| "grad_norm": 1.5893429517745972, |
| "kl": 0.0615234375, |
| "learning_rate": 8.149999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9009220600128174, |
| "reward_std": 0.022383708506822586, |
| "rewards/answer_entity_reward": 0.9967105388641357, |
| "rewards/answer_wer_reward": 0.9052460193634033, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989655911922455, |
| "step": 149 |
| }, |
| { |
| "completion_length": 236.21875, |
| "epoch": 0.48, |
| "grad_norm": 2.1324307918548584, |
| "kl": 0.0377197265625, |
| "learning_rate": 8.137499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8904128074645996, |
| "reward_std": 0.02841739635914564, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.8976494371891022, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997079372406006, |
| "step": 150 |
| }, |
| { |
| "completion_length": 213.15625, |
| "epoch": 0.4832, |
| "grad_norm": 0.9698525667190552, |
| "kl": 0.034423828125, |
| "learning_rate": 8.125e-07, |
| "loss": 0.0003, |
| "reward": 3.890373468399048, |
| "reward_std": 0.009532647207379341, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9459290206432343, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9444444477558136, |
| "step": 151 |
| }, |
| { |
| "completion_length": 250.21875, |
| "epoch": 0.4864, |
| "grad_norm": 4.16625452041626, |
| "kl": 0.198486328125, |
| "learning_rate": 8.1125e-07, |
| "loss": 0.002, |
| "reward": 3.8978230953216553, |
| "reward_std": 0.024048997554928064, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.9117782711982727, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985446929931641, |
| "step": 152 |
| }, |
| { |
| "completion_length": 174.15625, |
| "epoch": 0.4896, |
| "grad_norm": 2.9183833599090576, |
| "kl": 0.0716552734375, |
| "learning_rate": 8.1e-07, |
| "loss": 0.0007, |
| "reward": 3.908216118812561, |
| "reward_std": 0.032137976959347725, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.9441157281398773, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9745170772075653, |
| "step": 153 |
| }, |
| { |
| "completion_length": 187.03125, |
| "epoch": 0.4928, |
| "grad_norm": 1.039563536643982, |
| "kl": 0.0535888671875, |
| "learning_rate": 8.087499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.940076231956482, |
| "reward_std": 0.014994107652455568, |
| "rewards/answer_entity_reward": 0.9910714626312256, |
| "rewards/answer_wer_reward": 0.9499542117118835, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990506172180176, |
| "step": 154 |
| }, |
| { |
| "completion_length": 214.125, |
| "epoch": 0.496, |
| "grad_norm": 2.49003267288208, |
| "kl": 0.0635986328125, |
| "learning_rate": 8.075e-07, |
| "loss": 0.0006, |
| "reward": 3.850375175476074, |
| "reward_std": 0.026249381713569164, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8511867821216583, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991883039474487, |
| "step": 155 |
| }, |
| { |
| "completion_length": 214.8125, |
| "epoch": 0.4992, |
| "grad_norm": 2.7330820560455322, |
| "kl": 0.03717041015625, |
| "learning_rate": 8.0625e-07, |
| "loss": 0.0004, |
| "reward": 3.9070980548858643, |
| "reward_std": 0.04327901639044285, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9249836802482605, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9934781193733215, |
| "step": 156 |
| }, |
| { |
| "completion_length": 195.0625, |
| "epoch": 0.5024, |
| "grad_norm": 2.878744602203369, |
| "kl": 0.0828857421875, |
| "learning_rate": 8.05e-07, |
| "loss": 0.0008, |
| "reward": 3.9139277935028076, |
| "reward_std": 0.022999857552349567, |
| "rewards/answer_entity_reward": 0.9947916567325592, |
| "rewards/answer_wer_reward": 0.9313595592975616, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9877763986587524, |
| "step": 157 |
| }, |
| { |
| "completion_length": 216.625, |
| "epoch": 0.5056, |
| "grad_norm": 1.1287983655929565, |
| "kl": 0.049072265625, |
| "learning_rate": 8.037499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9037948846817017, |
| "reward_std": 0.011531218886375427, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9081907570362091, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9956042170524597, |
| "step": 158 |
| }, |
| { |
| "completion_length": 200.21875, |
| "epoch": 0.5088, |
| "grad_norm": 1.5555959939956665, |
| "kl": 0.0369873046875, |
| "learning_rate": 8.024999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9110556840896606, |
| "reward_std": 0.019422957440838218, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.9354503750801086, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9814814925193787, |
| "step": 159 |
| }, |
| { |
| "completion_length": 202.875, |
| "epoch": 0.512, |
| "grad_norm": 13.22675895690918, |
| "kl": 0.084228515625, |
| "learning_rate": 8.0125e-07, |
| "loss": 0.0008, |
| "reward": 3.8508609533309937, |
| "reward_std": 0.037849435582756996, |
| "rewards/answer_entity_reward": 0.9867424070835114, |
| "rewards/answer_wer_reward": 0.9194300472736359, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9446885287761688, |
| "step": 160 |
| }, |
| { |
| "completion_length": 187.5625, |
| "epoch": 0.5152, |
| "grad_norm": 1.9724727869033813, |
| "kl": 0.05126953125, |
| "learning_rate": 8e-07, |
| "loss": 0.0005, |
| "reward": 3.9261248111724854, |
| "reward_std": 0.02531399577856064, |
| "rewards/answer_entity_reward": 0.9882478713989258, |
| "rewards/answer_wer_reward": 0.9410728812217712, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9968039691448212, |
| "step": 161 |
| }, |
| { |
| "completion_length": 254.84375, |
| "epoch": 0.5184, |
| "grad_norm": 2.3500356674194336, |
| "kl": 0.05340576171875, |
| "learning_rate": 7.9875e-07, |
| "loss": 0.0005, |
| "reward": 3.910772919654846, |
| "reward_std": 0.04009111411869526, |
| "rewards/answer_entity_reward": 0.9747862815856934, |
| "rewards/answer_wer_reward": 0.9362366199493408, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999750018119812, |
| "step": 162 |
| }, |
| { |
| "completion_length": 206.625, |
| "epoch": 0.5216, |
| "grad_norm": 6.3654890060424805, |
| "kl": 0.069580078125, |
| "learning_rate": 7.975e-07, |
| "loss": 0.0007, |
| "reward": 3.805917978286743, |
| "reward_std": 0.052407728508114815, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9451808631420135, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8642092943191528, |
| "step": 163 |
| }, |
| { |
| "completion_length": 212.71875, |
| "epoch": 0.5248, |
| "grad_norm": 1.921622633934021, |
| "kl": 0.09283447265625, |
| "learning_rate": 7.9625e-07, |
| "loss": 0.0009, |
| "reward": 3.9235308170318604, |
| "reward_std": 0.022881922777742147, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.9401760995388031, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9896678924560547, |
| "step": 164 |
| }, |
| { |
| "completion_length": 234.5625, |
| "epoch": 0.528, |
| "grad_norm": 1.4160696268081665, |
| "kl": 0.061767578125, |
| "learning_rate": 7.95e-07, |
| "loss": 0.0006, |
| "reward": 3.890324354171753, |
| "reward_std": 0.014382836874574423, |
| "rewards/answer_entity_reward": 0.9653846025466919, |
| "rewards/answer_wer_reward": 0.9249398708343506, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 165 |
| }, |
| { |
| "completion_length": 223.0, |
| "epoch": 0.5312, |
| "grad_norm": 1.2775448560714722, |
| "kl": 0.0582275390625, |
| "learning_rate": 7.937499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9478421211242676, |
| "reward_std": 0.011931413784623146, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9481260776519775, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997159242630005, |
| "step": 166 |
| }, |
| { |
| "completion_length": 214.65625, |
| "epoch": 0.5344, |
| "grad_norm": 1.287255883216858, |
| "kl": 0.052734375, |
| "learning_rate": 7.924999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9042768478393555, |
| "reward_std": 0.02827941346913576, |
| "rewards/answer_entity_reward": 0.9787962138652802, |
| "rewards/answer_wer_reward": 0.925747811794281, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997329115867615, |
| "step": 167 |
| }, |
| { |
| "completion_length": 224.59375, |
| "epoch": 0.5376, |
| "grad_norm": 1.7952959537506104, |
| "kl": 0.0364990234375, |
| "learning_rate": 7.912499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.935611605644226, |
| "reward_std": 0.027386673726141453, |
| "rewards/answer_entity_reward": 0.9919143319129944, |
| "rewards/answer_wer_reward": 0.9439473152160645, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999750018119812, |
| "step": 168 |
| }, |
| { |
| "completion_length": 183.28125, |
| "epoch": 0.5408, |
| "grad_norm": 8.36503791809082, |
| "kl": 0.0848388671875, |
| "learning_rate": 7.9e-07, |
| "loss": 0.0008, |
| "reward": 3.8025405406951904, |
| "reward_std": 0.04630524106323719, |
| "rewards/answer_entity_reward": 0.9862637221813202, |
| "rewards/answer_wer_reward": 0.8270655274391174, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9892113208770752, |
| "step": 169 |
| }, |
| { |
| "completion_length": 235.15625, |
| "epoch": 0.544, |
| "grad_norm": 2.2816457748413086, |
| "kl": 0.0296630859375, |
| "learning_rate": 7.8875e-07, |
| "loss": 0.0003, |
| "reward": 3.934034824371338, |
| "reward_std": 0.009957955218851566, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9344717264175415, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995629191398621, |
| "step": 170 |
| }, |
| { |
| "completion_length": 247.53125, |
| "epoch": 0.5472, |
| "grad_norm": 1.6856052875518799, |
| "kl": 0.13134765625, |
| "learning_rate": 7.875e-07, |
| "loss": 0.0013, |
| "reward": 3.896223545074463, |
| "reward_std": 0.015339810401201248, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9109295010566711, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991828203201294, |
| "step": 171 |
| }, |
| { |
| "completion_length": 245.03125, |
| "epoch": 0.5504, |
| "grad_norm": 4.956347465515137, |
| "kl": 0.044921875, |
| "learning_rate": 7.8625e-07, |
| "loss": 0.0005, |
| "reward": 3.7271645069122314, |
| "reward_std": 0.21888091787695885, |
| "rewards/answer_entity_reward": 0.9630681872367859, |
| "rewards/answer_wer_reward": 0.8937070369720459, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9016393423080444, |
| "step": 172 |
| }, |
| { |
| "completion_length": 211.3125, |
| "epoch": 0.5536, |
| "grad_norm": 1.1714370250701904, |
| "kl": 0.0323486328125, |
| "learning_rate": 7.85e-07, |
| "loss": 0.0003, |
| "reward": 3.913045883178711, |
| "reward_std": 0.04143238253891468, |
| "rewards/answer_entity_reward": 0.9870130121707916, |
| "rewards/answer_wer_reward": 0.9331351518630981, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9928977191448212, |
| "step": 173 |
| }, |
| { |
| "completion_length": 272.1875, |
| "epoch": 0.5568, |
| "grad_norm": 1.2012341022491455, |
| "kl": 0.0413818359375, |
| "learning_rate": 7.837499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.876948356628418, |
| "reward_std": 0.03149130195379257, |
| "rewards/answer_entity_reward": 0.9889954328536987, |
| "rewards/answer_wer_reward": 0.9271560311317444, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9607969224452972, |
| "step": 174 |
| }, |
| { |
| "completion_length": 200.3125, |
| "epoch": 0.56, |
| "grad_norm": 2.998842477798462, |
| "kl": 0.067138671875, |
| "learning_rate": 7.824999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.8472641706466675, |
| "reward_std": 0.04471721313893795, |
| "rewards/answer_entity_reward": 0.9902146458625793, |
| "rewards/answer_wer_reward": 0.9358225166797638, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.921226978302002, |
| "step": 175 |
| }, |
| { |
| "completion_length": 207.03125, |
| "epoch": 0.5632, |
| "grad_norm": 10.961363792419434, |
| "kl": 0.0789794921875, |
| "learning_rate": 7.812499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.9478721618652344, |
| "reward_std": 0.027662259992212057, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9600406885147095, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9902353584766388, |
| "step": 176 |
| }, |
| { |
| "completion_length": 221.59375, |
| "epoch": 0.5664, |
| "grad_norm": 1.341109275817871, |
| "kl": 0.065185546875, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8582847118377686, |
| "reward_std": 0.041704089380800724, |
| "rewards/answer_entity_reward": 0.9775640964508057, |
| "rewards/answer_wer_reward": 0.9368657767772675, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9438548386096954, |
| "step": 177 |
| }, |
| { |
| "completion_length": 239.90625, |
| "epoch": 0.5696, |
| "grad_norm": 1.4057974815368652, |
| "kl": 0.045166015625, |
| "learning_rate": 7.787500000000001e-07, |
| "loss": 0.0005, |
| "reward": 3.9274110794067383, |
| "reward_std": 0.02352920500561595, |
| "rewards/answer_entity_reward": 0.9946895241737366, |
| "rewards/answer_wer_reward": 0.9349404275417328, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977810680866241, |
| "step": 178 |
| }, |
| { |
| "completion_length": 211.78125, |
| "epoch": 0.5728, |
| "grad_norm": 2.9184887409210205, |
| "kl": 0.031982421875, |
| "learning_rate": 7.775e-07, |
| "loss": 0.0003, |
| "reward": 3.945718765258789, |
| "reward_std": 0.01779081765562296, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9512039721012115, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973557591438293, |
| "step": 179 |
| }, |
| { |
| "completion_length": 204.375, |
| "epoch": 0.576, |
| "grad_norm": 113.12403869628906, |
| "kl": 0.05322265625, |
| "learning_rate": 7.7625e-07, |
| "loss": 0.0005, |
| "reward": 3.8825124502182007, |
| "reward_std": 0.07031127344816923, |
| "rewards/answer_entity_reward": 0.9926734566688538, |
| "rewards/answer_wer_reward": 0.9367940425872803, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9530448913574219, |
| "step": 180 |
| }, |
| { |
| "completion_length": 214.75, |
| "epoch": 0.5792, |
| "grad_norm": 1.3515021800994873, |
| "kl": 0.0609130859375, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0006, |
| "reward": 3.920071840286255, |
| "reward_std": 0.011316743912175298, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9225669503211975, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975048303604126, |
| "step": 181 |
| }, |
| { |
| "completion_length": 205.0625, |
| "epoch": 0.5824, |
| "grad_norm": 1.5749711990356445, |
| "kl": 0.054443359375, |
| "learning_rate": 7.7375e-07, |
| "loss": 0.0005, |
| "reward": 3.921678900718689, |
| "reward_std": 0.013327162247151136, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9460242688655853, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9780585169792175, |
| "step": 182 |
| }, |
| { |
| "completion_length": 217.75, |
| "epoch": 0.5856, |
| "grad_norm": 0.7737219929695129, |
| "kl": 0.0469970703125, |
| "learning_rate": 7.724999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9334832429885864, |
| "reward_std": 0.020406807772815228, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.938728004693985, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 183 |
| }, |
| { |
| "completion_length": 231.59375, |
| "epoch": 0.5888, |
| "grad_norm": 1.6825175285339355, |
| "kl": 0.0543212890625, |
| "learning_rate": 7.712499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.938681125640869, |
| "reward_std": 0.017365658190101385, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9413779377937317, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991414844989777, |
| "step": 184 |
| }, |
| { |
| "completion_length": 239.71875, |
| "epoch": 0.592, |
| "grad_norm": 1.3427449464797974, |
| "kl": 0.058837890625, |
| "learning_rate": 7.699999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9066988229751587, |
| "reward_std": 0.020341036841273308, |
| "rewards/answer_entity_reward": 0.9776557087898254, |
| "rewards/answer_wer_reward": 0.929761528968811, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992816150188446, |
| "step": 185 |
| }, |
| { |
| "completion_length": 133.90625, |
| "epoch": 0.5952, |
| "grad_norm": 4.991705417633057, |
| "kl": 0.0623779296875, |
| "learning_rate": 7.6875e-07, |
| "loss": 0.0006, |
| "reward": 3.926753878593445, |
| "reward_std": 0.023914007004350424, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9629489779472351, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9638049006462097, |
| "step": 186 |
| }, |
| { |
| "completion_length": 234.625, |
| "epoch": 0.5984, |
| "grad_norm": 2.8712401390075684, |
| "kl": 0.096435546875, |
| "learning_rate": 7.675e-07, |
| "loss": 0.001, |
| "reward": 3.872377395629883, |
| "reward_std": 0.06525835767388344, |
| "rewards/answer_entity_reward": 0.9841803908348083, |
| "rewards/answer_wer_reward": 0.9093597233295441, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9788371920585632, |
| "step": 187 |
| }, |
| { |
| "completion_length": 225.4375, |
| "epoch": 0.6016, |
| "grad_norm": 2.3115170001983643, |
| "kl": 0.055419921875, |
| "learning_rate": 7.6625e-07, |
| "loss": 0.0006, |
| "reward": 3.9362770318984985, |
| "reward_std": 0.019690027460455894, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9422614872455597, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997487872838974, |
| "step": 188 |
| }, |
| { |
| "completion_length": 214.15625, |
| "epoch": 0.6048, |
| "grad_norm": 3.583329677581787, |
| "kl": 0.0550537109375, |
| "learning_rate": 7.65e-07, |
| "loss": 0.0005, |
| "reward": 3.9327969551086426, |
| "reward_std": 0.014218965079635382, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9424121379852295, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 189 |
| }, |
| { |
| "completion_length": 249.15625, |
| "epoch": 0.608, |
| "grad_norm": 1.4651848077774048, |
| "kl": 0.052001953125, |
| "learning_rate": 7.6375e-07, |
| "loss": 0.0005, |
| "reward": 3.941069722175598, |
| "reward_std": 0.009663278236985207, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9507163166999817, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977063536643982, |
| "step": 190 |
| }, |
| { |
| "completion_length": 197.84375, |
| "epoch": 0.6112, |
| "grad_norm": 1.4688224792480469, |
| "kl": 0.0577392578125, |
| "learning_rate": 7.624999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9300395250320435, |
| "reward_std": 0.014806594932451844, |
| "rewards/answer_entity_reward": 0.984722226858139, |
| "rewards/answer_wer_reward": 0.9455022215843201, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9998151063919067, |
| "step": 191 |
| }, |
| { |
| "completion_length": 254.6875, |
| "epoch": 0.6144, |
| "grad_norm": 1.1648938655853271, |
| "kl": 0.0589599609375, |
| "learning_rate": 7.612499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9228453636169434, |
| "reward_std": 0.026355463080108166, |
| "rewards/answer_entity_reward": 0.9819444715976715, |
| "rewards/answer_wer_reward": 0.9418983161449432, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990026652812958, |
| "step": 192 |
| }, |
| { |
| "completion_length": 264.34375, |
| "epoch": 0.6176, |
| "grad_norm": 1.2595146894454956, |
| "kl": 0.0635986328125, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9068782329559326, |
| "reward_std": 0.02374061942100525, |
| "rewards/answer_entity_reward": 0.9758522510528564, |
| "rewards/answer_wer_reward": 0.9392839670181274, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9917418956756592, |
| "step": 193 |
| }, |
| { |
| "completion_length": 226.875, |
| "epoch": 0.6208, |
| "grad_norm": 3.0049514770507812, |
| "kl": 0.065185546875, |
| "learning_rate": 7.5875e-07, |
| "loss": 0.0007, |
| "reward": 3.9182554483413696, |
| "reward_std": 0.028174775652587414, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9239371716976166, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 194 |
| }, |
| { |
| "completion_length": 233.90625, |
| "epoch": 0.624, |
| "grad_norm": 3.6226987838745117, |
| "kl": 0.14013671875, |
| "learning_rate": 7.575e-07, |
| "loss": 0.0014, |
| "reward": 3.917691946029663, |
| "reward_std": 0.015854593832045794, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9359965324401855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99558424949646, |
| "step": 195 |
| }, |
| { |
| "completion_length": 228.96875, |
| "epoch": 0.6272, |
| "grad_norm": 3.1564576625823975, |
| "kl": 0.03131103515625, |
| "learning_rate": 7.5625e-07, |
| "loss": 0.0003, |
| "reward": 3.8988983631134033, |
| "reward_std": 0.04383570794016123, |
| "rewards/answer_entity_reward": 0.980654776096344, |
| "rewards/answer_wer_reward": 0.9372455775737762, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9809979796409607, |
| "step": 196 |
| }, |
| { |
| "completion_length": 235.875, |
| "epoch": 0.6304, |
| "grad_norm": 1.3267861604690552, |
| "kl": 0.052978515625, |
| "learning_rate": 7.55e-07, |
| "loss": 0.0005, |
| "reward": 3.9319225549697876, |
| "reward_std": 0.02372880419716239, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9346356689929962, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999690592288971, |
| "step": 197 |
| }, |
| { |
| "completion_length": 162.34375, |
| "epoch": 0.6336, |
| "grad_norm": 1.4438445568084717, |
| "kl": 0.065185546875, |
| "learning_rate": 7.5375e-07, |
| "loss": 0.0006, |
| "reward": 3.8535887002944946, |
| "reward_std": 0.041104525327682495, |
| "rewards/answer_entity_reward": 0.9681412279605865, |
| "rewards/answer_wer_reward": 0.9683326184749603, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9171150028705597, |
| "step": 198 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 0.6368, |
| "grad_norm": 4.674152374267578, |
| "kl": 0.050048828125, |
| "learning_rate": 7.524999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.938958764076233, |
| "reward_std": 0.01455747289583087, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9664872884750366, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.974875271320343, |
| "step": 199 |
| }, |
| { |
| "completion_length": 230.625, |
| "epoch": 0.64, |
| "grad_norm": 1.899129867553711, |
| "kl": 0.0535888671875, |
| "learning_rate": 7.512499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9438642263412476, |
| "reward_std": 0.014077516738325357, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.952812910079956, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938922822475433, |
| "step": 200 |
| }, |
| { |
| "completion_length": 212.4375, |
| "epoch": 0.6432, |
| "grad_norm": 1.8970869779586792, |
| "kl": 0.0460205078125, |
| "learning_rate": 7.5e-07, |
| "loss": 0.0005, |
| "reward": 3.9026511907577515, |
| "reward_std": 0.038714910857379436, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.911726325750351, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992582499980927, |
| "step": 201 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 0.6464, |
| "grad_norm": 2.5214030742645264, |
| "kl": 0.083251953125, |
| "learning_rate": 7.4875e-07, |
| "loss": 0.0008, |
| "reward": 3.9040462970733643, |
| "reward_std": 0.016587836667895317, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9761527180671692, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9278934299945831, |
| "step": 202 |
| }, |
| { |
| "completion_length": 216.375, |
| "epoch": 0.6496, |
| "grad_norm": 4.072224140167236, |
| "kl": 0.053955078125, |
| "learning_rate": 7.475e-07, |
| "loss": 0.0005, |
| "reward": 3.9431036710739136, |
| "reward_std": 0.020094456151127815, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.949131965637207, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996812641620636, |
| "step": 203 |
| }, |
| { |
| "completion_length": 221.0, |
| "epoch": 0.6528, |
| "grad_norm": 3.3709828853607178, |
| "kl": 0.070556640625, |
| "learning_rate": 7.4625e-07, |
| "loss": 0.0007, |
| "reward": 3.8844679594039917, |
| "reward_std": 0.05386691028252244, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.934579610824585, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9498883783817291, |
| "step": 204 |
| }, |
| { |
| "completion_length": 195.9375, |
| "epoch": 0.656, |
| "grad_norm": 2.4978103637695312, |
| "kl": 0.0775146484375, |
| "learning_rate": 7.45e-07, |
| "loss": 0.0008, |
| "reward": 3.9303336143493652, |
| "reward_std": 0.04689153959043324, |
| "rewards/answer_entity_reward": 0.9804924428462982, |
| "rewards/answer_wer_reward": 0.9526000618934631, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9972410500049591, |
| "step": 205 |
| }, |
| { |
| "completion_length": 256.875, |
| "epoch": 0.6592, |
| "grad_norm": 2.3422584533691406, |
| "kl": 0.1229248046875, |
| "learning_rate": 7.4375e-07, |
| "loss": 0.0012, |
| "reward": 3.9243087768554688, |
| "reward_std": 0.019790570251643658, |
| "rewards/answer_entity_reward": 0.9764957129955292, |
| "rewards/answer_wer_reward": 0.9478131830692291, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 206 |
| }, |
| { |
| "completion_length": 204.15625, |
| "epoch": 0.6624, |
| "grad_norm": 2.19623064994812, |
| "kl": 0.0550537109375, |
| "learning_rate": 7.425e-07, |
| "loss": 0.0006, |
| "reward": 3.936911940574646, |
| "reward_std": 0.02031032182276249, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9463189840316772, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9905929565429688, |
| "step": 207 |
| }, |
| { |
| "completion_length": 225.21875, |
| "epoch": 0.6656, |
| "grad_norm": 5.279341220855713, |
| "kl": 0.0498046875, |
| "learning_rate": 7.412499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.915460228919983, |
| "reward_std": 0.015285669825971127, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9175935089588165, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978667497634888, |
| "step": 208 |
| }, |
| { |
| "completion_length": 188.78125, |
| "epoch": 0.6688, |
| "grad_norm": 3.7716915607452393, |
| "kl": 0.0576171875, |
| "learning_rate": 7.4e-07, |
| "loss": 0.0006, |
| "reward": 3.8296241760253906, |
| "reward_std": 0.017440371215343475, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9421272277832031, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8899007737636566, |
| "step": 209 |
| }, |
| { |
| "completion_length": 203.28125, |
| "epoch": 0.672, |
| "grad_norm": 1.2790639400482178, |
| "kl": 0.0582275390625, |
| "learning_rate": 7.3875e-07, |
| "loss": 0.0006, |
| "reward": 3.952346086502075, |
| "reward_std": 0.007349871098995209, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.969746857881546, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9825991690158844, |
| "step": 210 |
| }, |
| { |
| "completion_length": 196.1875, |
| "epoch": 0.6752, |
| "grad_norm": 14.005128860473633, |
| "kl": 0.0604248046875, |
| "learning_rate": 7.375e-07, |
| "loss": 0.0006, |
| "reward": 3.8537105321884155, |
| "reward_std": 0.012695960700511932, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9704558551311493, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8832548260688782, |
| "step": 211 |
| }, |
| { |
| "completion_length": 159.3125, |
| "epoch": 0.6784, |
| "grad_norm": 4.394070625305176, |
| "kl": 0.068115234375, |
| "learning_rate": 7.362499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9123398065567017, |
| "reward_std": 0.02882718201726675, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9466139674186707, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.965725839138031, |
| "step": 212 |
| }, |
| { |
| "completion_length": 238.75, |
| "epoch": 0.6816, |
| "grad_norm": 5.395397663116455, |
| "kl": 0.041748046875, |
| "learning_rate": 7.35e-07, |
| "loss": 0.0004, |
| "reward": 3.89706289768219, |
| "reward_std": 0.0131816565990448, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9062366485595703, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993489682674408, |
| "step": 213 |
| }, |
| { |
| "completion_length": 255.65625, |
| "epoch": 0.6848, |
| "grad_norm": 1.9760891199111938, |
| "kl": 0.03961181640625, |
| "learning_rate": 7.3375e-07, |
| "loss": 0.0004, |
| "reward": 3.917116641998291, |
| "reward_std": 0.04898790689185262, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9182944297790527, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988220930099487, |
| "step": 214 |
| }, |
| { |
| "completion_length": 165.75, |
| "epoch": 0.688, |
| "grad_norm": 2.763314723968506, |
| "kl": 0.0577392578125, |
| "learning_rate": 7.325e-07, |
| "loss": 0.0006, |
| "reward": 3.952502489089966, |
| "reward_std": 0.016542275436222553, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9569029808044434, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990717768669128, |
| "step": 215 |
| }, |
| { |
| "completion_length": 215.625, |
| "epoch": 0.6912, |
| "grad_norm": 7.516313552856445, |
| "kl": 0.0439453125, |
| "learning_rate": 7.312499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9650633335113525, |
| "reward_std": 0.015061032958328724, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9679040908813477, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 216 |
| }, |
| { |
| "completion_length": 227.84375, |
| "epoch": 0.6944, |
| "grad_norm": 1.8075324296951294, |
| "kl": 0.0511474609375, |
| "learning_rate": 7.3e-07, |
| "loss": 0.0005, |
| "reward": 3.9209293127059937, |
| "reward_std": 0.01800437457859516, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9266109764575958, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 217 |
| }, |
| { |
| "completion_length": 213.625, |
| "epoch": 0.6976, |
| "grad_norm": 5.917069911956787, |
| "kl": 0.0426025390625, |
| "learning_rate": 7.2875e-07, |
| "loss": 0.0004, |
| "reward": 3.9082109928131104, |
| "reward_std": 0.07417950965464115, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9089923202991486, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999218761920929, |
| "step": 218 |
| }, |
| { |
| "completion_length": 228.6875, |
| "epoch": 0.7008, |
| "grad_norm": 1.1044409275054932, |
| "kl": 0.0531005859375, |
| "learning_rate": 7.275e-07, |
| "loss": 0.0005, |
| "reward": 3.908870220184326, |
| "reward_std": 0.016815255396068096, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9117993116378784, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994747638702393, |
| "step": 219 |
| }, |
| { |
| "completion_length": 199.125, |
| "epoch": 0.704, |
| "grad_norm": 3.019407272338867, |
| "kl": 0.058837890625, |
| "learning_rate": 7.262499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.925763249397278, |
| "reward_std": 0.01313594076782465, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9272693395614624, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984939694404602, |
| "step": 220 |
| }, |
| { |
| "completion_length": 210.65625, |
| "epoch": 0.7072, |
| "grad_norm": 2.7719058990478516, |
| "kl": 0.0377197265625, |
| "learning_rate": 7.249999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.8708763122558594, |
| "reward_std": 0.028095172019675374, |
| "rewards/answer_entity_reward": 0.9812500178813934, |
| "rewards/answer_wer_reward": 0.9290285110473633, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.960597813129425, |
| "step": 221 |
| }, |
| { |
| "completion_length": 199.6875, |
| "epoch": 0.7104, |
| "grad_norm": 2.267350435256958, |
| "kl": 0.0660400390625, |
| "learning_rate": 7.2375e-07, |
| "loss": 0.0006, |
| "reward": 3.9580957889556885, |
| "reward_std": 0.03087126836180687, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9705802798271179, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9899193644523621, |
| "step": 222 |
| }, |
| { |
| "completion_length": 181.8125, |
| "epoch": 0.7136, |
| "grad_norm": 8.685694694519043, |
| "kl": 0.081787109375, |
| "learning_rate": 7.225e-07, |
| "loss": 0.0008, |
| "reward": 3.8902955055236816, |
| "reward_std": 0.011068197898566723, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9720200002193451, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9182755053043365, |
| "step": 223 |
| }, |
| { |
| "completion_length": 185.4375, |
| "epoch": 0.7168, |
| "grad_norm": 2.514770746231079, |
| "kl": 0.0609130859375, |
| "learning_rate": 7.212499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9320486783981323, |
| "reward_std": 0.033941914327442646, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9598598778247833, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.972188800573349, |
| "step": 224 |
| }, |
| { |
| "completion_length": 250.84375, |
| "epoch": 0.72, |
| "grad_norm": 1.7914812564849854, |
| "kl": 0.03045654296875, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0003, |
| "reward": 3.8908780813217163, |
| "reward_std": 0.03203156217932701, |
| "rewards/answer_entity_reward": 0.9678819179534912, |
| "rewards/answer_wer_reward": 0.9238358736038208, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991601407527924, |
| "step": 225 |
| }, |
| { |
| "completion_length": 249.125, |
| "epoch": 0.7232, |
| "grad_norm": 4.627202987670898, |
| "kl": 0.0531005859375, |
| "learning_rate": 7.1875e-07, |
| "loss": 0.0005, |
| "reward": 3.899629235267639, |
| "reward_std": 0.06726673897355795, |
| "rewards/answer_entity_reward": 0.9953208565711975, |
| "rewards/answer_wer_reward": 0.9247469902038574, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9795613586902618, |
| "step": 226 |
| }, |
| { |
| "completion_length": 214.40625, |
| "epoch": 0.7264, |
| "grad_norm": 1.942586064338684, |
| "kl": 0.0352783203125, |
| "learning_rate": 7.175e-07, |
| "loss": 0.0003, |
| "reward": 3.959649443626404, |
| "reward_std": 0.01394367078319192, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9649502038955688, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975401163101196, |
| "step": 227 |
| }, |
| { |
| "completion_length": 182.59375, |
| "epoch": 0.7296, |
| "grad_norm": 3.191298246383667, |
| "kl": 0.055419921875, |
| "learning_rate": 7.1625e-07, |
| "loss": 0.0005, |
| "reward": 3.9260960817337036, |
| "reward_std": 0.021659906953573227, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9576999247074127, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9712370932102203, |
| "step": 228 |
| }, |
| { |
| "completion_length": 212.53125, |
| "epoch": 0.7328, |
| "grad_norm": 1.0323834419250488, |
| "kl": 0.0533447265625, |
| "learning_rate": 7.149999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.939168095588684, |
| "reward_std": 0.009458722081035376, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9402457773685455, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989224076271057, |
| "step": 229 |
| }, |
| { |
| "completion_length": 187.8125, |
| "epoch": 0.736, |
| "grad_norm": 4.53863000869751, |
| "kl": 0.050537109375, |
| "learning_rate": 7.137499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.893386960029602, |
| "reward_std": 0.03008814249187708, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.9532185792922974, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.946044385433197, |
| "step": 230 |
| }, |
| { |
| "completion_length": 235.5, |
| "epoch": 0.7392, |
| "grad_norm": 2.1737990379333496, |
| "kl": 0.0477294921875, |
| "learning_rate": 7.125e-07, |
| "loss": 0.0005, |
| "reward": 3.8995944261550903, |
| "reward_std": 0.021292359568178654, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9127146005630493, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9868797659873962, |
| "step": 231 |
| }, |
| { |
| "completion_length": 230.625, |
| "epoch": 0.7424, |
| "grad_norm": 0.8920266628265381, |
| "kl": 0.02874755859375, |
| "learning_rate": 7.1125e-07, |
| "loss": 0.0003, |
| "reward": 3.9383678436279297, |
| "reward_std": 0.008275180356577039, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9394271969795227, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989406764507294, |
| "step": 232 |
| }, |
| { |
| "completion_length": 196.125, |
| "epoch": 0.7456, |
| "grad_norm": 2.1836190223693848, |
| "kl": 0.06640625, |
| "learning_rate": 7.1e-07, |
| "loss": 0.0007, |
| "reward": 3.9469913244247437, |
| "reward_std": 0.01094681373797357, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9498908519744873, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9971005320549011, |
| "step": 233 |
| }, |
| { |
| "completion_length": 200.6875, |
| "epoch": 0.7488, |
| "grad_norm": 1.5529507398605347, |
| "kl": 0.041748046875, |
| "learning_rate": 7.0875e-07, |
| "loss": 0.0004, |
| "reward": 3.8839221000671387, |
| "reward_std": 0.02069476176984608, |
| "rewards/answer_entity_reward": 0.9841346144676208, |
| "rewards/answer_wer_reward": 0.9540095031261444, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.945777952671051, |
| "step": 234 |
| }, |
| { |
| "completion_length": 222.90625, |
| "epoch": 0.752, |
| "grad_norm": 17.55677604675293, |
| "kl": 0.061767578125, |
| "learning_rate": 7.075e-07, |
| "loss": 0.0006, |
| "reward": 3.92560076713562, |
| "reward_std": 0.03323593852110207, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9402145445346832, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.989062488079071, |
| "step": 235 |
| }, |
| { |
| "completion_length": 195.0625, |
| "epoch": 0.7552, |
| "grad_norm": 1.7806612253189087, |
| "kl": 0.056640625, |
| "learning_rate": 7.0625e-07, |
| "loss": 0.0006, |
| "reward": 3.9366722106933594, |
| "reward_std": 0.02212852332741022, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9515082538127899, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9886362552642822, |
| "step": 236 |
| }, |
| { |
| "completion_length": 224.34375, |
| "epoch": 0.7584, |
| "grad_norm": 3.0402088165283203, |
| "kl": 0.0352783203125, |
| "learning_rate": 7.049999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.947329044342041, |
| "reward_std": 0.011976622510701418, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.961329847574234, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.988840252161026, |
| "step": 237 |
| }, |
| { |
| "completion_length": 223.53125, |
| "epoch": 0.7616, |
| "grad_norm": 2.889293670654297, |
| "kl": 0.0616455078125, |
| "learning_rate": 7.037499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9246891736984253, |
| "reward_std": 0.05990536604076624, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9530621469020844, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9750992059707642, |
| "step": 238 |
| }, |
| { |
| "completion_length": 184.78125, |
| "epoch": 0.7648, |
| "grad_norm": 1.2427425384521484, |
| "kl": 0.0623779296875, |
| "learning_rate": 7.024999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.957573890686035, |
| "reward_std": 0.005278389900922775, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9649269282817841, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 239 |
| }, |
| { |
| "completion_length": 236.28125, |
| "epoch": 0.768, |
| "grad_norm": 2.361463785171509, |
| "kl": 0.0545654296875, |
| "learning_rate": 7.0125e-07, |
| "loss": 0.0005, |
| "reward": 3.9197674989700317, |
| "reward_std": 0.02553732506930828, |
| "rewards/answer_entity_reward": 0.9834134578704834, |
| "rewards/answer_wer_reward": 0.9363541007041931, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 240 |
| }, |
| { |
| "completion_length": 174.0, |
| "epoch": 0.7712, |
| "grad_norm": 2.3930962085723877, |
| "kl": 0.05926513671875, |
| "learning_rate": 7e-07, |
| "loss": 0.0006, |
| "reward": 3.9211114645004272, |
| "reward_std": 0.008784215082414448, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9724419414997101, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9486694633960724, |
| "step": 241 |
| }, |
| { |
| "completion_length": 254.59375, |
| "epoch": 0.7744, |
| "grad_norm": 1.6553773880004883, |
| "kl": 0.0389404296875, |
| "learning_rate": 6.9875e-07, |
| "loss": 0.0004, |
| "reward": 3.929746985435486, |
| "reward_std": 0.012057055719196796, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9313917756080627, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983552694320679, |
| "step": 242 |
| }, |
| { |
| "completion_length": 235.375, |
| "epoch": 0.7776, |
| "grad_norm": 0.8029008507728577, |
| "kl": 0.04083251953125, |
| "learning_rate": 6.975e-07, |
| "loss": 0.0004, |
| "reward": 3.9153066873550415, |
| "reward_std": 0.005760843865573406, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9309280216693878, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9927119016647339, |
| "step": 243 |
| }, |
| { |
| "completion_length": 186.78125, |
| "epoch": 0.7808, |
| "grad_norm": 3.1181294918060303, |
| "kl": 0.0732421875, |
| "learning_rate": 6.9625e-07, |
| "loss": 0.0007, |
| "reward": 3.9115726947784424, |
| "reward_std": 0.007224578293971717, |
| "rewards/answer_entity_reward": 0.9707792401313782, |
| "rewards/answer_wer_reward": 0.940793514251709, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 244 |
| }, |
| { |
| "completion_length": 223.6875, |
| "epoch": 0.784, |
| "grad_norm": 1.3839703798294067, |
| "kl": 0.0380859375, |
| "learning_rate": 6.949999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9361883401870728, |
| "reward_std": 0.012964933644980192, |
| "rewards/answer_entity_reward": 0.9818618893623352, |
| "rewards/answer_wer_reward": 0.9550732672214508, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999253123998642, |
| "step": 245 |
| }, |
| { |
| "completion_length": 222.53125, |
| "epoch": 0.7872, |
| "grad_norm": 3.1735548973083496, |
| "kl": 0.072509765625, |
| "learning_rate": 6.937499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9446396827697754, |
| "reward_std": 0.023095417767763138, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9603613913059235, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9946948885917664, |
| "step": 246 |
| }, |
| { |
| "completion_length": 217.4375, |
| "epoch": 0.7904, |
| "grad_norm": 1.185796856880188, |
| "kl": 0.042236328125, |
| "learning_rate": 6.924999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9417611360549927, |
| "reward_std": 0.013147154357284307, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9470057189464569, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9947555065155029, |
| "step": 247 |
| }, |
| { |
| "completion_length": 240.59375, |
| "epoch": 0.7936, |
| "grad_norm": 2.088177442550659, |
| "kl": 0.0504150390625, |
| "learning_rate": 6.9125e-07, |
| "loss": 0.0005, |
| "reward": 3.9391993284225464, |
| "reward_std": 0.015122740995138884, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9413229823112488, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9978764653205872, |
| "step": 248 |
| }, |
| { |
| "completion_length": 251.8125, |
| "epoch": 0.7968, |
| "grad_norm": 1.0327165126800537, |
| "kl": 0.0439453125, |
| "learning_rate": 6.9e-07, |
| "loss": 0.0004, |
| "reward": 3.928339123725891, |
| "reward_std": 0.014733773190528154, |
| "rewards/answer_entity_reward": 0.9895104765892029, |
| "rewards/answer_wer_reward": 0.9401907324790955, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986380338668823, |
| "step": 249 |
| }, |
| { |
| "completion_length": 202.125, |
| "epoch": 0.8, |
| "grad_norm": 1.0536175966262817, |
| "kl": 0.0443115234375, |
| "learning_rate": 6.8875e-07, |
| "loss": 0.0004, |
| "reward": 3.9324183464050293, |
| "reward_std": 0.018241871614009142, |
| "rewards/answer_entity_reward": 0.9873737692832947, |
| "rewards/answer_wer_reward": 0.9567070603370667, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9883374869823456, |
| "step": 250 |
| }, |
| { |
| "completion_length": 231.59375, |
| "epoch": 0.8032, |
| "grad_norm": 1.8605543375015259, |
| "kl": 0.0467529296875, |
| "learning_rate": 6.875e-07, |
| "loss": 0.0005, |
| "reward": 3.9515386819839478, |
| "reward_std": 0.014535096473991871, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9524115920066833, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991269707679749, |
| "step": 251 |
| }, |
| { |
| "completion_length": 202.8125, |
| "epoch": 0.8064, |
| "grad_norm": 1.7101868391036987, |
| "kl": 0.0673828125, |
| "learning_rate": 6.8625e-07, |
| "loss": 0.0007, |
| "reward": 3.947361946105957, |
| "reward_std": 0.01079330500215292, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9485193192958832, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988425970077515, |
| "step": 252 |
| }, |
| { |
| "completion_length": 194.4375, |
| "epoch": 0.8096, |
| "grad_norm": 1.6060519218444824, |
| "kl": 0.0518798828125, |
| "learning_rate": 6.85e-07, |
| "loss": 0.0005, |
| "reward": 3.8238483667373657, |
| "reward_std": 0.09831315139308572, |
| "rewards/answer_entity_reward": 0.9366161823272705, |
| "rewards/answer_wer_reward": 0.888142466545105, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990898072719574, |
| "step": 253 |
| }, |
| { |
| "completion_length": 231.71875, |
| "epoch": 0.8128, |
| "grad_norm": 1.4323464632034302, |
| "kl": 0.04559326171875, |
| "learning_rate": 6.837499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9585113525390625, |
| "reward_std": 0.009139138273894787, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9591011703014374, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994101524353027, |
| "step": 254 |
| }, |
| { |
| "completion_length": 242.15625, |
| "epoch": 0.816, |
| "grad_norm": 1.638405442237854, |
| "kl": 0.0592041015625, |
| "learning_rate": 6.824999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.938191056251526, |
| "reward_std": 0.015181098598986864, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9465242922306061, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 255 |
| }, |
| { |
| "completion_length": 178.96875, |
| "epoch": 0.8192, |
| "grad_norm": 2.906489133834839, |
| "kl": 0.07958984375, |
| "learning_rate": 6.8125e-07, |
| "loss": 0.0008, |
| "reward": 3.9418115615844727, |
| "reward_std": 0.024727396899834275, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9549268186092377, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9925665557384491, |
| "step": 256 |
| }, |
| { |
| "completion_length": 191.59375, |
| "epoch": 0.8224, |
| "grad_norm": 4.772871494293213, |
| "kl": 0.271484375, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0027, |
| "reward": 3.9085776805877686, |
| "reward_std": 0.01904244115576148, |
| "rewards/answer_entity_reward": 0.9866071343421936, |
| "rewards/answer_wer_reward": 0.9542762637138367, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9676942825317383, |
| "step": 257 |
| }, |
| { |
| "completion_length": 192.78125, |
| "epoch": 0.8256, |
| "grad_norm": 2.3399181365966797, |
| "kl": 0.081787109375, |
| "learning_rate": 6.7875e-07, |
| "loss": 0.0008, |
| "reward": 3.930221199989319, |
| "reward_std": 0.014671812066808343, |
| "rewards/answer_entity_reward": 0.9867201447486877, |
| "rewards/answer_wer_reward": 0.9438917338848114, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996093809604645, |
| "step": 258 |
| }, |
| { |
| "completion_length": 187.5, |
| "epoch": 0.8288, |
| "grad_norm": 9.805069923400879, |
| "kl": 0.072265625, |
| "learning_rate": 6.775e-07, |
| "loss": 0.0007, |
| "reward": 3.939017653465271, |
| "reward_std": 0.016680479515343904, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9445324242115021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 259 |
| }, |
| { |
| "completion_length": 234.5625, |
| "epoch": 0.832, |
| "grad_norm": 1.5217561721801758, |
| "kl": 0.0516357421875, |
| "learning_rate": 6.7625e-07, |
| "loss": 0.0005, |
| "reward": 3.922031283378601, |
| "reward_std": 0.01609009224921465, |
| "rewards/answer_entity_reward": 0.9681277275085449, |
| "rewards/answer_wer_reward": 0.9539035856723785, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 260 |
| }, |
| { |
| "completion_length": 159.0, |
| "epoch": 0.8352, |
| "grad_norm": 2.5927042961120605, |
| "kl": 0.0557861328125, |
| "learning_rate": 6.75e-07, |
| "loss": 0.0006, |
| "reward": 3.9503369331359863, |
| "reward_std": 0.004757039016112685, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9792385697364807, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9710983335971832, |
| "step": 261 |
| }, |
| { |
| "completion_length": 222.15625, |
| "epoch": 0.8384, |
| "grad_norm": 1.9485008716583252, |
| "kl": 0.0928955078125, |
| "learning_rate": 6.737499999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9718098640441895, |
| "reward_std": 0.01134553411975503, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9718098938465118, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 262 |
| }, |
| { |
| "completion_length": 248.875, |
| "epoch": 0.8416, |
| "grad_norm": 5.045698165893555, |
| "kl": 0.0552978515625, |
| "learning_rate": 6.724999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.799831986427307, |
| "reward_std": 0.03707320708781481, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9218086004257202, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.883705198764801, |
| "step": 263 |
| }, |
| { |
| "completion_length": 157.6875, |
| "epoch": 0.8448, |
| "grad_norm": 1.9603397846221924, |
| "kl": 0.14111328125, |
| "learning_rate": 6.7125e-07, |
| "loss": 0.0014, |
| "reward": 3.9334217309951782, |
| "reward_std": 0.00959050771780312, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9538573622703552, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.987897664308548, |
| "step": 264 |
| }, |
| { |
| "completion_length": 249.21875, |
| "epoch": 0.848, |
| "grad_norm": 1.720057725906372, |
| "kl": 0.102783203125, |
| "learning_rate": 6.7e-07, |
| "loss": 0.001, |
| "reward": 3.9404491186141968, |
| "reward_std": 0.023797483183443546, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9459458291530609, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997479915618896, |
| "step": 265 |
| }, |
| { |
| "completion_length": 200.65625, |
| "epoch": 0.8512, |
| "grad_norm": 1.7017474174499512, |
| "kl": 0.06640625, |
| "learning_rate": 6.6875e-07, |
| "loss": 0.0007, |
| "reward": 3.897473454475403, |
| "reward_std": 0.017802401445806026, |
| "rewards/answer_entity_reward": 0.9892628192901611, |
| "rewards/answer_wer_reward": 0.9560422301292419, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.952168345451355, |
| "step": 266 |
| }, |
| { |
| "completion_length": 206.9375, |
| "epoch": 0.8544, |
| "grad_norm": 1.7645119428634644, |
| "kl": 0.107177734375, |
| "learning_rate": 6.675e-07, |
| "loss": 0.0011, |
| "reward": 3.919585347175598, |
| "reward_std": 0.017358362209051847, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9206817746162415, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989035129547119, |
| "step": 267 |
| }, |
| { |
| "completion_length": 234.125, |
| "epoch": 0.8576, |
| "grad_norm": 2.324972629547119, |
| "kl": 0.07275390625, |
| "learning_rate": 6.6625e-07, |
| "loss": 0.0007, |
| "reward": 3.8366565704345703, |
| "reward_std": 0.03994511067867279, |
| "rewards/answer_entity_reward": 0.9375, |
| "rewards/answer_wer_reward": 0.9288243353366852, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9703322649002075, |
| "step": 268 |
| }, |
| { |
| "completion_length": 163.28125, |
| "epoch": 0.8608, |
| "grad_norm": 3.44211483001709, |
| "kl": 0.07080078125, |
| "learning_rate": 6.65e-07, |
| "loss": 0.0007, |
| "reward": 3.8973175287246704, |
| "reward_std": 0.051633019000291824, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9550660252571106, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9457237124443054, |
| "step": 269 |
| }, |
| { |
| "completion_length": 198.0625, |
| "epoch": 0.864, |
| "grad_norm": 5.092156887054443, |
| "kl": 0.072998046875, |
| "learning_rate": 6.637499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.940290689468384, |
| "reward_std": 0.009564612759277225, |
| "rewards/answer_entity_reward": 0.9821428656578064, |
| "rewards/answer_wer_reward": 0.958147794008255, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 270 |
| }, |
| { |
| "completion_length": 138.875, |
| "epoch": 0.8672, |
| "grad_norm": 3.998215913772583, |
| "kl": 0.05889892578125, |
| "learning_rate": 6.624999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9329700469970703, |
| "reward_std": 0.05405183229595423, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9581792652606964, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9782631099224091, |
| "step": 271 |
| }, |
| { |
| "completion_length": 208.53125, |
| "epoch": 0.8704, |
| "grad_norm": 2.191901206970215, |
| "kl": 0.06884765625, |
| "learning_rate": 6.6125e-07, |
| "loss": 0.0007, |
| "reward": 3.956714630126953, |
| "reward_std": 0.01909107668325305, |
| "rewards/answer_entity_reward": 0.993686854839325, |
| "rewards/answer_wer_reward": 0.9632268249988556, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9998009502887726, |
| "step": 272 |
| }, |
| { |
| "completion_length": 196.71875, |
| "epoch": 0.8736, |
| "grad_norm": 3.2068357467651367, |
| "kl": 0.0513916015625, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0005, |
| "reward": 3.9089767932891846, |
| "reward_std": 0.035889009945094585, |
| "rewards/answer_entity_reward": 0.9902777671813965, |
| "rewards/answer_wer_reward": 0.934887707233429, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9838112890720367, |
| "step": 273 |
| }, |
| { |
| "completion_length": 238.03125, |
| "epoch": 0.8768, |
| "grad_norm": 12.858990669250488, |
| "kl": 0.0513916015625, |
| "learning_rate": 6.587499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9507744312286377, |
| "reward_std": 0.012679634615778923, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9518805146217346, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988937973976135, |
| "step": 274 |
| }, |
| { |
| "completion_length": 215.03125, |
| "epoch": 0.88, |
| "grad_norm": 6.914164066314697, |
| "kl": 0.053466796875, |
| "learning_rate": 6.575e-07, |
| "loss": 0.0005, |
| "reward": 3.920554757118225, |
| "reward_std": 0.01066223531961441, |
| "rewards/answer_entity_reward": 0.9821428656578064, |
| "rewards/answer_wer_reward": 0.9384119212627411, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 275 |
| }, |
| { |
| "completion_length": 170.3125, |
| "epoch": 0.8832, |
| "grad_norm": 1.4424182176589966, |
| "kl": 0.0533447265625, |
| "learning_rate": 6.5625e-07, |
| "loss": 0.0005, |
| "reward": 3.8676129579544067, |
| "reward_std": 0.015859364066272974, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9279236793518066, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9396892189979553, |
| "step": 276 |
| }, |
| { |
| "completion_length": 203.0, |
| "epoch": 0.8864, |
| "grad_norm": 1.4304486513137817, |
| "kl": 0.040771484375, |
| "learning_rate": 6.55e-07, |
| "loss": 0.0004, |
| "reward": 3.9131808280944824, |
| "reward_std": 0.020121398381888866, |
| "rewards/answer_entity_reward": 0.9930555820465088, |
| "rewards/answer_wer_reward": 0.9201253056526184, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 277 |
| }, |
| { |
| "completion_length": 199.9375, |
| "epoch": 0.8896, |
| "grad_norm": 4.607363700866699, |
| "kl": 0.0810546875, |
| "learning_rate": 6.5375e-07, |
| "loss": 0.0008, |
| "reward": 3.9438611268997192, |
| "reward_std": 0.014630983117967844, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9560317695140839, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.989912748336792, |
| "step": 278 |
| }, |
| { |
| "completion_length": 215.75, |
| "epoch": 0.8928, |
| "grad_norm": 0.9500401020050049, |
| "kl": 0.0498046875, |
| "learning_rate": 6.524999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9393136501312256, |
| "reward_std": 0.010870016179978848, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9396113157272339, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997023940086365, |
| "step": 279 |
| }, |
| { |
| "completion_length": 211.4375, |
| "epoch": 0.896, |
| "grad_norm": 2.4634454250335693, |
| "kl": 0.08154296875, |
| "learning_rate": 6.5125e-07, |
| "loss": 0.0008, |
| "reward": 3.8559117317199707, |
| "reward_std": 0.020915272179991007, |
| "rewards/answer_entity_reward": 0.9944444298744202, |
| "rewards/answer_wer_reward": 0.9251176416873932, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9363496899604797, |
| "step": 280 |
| }, |
| { |
| "completion_length": 172.8125, |
| "epoch": 0.8992, |
| "grad_norm": 5.569718360900879, |
| "kl": 0.1357421875, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0014, |
| "reward": 3.87375545501709, |
| "reward_std": 0.04026831593364477, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9294662475585938, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9442892372608185, |
| "step": 281 |
| }, |
| { |
| "completion_length": 114.875, |
| "epoch": 0.9024, |
| "grad_norm": 4.26852560043335, |
| "kl": 0.053955078125, |
| "learning_rate": 6.4875e-07, |
| "loss": 0.0005, |
| "reward": 3.909887909889221, |
| "reward_std": 0.015241059940308332, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9791332483291626, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9335956573486328, |
| "step": 282 |
| }, |
| { |
| "completion_length": 245.0, |
| "epoch": 0.9056, |
| "grad_norm": 1.3898316621780396, |
| "kl": 0.0450439453125, |
| "learning_rate": 6.474999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9195964336395264, |
| "reward_std": 0.018749097362160683, |
| "rewards/answer_entity_reward": 0.9911437332630157, |
| "rewards/answer_wer_reward": 0.9284527003765106, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 283 |
| }, |
| { |
| "completion_length": 218.75, |
| "epoch": 0.9088, |
| "grad_norm": 4.705906391143799, |
| "kl": 0.0338134765625, |
| "learning_rate": 6.4625e-07, |
| "loss": 0.0003, |
| "reward": 3.9526829719543457, |
| "reward_std": 0.012810520827770233, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9526830613613129, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 284 |
| }, |
| { |
| "completion_length": 175.15625, |
| "epoch": 0.912, |
| "grad_norm": 1.7440683841705322, |
| "kl": 0.0616455078125, |
| "learning_rate": 6.45e-07, |
| "loss": 0.0006, |
| "reward": 3.9307706356048584, |
| "reward_std": 0.014890296617522836, |
| "rewards/answer_entity_reward": 0.9845238327980042, |
| "rewards/answer_wer_reward": 0.9668512642383575, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9793955981731415, |
| "step": 285 |
| }, |
| { |
| "completion_length": 154.3125, |
| "epoch": 0.9152, |
| "grad_norm": 2.3717188835144043, |
| "kl": 0.0599365234375, |
| "learning_rate": 6.4375e-07, |
| "loss": 0.0006, |
| "reward": 3.9156084060668945, |
| "reward_std": 0.013419507071375847, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.951806515455246, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9638019800186157, |
| "step": 286 |
| }, |
| { |
| "completion_length": 226.40625, |
| "epoch": 0.9184, |
| "grad_norm": 2.069488525390625, |
| "kl": 0.058349609375, |
| "learning_rate": 6.424999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.8257880210876465, |
| "reward_std": 0.023342549800872803, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9156993925571442, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9186112284660339, |
| "step": 287 |
| }, |
| { |
| "completion_length": 203.21875, |
| "epoch": 0.9216, |
| "grad_norm": 1.8522766828536987, |
| "kl": 0.0611572265625, |
| "learning_rate": 6.4125e-07, |
| "loss": 0.0006, |
| "reward": 3.9413124322891235, |
| "reward_std": 0.014133658958598971, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9447846114635468, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 288 |
| }, |
| { |
| "completion_length": 182.90625, |
| "epoch": 0.9248, |
| "grad_norm": 3.1601576805114746, |
| "kl": 0.0626220703125, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0006, |
| "reward": 3.934013605117798, |
| "reward_std": 0.020497526740655303, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9598910510540009, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.988011360168457, |
| "step": 289 |
| }, |
| { |
| "completion_length": 235.71875, |
| "epoch": 0.928, |
| "grad_norm": 1.5299009084701538, |
| "kl": 0.062744140625, |
| "learning_rate": 6.3875e-07, |
| "loss": 0.0006, |
| "reward": 3.900187373161316, |
| "reward_std": 0.027182841673493385, |
| "rewards/answer_entity_reward": 0.9859217405319214, |
| "rewards/answer_wer_reward": 0.9156533181667328, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986123144626617, |
| "step": 290 |
| }, |
| { |
| "completion_length": 181.59375, |
| "epoch": 0.9312, |
| "grad_norm": 2.8708431720733643, |
| "kl": 0.09375, |
| "learning_rate": 6.374999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.878863215446472, |
| "reward_std": 0.016461022198200226, |
| "rewards/answer_entity_reward": 0.9607954621315002, |
| "rewards/answer_wer_reward": 0.9469051957130432, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9711625277996063, |
| "step": 291 |
| }, |
| { |
| "completion_length": 252.71875, |
| "epoch": 0.9344, |
| "grad_norm": 1.3821316957473755, |
| "kl": 0.143798828125, |
| "learning_rate": 6.362499999999999e-07, |
| "loss": 0.0014, |
| "reward": 3.9444687366485596, |
| "reward_std": 0.015690275467932224, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9486355781555176, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 292 |
| }, |
| { |
| "completion_length": 191.5, |
| "epoch": 0.9376, |
| "grad_norm": 3.0700418949127197, |
| "kl": 0.08984375, |
| "learning_rate": 6.35e-07, |
| "loss": 0.0009, |
| "reward": 3.9288469552993774, |
| "reward_std": 0.025998966302722692, |
| "rewards/answer_entity_reward": 0.9910714626312256, |
| "rewards/answer_wer_reward": 0.9580896496772766, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.97968590259552, |
| "step": 293 |
| }, |
| { |
| "completion_length": 236.40625, |
| "epoch": 0.9408, |
| "grad_norm": 0.9392086863517761, |
| "kl": 0.0728759765625, |
| "learning_rate": 6.3375e-07, |
| "loss": 0.0007, |
| "reward": 3.9576098918914795, |
| "reward_std": 0.004891619086265564, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9576099216938019, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 294 |
| }, |
| { |
| "completion_length": 204.125, |
| "epoch": 0.944, |
| "grad_norm": 1.4554882049560547, |
| "kl": 0.044677734375, |
| "learning_rate": 6.324999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9175373315811157, |
| "reward_std": 0.008688606787472963, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9530804753303528, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9644568264484406, |
| "step": 295 |
| }, |
| { |
| "completion_length": 230.8125, |
| "epoch": 0.9472, |
| "grad_norm": 0.7801051139831543, |
| "kl": 0.0537109375, |
| "learning_rate": 6.3125e-07, |
| "loss": 0.0005, |
| "reward": 3.941986918449402, |
| "reward_std": 0.011714181862771511, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9448631405830383, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987684786319733, |
| "step": 296 |
| }, |
| { |
| "completion_length": 201.6875, |
| "epoch": 0.9504, |
| "grad_norm": 3.2697925567626953, |
| "kl": 0.0723876953125, |
| "learning_rate": 6.3e-07, |
| "loss": 0.0007, |
| "reward": 3.9148101806640625, |
| "reward_std": 0.02096148394048214, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9371316432952881, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9821428656578064, |
| "step": 297 |
| }, |
| { |
| "completion_length": 174.71875, |
| "epoch": 0.9536, |
| "grad_norm": 1.3895010948181152, |
| "kl": 0.072509765625, |
| "learning_rate": 6.2875e-07, |
| "loss": 0.0007, |
| "reward": 3.9413623809814453, |
| "reward_std": 0.012068473850376904, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.96162348985672, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9821428656578064, |
| "step": 298 |
| }, |
| { |
| "completion_length": 226.53125, |
| "epoch": 0.9568, |
| "grad_norm": 0.9915501475334167, |
| "kl": 0.0574951171875, |
| "learning_rate": 6.274999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9342339038848877, |
| "reward_std": 0.017138528637588024, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9342339336872101, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 299 |
| }, |
| { |
| "completion_length": 185.96875, |
| "epoch": 0.96, |
| "grad_norm": 2.181473970413208, |
| "kl": 0.0693359375, |
| "learning_rate": 6.262499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.8075177669525146, |
| "reward_std": 0.008563205134123564, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.974321037530899, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8331968486309052, |
| "step": 300 |
| }, |
| { |
| "completion_length": 259.4375, |
| "epoch": 0.9632, |
| "grad_norm": 0.8825593590736389, |
| "kl": 0.053955078125, |
| "learning_rate": 6.249999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9282361268997192, |
| "reward_std": 0.01493215560913086, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9290694296360016, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991666674613953, |
| "step": 301 |
| }, |
| { |
| "completion_length": 233.4375, |
| "epoch": 0.9664, |
| "grad_norm": 2.377093553543091, |
| "kl": 0.08251953125, |
| "learning_rate": 6.2375e-07, |
| "loss": 0.0008, |
| "reward": 3.8652896881103516, |
| "reward_std": 0.04854640178382397, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.931235283613205, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9392991065979004, |
| "step": 302 |
| }, |
| { |
| "completion_length": 214.9375, |
| "epoch": 0.9696, |
| "grad_norm": 2.7887818813323975, |
| "kl": 0.0765380859375, |
| "learning_rate": 6.225000000000001e-07, |
| "loss": 0.0008, |
| "reward": 3.916442394256592, |
| "reward_std": 0.014312040992081165, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9577742516994476, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9586680829524994, |
| "step": 303 |
| }, |
| { |
| "completion_length": 195.53125, |
| "epoch": 0.9728, |
| "grad_norm": 1.3930556774139404, |
| "kl": 0.0662841796875, |
| "learning_rate": 6.2125e-07, |
| "loss": 0.0007, |
| "reward": 3.8324824571609497, |
| "reward_std": 0.013787610223516822, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9709192514419556, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8672450482845306, |
| "step": 304 |
| }, |
| { |
| "completion_length": 221.65625, |
| "epoch": 0.976, |
| "grad_norm": 1.6060283184051514, |
| "kl": 0.046875, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0005, |
| "reward": 3.9341059923171997, |
| "reward_std": 0.016552825924009085, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9438435733318329, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9902624487876892, |
| "step": 305 |
| }, |
| { |
| "completion_length": 274.4375, |
| "epoch": 0.9792, |
| "grad_norm": 2.2774875164031982, |
| "kl": 0.0582275390625, |
| "learning_rate": 6.1875e-07, |
| "loss": 0.0006, |
| "reward": 3.8809224367141724, |
| "reward_std": 0.03468186687678099, |
| "rewards/answer_entity_reward": 0.9755851626396179, |
| "rewards/answer_wer_reward": 0.9063642621040344, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989729523658752, |
| "step": 306 |
| }, |
| { |
| "completion_length": 243.875, |
| "epoch": 0.9824, |
| "grad_norm": 1.4776897430419922, |
| "kl": 0.0865478515625, |
| "learning_rate": 6.175e-07, |
| "loss": 0.0009, |
| "reward": 3.921198606491089, |
| "reward_std": 0.029711266048252583, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9262239336967468, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984469413757324, |
| "step": 307 |
| }, |
| { |
| "completion_length": 230.6875, |
| "epoch": 0.9856, |
| "grad_norm": 0.8870422840118408, |
| "kl": 0.0528564453125, |
| "learning_rate": 6.162499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9468624591827393, |
| "reward_std": 0.010126703884452581, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9468623399734497, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 308 |
| }, |
| { |
| "completion_length": 193.53125, |
| "epoch": 0.9888, |
| "grad_norm": 1.2648320198059082, |
| "kl": 0.0474853515625, |
| "learning_rate": 6.149999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9692437648773193, |
| "reward_std": 0.010907594813033938, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9716475903987885, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 309 |
| }, |
| { |
| "completion_length": 226.84375, |
| "epoch": 0.992, |
| "grad_norm": 2.5334410667419434, |
| "kl": 0.099609375, |
| "learning_rate": 6.1375e-07, |
| "loss": 0.001, |
| "reward": 3.932776689529419, |
| "reward_std": 0.025886752177029848, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9474222362041473, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9916044771671295, |
| "step": 310 |
| }, |
| { |
| "completion_length": 202.40625, |
| "epoch": 0.9952, |
| "grad_norm": 1.6191986799240112, |
| "kl": 0.059326171875, |
| "learning_rate": 6.125000000000001e-07, |
| "loss": 0.0006, |
| "reward": 3.923641085624695, |
| "reward_std": 0.016786989755928516, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9264820218086243, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 311 |
| }, |
| { |
| "completion_length": 226.125, |
| "epoch": 0.9984, |
| "grad_norm": 2.3516252040863037, |
| "kl": 0.0587158203125, |
| "learning_rate": 6.1125e-07, |
| "loss": 0.0006, |
| "reward": 3.822533130645752, |
| "reward_std": 0.19381592608988285, |
| "rewards/answer_entity_reward": 0.9630681872367859, |
| "rewards/answer_wer_reward": 0.8977905511856079, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9929245114326477, |
| "step": 312 |
| }, |
| { |
| "completion_length": 164.4375, |
| "epoch": 1.0, |
| "grad_norm": 9.48376178741455, |
| "kl": 0.04345703125, |
| "learning_rate": 6.1e-07, |
| "loss": 0.0002, |
| "reward": 3.9722466468811035, |
| "reward_std": 0.021218769252300262, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9880585074424744, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980769157409668, |
| "step": 313 |
| }, |
| { |
| "completion_length": 194.0625, |
| "epoch": 1.0032, |
| "grad_norm": 1.5969237089157104, |
| "kl": 0.0419921875, |
| "learning_rate": 6.0875e-07, |
| "loss": 0.0004, |
| "reward": 3.9741499423980713, |
| "reward_std": 0.00955872773192823, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9776757061481476, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985576868057251, |
| "step": 314 |
| }, |
| { |
| "completion_length": 174.25, |
| "epoch": 1.0064, |
| "grad_norm": 5.0026326179504395, |
| "kl": 0.07470703125, |
| "learning_rate": 6.075e-07, |
| "loss": 0.0007, |
| "reward": 3.9532389640808105, |
| "reward_std": 0.01782281370833516, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9582388997077942, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9950000047683716, |
| "step": 315 |
| }, |
| { |
| "completion_length": 218.3125, |
| "epoch": 1.0096, |
| "grad_norm": 1.521260142326355, |
| "kl": 0.072509765625, |
| "learning_rate": 6.062499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.891371011734009, |
| "reward_std": 0.037183830980211496, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9465020596981049, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9496767222881317, |
| "step": 316 |
| }, |
| { |
| "completion_length": 181.21875, |
| "epoch": 1.0128, |
| "grad_norm": 2.444070339202881, |
| "kl": 0.1011962890625, |
| "learning_rate": 6.049999999999999e-07, |
| "loss": 0.001, |
| "reward": 3.957024097442627, |
| "reward_std": 0.015732225496321917, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9627059102058411, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 317 |
| }, |
| { |
| "completion_length": 214.8125, |
| "epoch": 1.016, |
| "grad_norm": 5.038032054901123, |
| "kl": 0.081298828125, |
| "learning_rate": 6.037499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.905093193054199, |
| "reward_std": 0.02073481073603034, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9350383579730988, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.97005495429039, |
| "step": 318 |
| }, |
| { |
| "completion_length": 209.8125, |
| "epoch": 1.0192, |
| "grad_norm": 3.9700140953063965, |
| "kl": 0.07373046875, |
| "learning_rate": 6.025000000000001e-07, |
| "loss": 0.0007, |
| "reward": 3.8465429544448853, |
| "reward_std": 0.044920976273715496, |
| "rewards/answer_entity_reward": 0.953125, |
| "rewards/answer_wer_reward": 0.935539960861206, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9578781127929688, |
| "step": 319 |
| }, |
| { |
| "completion_length": 242.8125, |
| "epoch": 1.0224, |
| "grad_norm": 1.1018257141113281, |
| "kl": 0.0404052734375, |
| "learning_rate": 6.0125e-07, |
| "loss": 0.0004, |
| "reward": 3.9351298809051514, |
| "reward_std": 0.00889231264591217, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9503234028816223, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986952543258667, |
| "step": 320 |
| }, |
| { |
| "completion_length": 178.65625, |
| "epoch": 1.0256, |
| "grad_norm": 1.2945948839187622, |
| "kl": 0.059326171875, |
| "learning_rate": 6e-07, |
| "loss": 0.0006, |
| "reward": 3.9444717168807983, |
| "reward_std": 0.010739851742982864, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9468754827976227, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 321 |
| }, |
| { |
| "completion_length": 158.75, |
| "epoch": 1.0288, |
| "grad_norm": 1.9997080564498901, |
| "kl": 0.10498046875, |
| "learning_rate": 5.9875e-07, |
| "loss": 0.001, |
| "reward": 3.8997615575790405, |
| "reward_std": 0.0878201499581337, |
| "rewards/answer_entity_reward": 0.9768981039524078, |
| "rewards/answer_wer_reward": 0.9317395091056824, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9911239445209503, |
| "step": 322 |
| }, |
| { |
| "completion_length": 202.78125, |
| "epoch": 1.032, |
| "grad_norm": 2.5343425273895264, |
| "kl": 0.047119140625, |
| "learning_rate": 5.975e-07, |
| "loss": 0.0005, |
| "reward": 3.9625836610794067, |
| "reward_std": 0.0073791013564914465, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9652430713176727, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973404407501221, |
| "step": 323 |
| }, |
| { |
| "completion_length": 181.9375, |
| "epoch": 1.0352, |
| "grad_norm": 7.240401744842529, |
| "kl": 0.067138671875, |
| "learning_rate": 5.962499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.828685760498047, |
| "reward_std": 0.04627671558409929, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.951274037361145, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.882219523191452, |
| "step": 324 |
| }, |
| { |
| "completion_length": 209.75, |
| "epoch": 1.0384, |
| "grad_norm": 2.1784214973449707, |
| "kl": 0.0810546875, |
| "learning_rate": 5.949999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.9578659534454346, |
| "reward_std": 0.015447806101292372, |
| "rewards/answer_entity_reward": 0.9947552382946014, |
| "rewards/answer_wer_reward": 0.9634187519550323, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996921122074127, |
| "step": 325 |
| }, |
| { |
| "completion_length": 200.78125, |
| "epoch": 1.0416, |
| "grad_norm": 1.8993250131607056, |
| "kl": 0.086669921875, |
| "learning_rate": 5.937499999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9622350931167603, |
| "reward_std": 0.011172362137585878, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9622350335121155, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 326 |
| }, |
| { |
| "completion_length": 188.0625, |
| "epoch": 1.0448, |
| "grad_norm": 2.999244213104248, |
| "kl": 0.04931640625, |
| "learning_rate": 5.925e-07, |
| "loss": 0.0005, |
| "reward": 3.8658429384231567, |
| "reward_std": 0.027352653443813324, |
| "rewards/answer_entity_reward": 0.9859203398227692, |
| "rewards/answer_wer_reward": 0.9490468800067902, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9308757185935974, |
| "step": 327 |
| }, |
| { |
| "completion_length": 211.6875, |
| "epoch": 1.048, |
| "grad_norm": 1.4307529926300049, |
| "kl": 0.06982421875, |
| "learning_rate": 5.912500000000001e-07, |
| "loss": 0.0007, |
| "reward": 3.8813902139663696, |
| "reward_std": 0.015089725144207478, |
| "rewards/answer_entity_reward": 0.9800595343112946, |
| "rewards/answer_wer_reward": 0.9558005034923553, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9455301761627197, |
| "step": 328 |
| }, |
| { |
| "completion_length": 184.1875, |
| "epoch": 1.0512, |
| "grad_norm": 1.9804878234863281, |
| "kl": 0.03851318359375, |
| "learning_rate": 5.9e-07, |
| "loss": 0.0004, |
| "reward": 3.9403220415115356, |
| "reward_std": 0.025673750409623608, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.94679394364357, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994041919708252, |
| "step": 329 |
| }, |
| { |
| "completion_length": 200.71875, |
| "epoch": 1.0544, |
| "grad_norm": 1.5184144973754883, |
| "kl": 0.06689453125, |
| "learning_rate": 5.8875e-07, |
| "loss": 0.0007, |
| "reward": 3.945325493812561, |
| "reward_std": 0.021944692358374596, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.951007217168808, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 330 |
| }, |
| { |
| "completion_length": 211.875, |
| "epoch": 1.0576, |
| "grad_norm": 1.228079915046692, |
| "kl": 0.052978515625, |
| "learning_rate": 5.875e-07, |
| "loss": 0.0005, |
| "reward": 3.9120590686798096, |
| "reward_std": 0.015080507844686508, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.912059098482132, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 331 |
| }, |
| { |
| "completion_length": 240.5625, |
| "epoch": 1.0608, |
| "grad_norm": 1.7073534727096558, |
| "kl": 0.1005859375, |
| "learning_rate": 5.8625e-07, |
| "loss": 0.001, |
| "reward": 3.943448066711426, |
| "reward_std": 0.010788221377879381, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9437373280525208, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997106492519379, |
| "step": 332 |
| }, |
| { |
| "completion_length": 217.78125, |
| "epoch": 1.064, |
| "grad_norm": 1.9268385171890259, |
| "kl": 0.0440673828125, |
| "learning_rate": 5.849999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9603058099746704, |
| "reward_std": 0.009590512840077281, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9625644087791443, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977414906024933, |
| "step": 333 |
| }, |
| { |
| "completion_length": 188.125, |
| "epoch": 1.0672, |
| "grad_norm": 0.780636727809906, |
| "kl": 0.04638671875, |
| "learning_rate": 5.837499999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.949649691581726, |
| "reward_std": 0.0076717507326975465, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9496497213840485, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 334 |
| }, |
| { |
| "completion_length": 240.71875, |
| "epoch": 1.0704, |
| "grad_norm": 21.118270874023438, |
| "kl": 0.04296875, |
| "learning_rate": 5.825e-07, |
| "loss": 0.0004, |
| "reward": 3.968227982521057, |
| "reward_std": 0.01375247398391366, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9715853631496429, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996642529964447, |
| "step": 335 |
| }, |
| { |
| "completion_length": 251.21875, |
| "epoch": 1.0735999999999999, |
| "grad_norm": 1.0980618000030518, |
| "kl": 0.0467529296875, |
| "learning_rate": 5.8125e-07, |
| "loss": 0.0005, |
| "reward": 3.9321502447128296, |
| "reward_std": 0.02487938292324543, |
| "rewards/answer_entity_reward": 0.987500011920929, |
| "rewards/answer_wer_reward": 0.945962131023407, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986882209777832, |
| "step": 336 |
| }, |
| { |
| "completion_length": 191.0, |
| "epoch": 1.0768, |
| "grad_norm": 1.9901342391967773, |
| "kl": 0.1015625, |
| "learning_rate": 5.8e-07, |
| "loss": 0.001, |
| "reward": 3.860186219215393, |
| "reward_std": 0.008080802159383893, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9668596386909485, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8933265209197998, |
| "step": 337 |
| }, |
| { |
| "completion_length": 222.40625, |
| "epoch": 1.08, |
| "grad_norm": 1.9760770797729492, |
| "kl": 0.0791015625, |
| "learning_rate": 5.7875e-07, |
| "loss": 0.0008, |
| "reward": 3.943527340888977, |
| "reward_std": 0.013376505114138126, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.950738936662674, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 338 |
| }, |
| { |
| "completion_length": 242.75, |
| "epoch": 1.0832, |
| "grad_norm": 1.4690314531326294, |
| "kl": 0.0699462890625, |
| "learning_rate": 5.775e-07, |
| "loss": 0.0007, |
| "reward": 3.946296215057373, |
| "reward_std": 0.010936432983726263, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.946296215057373, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 339 |
| }, |
| { |
| "completion_length": 213.75, |
| "epoch": 1.0864, |
| "grad_norm": 1.3006911277770996, |
| "kl": 0.068603515625, |
| "learning_rate": 5.7625e-07, |
| "loss": 0.0007, |
| "reward": 3.929935932159424, |
| "reward_std": 0.012226814404129982, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9303079545497894, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996279776096344, |
| "step": 340 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 1.0896, |
| "grad_norm": 20.699094772338867, |
| "kl": 0.0606689453125, |
| "learning_rate": 5.749999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.839663863182068, |
| "reward_std": 0.2153539047576487, |
| "rewards/answer_entity_reward": 0.9632352888584137, |
| "rewards/answer_wer_reward": 0.9303349256515503, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.977343738079071, |
| "step": 341 |
| }, |
| { |
| "completion_length": 229.9375, |
| "epoch": 1.0928, |
| "grad_norm": 10.713321685791016, |
| "kl": 0.062255859375, |
| "learning_rate": 5.737499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.952810525894165, |
| "reward_std": 0.013096342328935862, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9535458087921143, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992647171020508, |
| "step": 342 |
| }, |
| { |
| "completion_length": 226.0625, |
| "epoch": 1.096, |
| "grad_norm": 5.412719249725342, |
| "kl": 0.068115234375, |
| "learning_rate": 5.725e-07, |
| "loss": 0.0007, |
| "reward": 3.9290108680725098, |
| "reward_std": 0.014630899764597416, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9352608323097229, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9937500059604645, |
| "step": 343 |
| }, |
| { |
| "completion_length": 180.875, |
| "epoch": 1.0992, |
| "grad_norm": 1.5433329343795776, |
| "kl": 0.046875, |
| "learning_rate": 5.7125e-07, |
| "loss": 0.0005, |
| "reward": 3.9217172861099243, |
| "reward_std": 0.007004068233072758, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9350151419639587, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9867021441459656, |
| "step": 344 |
| }, |
| { |
| "completion_length": 228.5625, |
| "epoch": 1.1024, |
| "grad_norm": 1.6970151662826538, |
| "kl": 0.058837890625, |
| "learning_rate": 5.699999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9185184240341187, |
| "reward_std": 0.013168168719857931, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9197319746017456, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987863898277283, |
| "step": 345 |
| }, |
| { |
| "completion_length": 155.34375, |
| "epoch": 1.1056, |
| "grad_norm": 1.7489057779312134, |
| "kl": 0.0869140625, |
| "learning_rate": 5.6875e-07, |
| "loss": 0.0009, |
| "reward": 3.9059561491012573, |
| "reward_std": 0.00622332957573235, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9627758860588074, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9431802928447723, |
| "step": 346 |
| }, |
| { |
| "completion_length": 173.40625, |
| "epoch": 1.1088, |
| "grad_norm": 1.3873649835586548, |
| "kl": 0.09033203125, |
| "learning_rate": 5.675e-07, |
| "loss": 0.0009, |
| "reward": 3.9297943115234375, |
| "reward_std": 0.039116960018873215, |
| "rewards/answer_entity_reward": 0.9826389253139496, |
| "rewards/answer_wer_reward": 0.9575237333774567, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9896316528320312, |
| "step": 347 |
| }, |
| { |
| "completion_length": 210.3125, |
| "epoch": 1.112, |
| "grad_norm": 3.549527645111084, |
| "kl": 0.0986328125, |
| "learning_rate": 5.6625e-07, |
| "loss": 0.001, |
| "reward": 3.9249199628829956, |
| "reward_std": 0.019829558208584785, |
| "rewards/answer_entity_reward": 0.9842728972434998, |
| "rewards/answer_wer_reward": 0.9483617842197418, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9922854006290436, |
| "step": 348 |
| }, |
| { |
| "completion_length": 210.21875, |
| "epoch": 1.1152, |
| "grad_norm": 1.7917331457138062, |
| "kl": 0.0712890625, |
| "learning_rate": 5.649999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9333280324935913, |
| "reward_std": 0.011767172254621983, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9333280622959137, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 349 |
| }, |
| { |
| "completion_length": 220.1875, |
| "epoch": 1.1184, |
| "grad_norm": 0.8690351247787476, |
| "kl": 0.069580078125, |
| "learning_rate": 5.637499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9331865310668945, |
| "reward_std": 0.008595036342740059, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9419363439083099, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9912500977516174, |
| "step": 350 |
| }, |
| { |
| "completion_length": 192.65625, |
| "epoch": 1.1216, |
| "grad_norm": 1.7662582397460938, |
| "kl": 0.076171875, |
| "learning_rate": 5.625e-07, |
| "loss": 0.0008, |
| "reward": 3.950869083404541, |
| "reward_std": 0.020245986990630627, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.951172411441803, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996966123580933, |
| "step": 351 |
| }, |
| { |
| "completion_length": 264.25, |
| "epoch": 1.1248, |
| "grad_norm": 6.877583026885986, |
| "kl": 0.0867919921875, |
| "learning_rate": 5.6125e-07, |
| "loss": 0.0009, |
| "reward": 3.9451229572296143, |
| "reward_std": 0.017284557223320007, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.946128636598587, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989943504333496, |
| "step": 352 |
| }, |
| { |
| "completion_length": 218.4375, |
| "epoch": 1.1280000000000001, |
| "grad_norm": 1.853745460510254, |
| "kl": 0.058837890625, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0006, |
| "reward": 3.9474722146987915, |
| "reward_std": 0.01703261397778988, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9519364535808563, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 353 |
| }, |
| { |
| "completion_length": 229.9375, |
| "epoch": 1.1312, |
| "grad_norm": 7.013837814331055, |
| "kl": 0.079345703125, |
| "learning_rate": 5.587499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.928715705871582, |
| "reward_std": 0.024107711389660835, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9372670352458954, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9938524663448334, |
| "step": 354 |
| }, |
| { |
| "completion_length": 238.09375, |
| "epoch": 1.1344, |
| "grad_norm": 1.8181698322296143, |
| "kl": 0.0587158203125, |
| "learning_rate": 5.575e-07, |
| "loss": 0.0006, |
| "reward": 3.9445427656173706, |
| "reward_std": 0.028678019531071186, |
| "rewards/answer_entity_reward": 0.9851190447807312, |
| "rewards/answer_wer_reward": 0.9630020260810852, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996421754360199, |
| "step": 355 |
| }, |
| { |
| "completion_length": 199.46875, |
| "epoch": 1.1376, |
| "grad_norm": 17.45456314086914, |
| "kl": 0.44140625, |
| "learning_rate": 5.5625e-07, |
| "loss": 0.0044, |
| "reward": 3.793405294418335, |
| "reward_std": 0.09584336914122105, |
| "rewards/answer_entity_reward": 0.9953208565711975, |
| "rewards/answer_wer_reward": 0.9546021223068237, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.843482255935669, |
| "step": 356 |
| }, |
| { |
| "completion_length": 234.9375, |
| "epoch": 1.1408, |
| "grad_norm": 1.5193853378295898, |
| "kl": 0.056396484375, |
| "learning_rate": 5.55e-07, |
| "loss": 0.0006, |
| "reward": 3.9331583976745605, |
| "reward_std": 0.01793505996465683, |
| "rewards/answer_entity_reward": 0.9901185929775238, |
| "rewards/answer_wer_reward": 0.9450170993804932, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980226159095764, |
| "step": 357 |
| }, |
| { |
| "completion_length": 225.21875, |
| "epoch": 1.144, |
| "grad_norm": 0.7461761236190796, |
| "kl": 0.050048828125, |
| "learning_rate": 5.5375e-07, |
| "loss": 0.0005, |
| "reward": 3.9532158374786377, |
| "reward_std": 0.013632898684591055, |
| "rewards/answer_entity_reward": 0.9930555522441864, |
| "rewards/answer_wer_reward": 0.9601602554321289, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 358 |
| }, |
| { |
| "completion_length": 196.21875, |
| "epoch": 1.1472, |
| "grad_norm": 1.688063621520996, |
| "kl": 0.0589599609375, |
| "learning_rate": 5.525e-07, |
| "loss": 0.0006, |
| "reward": 3.957648277282715, |
| "reward_std": 0.009953869972378016, |
| "rewards/answer_entity_reward": 0.9892857074737549, |
| "rewards/answer_wer_reward": 0.9689917266368866, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993708431720734, |
| "step": 359 |
| }, |
| { |
| "completion_length": 230.875, |
| "epoch": 1.1504, |
| "grad_norm": 1.0592241287231445, |
| "kl": 0.057861328125, |
| "learning_rate": 5.5125e-07, |
| "loss": 0.0006, |
| "reward": 3.9605822563171387, |
| "reward_std": 0.00902467966079712, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.961335301399231, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992469847202301, |
| "step": 360 |
| }, |
| { |
| "completion_length": 177.25, |
| "epoch": 1.1536, |
| "grad_norm": 0.887911856174469, |
| "kl": 0.0631103515625, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0006, |
| "reward": 3.9682934284210205, |
| "reward_std": 0.004935940261930227, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9682934284210205, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 361 |
| }, |
| { |
| "completion_length": 204.09375, |
| "epoch": 1.1568, |
| "grad_norm": 1.4796991348266602, |
| "kl": 0.0721435546875, |
| "learning_rate": 5.487499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.967429041862488, |
| "reward_std": 0.004718436859548092, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.967721164226532, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997079372406006, |
| "step": 362 |
| }, |
| { |
| "completion_length": 201.90625, |
| "epoch": 1.16, |
| "grad_norm": 1.349228858947754, |
| "kl": 0.0635986328125, |
| "learning_rate": 5.474999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.968218684196472, |
| "reward_std": 0.004579245578497648, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9686298072338104, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999588817358017, |
| "step": 363 |
| }, |
| { |
| "completion_length": 222.25, |
| "epoch": 1.1632, |
| "grad_norm": 8.183592796325684, |
| "kl": 0.7177734375, |
| "learning_rate": 5.4625e-07, |
| "loss": 0.0072, |
| "reward": 3.8565011024475098, |
| "reward_std": 0.14647854026407003, |
| "rewards/answer_entity_reward": 0.9628739356994629, |
| "rewards/answer_wer_reward": 0.897028774023056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9965982735157013, |
| "step": 364 |
| }, |
| { |
| "completion_length": 203.875, |
| "epoch": 1.1663999999999999, |
| "grad_norm": 2.1804592609405518, |
| "kl": 0.07666015625, |
| "learning_rate": 5.45e-07, |
| "loss": 0.0008, |
| "reward": 3.9330880641937256, |
| "reward_std": 0.023633791133761406, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9594465494155884, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9808530211448669, |
| "step": 365 |
| }, |
| { |
| "completion_length": 187.53125, |
| "epoch": 1.1696, |
| "grad_norm": 0.952870786190033, |
| "kl": 0.068603515625, |
| "learning_rate": 5.4375e-07, |
| "loss": 0.0007, |
| "reward": 3.906123399734497, |
| "reward_std": 0.02216299483552575, |
| "rewards/answer_entity_reward": 0.9882478415966034, |
| "rewards/answer_wer_reward": 0.9373133480548859, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9805622696876526, |
| "step": 366 |
| }, |
| { |
| "completion_length": 180.28125, |
| "epoch": 1.1728, |
| "grad_norm": 1.6601589918136597, |
| "kl": 0.069091796875, |
| "learning_rate": 5.425e-07, |
| "loss": 0.0007, |
| "reward": 3.9451587200164795, |
| "reward_std": 0.01368240499868989, |
| "rewards/answer_entity_reward": 0.9923513829708099, |
| "rewards/answer_wer_reward": 0.9530614018440247, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997459352016449, |
| "step": 367 |
| }, |
| { |
| "completion_length": 207.5625, |
| "epoch": 1.176, |
| "grad_norm": 2.0661466121673584, |
| "kl": 0.142578125, |
| "learning_rate": 5.4125e-07, |
| "loss": 0.0014, |
| "reward": 3.9405598640441895, |
| "reward_std": 0.009340570773929358, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9443033933639526, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962564706802368, |
| "step": 368 |
| }, |
| { |
| "completion_length": 193.4375, |
| "epoch": 1.1792, |
| "grad_norm": 2.3376078605651855, |
| "kl": 0.0548095703125, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0005, |
| "reward": 3.9724533557891846, |
| "reward_std": 0.007678399793803692, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9739435911178589, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985099136829376, |
| "step": 369 |
| }, |
| { |
| "completion_length": 244.9375, |
| "epoch": 1.1824, |
| "grad_norm": 8.994063377380371, |
| "kl": 0.067138671875, |
| "learning_rate": 5.387499999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.8642784357070923, |
| "reward_std": 0.015206838492304087, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9453278481960297, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9217914342880249, |
| "step": 370 |
| }, |
| { |
| "completion_length": 223.5, |
| "epoch": 1.1856, |
| "grad_norm": 0.7140876054763794, |
| "kl": 0.0628662109375, |
| "learning_rate": 5.374999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9566755294799805, |
| "reward_std": 0.008438330609351397, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9571858644485474, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994895756244659, |
| "step": 371 |
| }, |
| { |
| "completion_length": 236.09375, |
| "epoch": 1.1888, |
| "grad_norm": 5.422008514404297, |
| "kl": 0.072021484375, |
| "learning_rate": 5.3625e-07, |
| "loss": 0.0007, |
| "reward": 3.9092832803726196, |
| "reward_std": 0.02735153865069151, |
| "rewards/answer_entity_reward": 0.9869465231895447, |
| "rewards/answer_wer_reward": 0.9258767068386078, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9964599907398224, |
| "step": 372 |
| }, |
| { |
| "completion_length": 215.90625, |
| "epoch": 1.192, |
| "grad_norm": 2.5449435710906982, |
| "kl": 0.0655517578125, |
| "learning_rate": 5.35e-07, |
| "loss": 0.0007, |
| "reward": 3.8726375102996826, |
| "reward_std": 0.15768051333725452, |
| "rewards/answer_entity_reward": 0.991346150636673, |
| "rewards/answer_wer_reward": 0.9473030865192413, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9652382135391235, |
| "step": 373 |
| }, |
| { |
| "completion_length": 221.09375, |
| "epoch": 1.1952, |
| "grad_norm": 1.3450181484222412, |
| "kl": 0.0499267578125, |
| "learning_rate": 5.3375e-07, |
| "loss": 0.0005, |
| "reward": 3.945889711380005, |
| "reward_std": 0.021359253441914916, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9733871817588806, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9725023210048676, |
| "step": 374 |
| }, |
| { |
| "completion_length": 208.03125, |
| "epoch": 1.1984, |
| "grad_norm": 1.1699227094650269, |
| "kl": 0.067626953125, |
| "learning_rate": 5.325e-07, |
| "loss": 0.0007, |
| "reward": 3.951171040534973, |
| "reward_std": 0.008666176348924637, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9543131291866302, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992617964744568, |
| "step": 375 |
| }, |
| { |
| "completion_length": 253.28125, |
| "epoch": 1.2016, |
| "grad_norm": 2.287163496017456, |
| "kl": 0.0572509765625, |
| "learning_rate": 5.3125e-07, |
| "loss": 0.0006, |
| "reward": 3.9154282808303833, |
| "reward_std": 0.04354940680786967, |
| "rewards/answer_entity_reward": 0.9888257682323456, |
| "rewards/answer_wer_reward": 0.9271413683891296, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994612038135529, |
| "step": 376 |
| }, |
| { |
| "completion_length": 187.21875, |
| "epoch": 1.2048, |
| "grad_norm": 1.3305357694625854, |
| "kl": 0.046142578125, |
| "learning_rate": 5.3e-07, |
| "loss": 0.0005, |
| "reward": 3.9359636306762695, |
| "reward_std": 0.00542741478420794, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9541498124599457, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.981813907623291, |
| "step": 377 |
| }, |
| { |
| "completion_length": 224.125, |
| "epoch": 1.208, |
| "grad_norm": 10.12941837310791, |
| "kl": 0.06201171875, |
| "learning_rate": 5.2875e-07, |
| "loss": 0.0006, |
| "reward": 3.9541337490081787, |
| "reward_std": 0.013694523833692074, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9624313712120056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9917024075984955, |
| "step": 378 |
| }, |
| { |
| "completion_length": 158.96875, |
| "epoch": 1.2112, |
| "grad_norm": 1.3805967569351196, |
| "kl": 0.05859375, |
| "learning_rate": 5.274999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.947017788887024, |
| "reward_std": 0.02097574481740594, |
| "rewards/answer_entity_reward": 0.9902146458625793, |
| "rewards/answer_wer_reward": 0.961486428976059, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9953167736530304, |
| "step": 379 |
| }, |
| { |
| "completion_length": 250.40625, |
| "epoch": 1.2144, |
| "grad_norm": 1.2120996713638306, |
| "kl": 0.044921875, |
| "learning_rate": 5.262499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.918868899345398, |
| "reward_std": 0.021801823284476995, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9251189529895782, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 380 |
| }, |
| { |
| "completion_length": 211.34375, |
| "epoch": 1.2176, |
| "grad_norm": 2.19063138961792, |
| "kl": 0.078369140625, |
| "learning_rate": 5.25e-07, |
| "loss": 0.0008, |
| "reward": 3.8982889652252197, |
| "reward_std": 0.02524574287235737, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9512019455432892, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.947086900472641, |
| "step": 381 |
| }, |
| { |
| "completion_length": 241.28125, |
| "epoch": 1.2208, |
| "grad_norm": 1.619989275932312, |
| "kl": 0.05615234375, |
| "learning_rate": 5.237500000000001e-07, |
| "loss": 0.0006, |
| "reward": 3.9471057653427124, |
| "reward_std": 0.013869246933609247, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9526203572750092, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 382 |
| }, |
| { |
| "completion_length": 244.875, |
| "epoch": 1.224, |
| "grad_norm": 0.8697032928466797, |
| "kl": 0.061279296875, |
| "learning_rate": 5.225e-07, |
| "loss": 0.0006, |
| "reward": 3.9235615730285645, |
| "reward_std": 0.015196615364402533, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9275480508804321, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998417317867279, |
| "step": 383 |
| }, |
| { |
| "completion_length": 191.875, |
| "epoch": 1.2272, |
| "grad_norm": 5.2052154541015625, |
| "kl": 0.06884765625, |
| "learning_rate": 5.2125e-07, |
| "loss": 0.0007, |
| "reward": 3.934178948402405, |
| "reward_std": 0.024661258328706026, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9814408719539642, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9527381658554077, |
| "step": 384 |
| }, |
| { |
| "completion_length": 218.15625, |
| "epoch": 1.2304, |
| "grad_norm": 1.1718415021896362, |
| "kl": 0.105224609375, |
| "learning_rate": 5.2e-07, |
| "loss": 0.0011, |
| "reward": 3.8538546562194824, |
| "reward_std": 0.013242242857813835, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9431050419807434, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9135904610157013, |
| "step": 385 |
| }, |
| { |
| "completion_length": 167.59375, |
| "epoch": 1.2336, |
| "grad_norm": 1.8933672904968262, |
| "kl": 0.0555419921875, |
| "learning_rate": 5.1875e-07, |
| "loss": 0.0006, |
| "reward": 3.942023754119873, |
| "reward_std": 0.04039308475330472, |
| "rewards/answer_entity_reward": 0.9895833432674408, |
| "rewards/answer_wer_reward": 0.9561411142349243, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9962993562221527, |
| "step": 386 |
| }, |
| { |
| "completion_length": 181.1875, |
| "epoch": 1.2368000000000001, |
| "grad_norm": 1.132387399673462, |
| "kl": 0.134033203125, |
| "learning_rate": 5.174999999999999e-07, |
| "loss": 0.0013, |
| "reward": 3.883729100227356, |
| "reward_std": 0.006107622524723411, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9661928117275238, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9175363183021545, |
| "step": 387 |
| }, |
| { |
| "completion_length": 245.78125, |
| "epoch": 1.24, |
| "grad_norm": 1.5286246538162231, |
| "kl": 0.0439453125, |
| "learning_rate": 5.162499999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9444308280944824, |
| "reward_std": 0.017588268965482712, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.951177716255188, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.993253082036972, |
| "step": 388 |
| }, |
| { |
| "completion_length": 214.5, |
| "epoch": 1.2432, |
| "grad_norm": 4.535660266876221, |
| "kl": 0.4443359375, |
| "learning_rate": 5.149999999999999e-07, |
| "loss": 0.0045, |
| "reward": 3.9712672233581543, |
| "reward_std": 0.017703328281641006, |
| "rewards/answer_entity_reward": 0.9923513829708099, |
| "rewards/answer_wer_reward": 0.9789157509803772, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 389 |
| }, |
| { |
| "completion_length": 237.71875, |
| "epoch": 1.2464, |
| "grad_norm": 1.100642204284668, |
| "kl": 0.0443115234375, |
| "learning_rate": 5.137500000000001e-07, |
| "loss": 0.0004, |
| "reward": 3.9504618644714355, |
| "reward_std": 0.01717091863974929, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9553267061710358, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995993673801422, |
| "step": 390 |
| }, |
| { |
| "completion_length": 220.8125, |
| "epoch": 1.2496, |
| "grad_norm": 1.8153222799301147, |
| "kl": 0.050537109375, |
| "learning_rate": 5.125e-07, |
| "loss": 0.0005, |
| "reward": 3.954966902732849, |
| "reward_std": 0.023467861115932465, |
| "rewards/answer_entity_reward": 0.9909090995788574, |
| "rewards/answer_wer_reward": 0.9640579223632812, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 391 |
| }, |
| { |
| "completion_length": 215.75, |
| "epoch": 1.2528000000000001, |
| "grad_norm": 1.3607189655303955, |
| "kl": 0.0562744140625, |
| "learning_rate": 5.1125e-07, |
| "loss": 0.0006, |
| "reward": 3.947434425354004, |
| "reward_std": 0.01746128685772419, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9514667093753815, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9959677457809448, |
| "step": 392 |
| }, |
| { |
| "completion_length": 140.75, |
| "epoch": 1.256, |
| "grad_norm": 3.343885898590088, |
| "kl": 0.064208984375, |
| "learning_rate": 5.1e-07, |
| "loss": 0.0006, |
| "reward": 3.9535528421401978, |
| "reward_std": 0.016743881278671324, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9615642726421356, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9948294758796692, |
| "step": 393 |
| }, |
| { |
| "completion_length": 225.09375, |
| "epoch": 1.2591999999999999, |
| "grad_norm": 7.593709468841553, |
| "kl": 0.0628662109375, |
| "learning_rate": 5.0875e-07, |
| "loss": 0.0006, |
| "reward": 3.9337310791015625, |
| "reward_std": 0.01689326297491789, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9342745840549469, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999456524848938, |
| "step": 394 |
| }, |
| { |
| "completion_length": 195.15625, |
| "epoch": 1.2624, |
| "grad_norm": 1.6891230344772339, |
| "kl": 0.085693359375, |
| "learning_rate": 5.074999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.836549401283264, |
| "reward_std": 0.005918985931202769, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8378467857837677, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987025856971741, |
| "step": 395 |
| }, |
| { |
| "completion_length": 218.71875, |
| "epoch": 1.2656, |
| "grad_norm": 2.0911483764648438, |
| "kl": 0.057373046875, |
| "learning_rate": 5.062499999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.930617570877075, |
| "reward_std": 0.014833949506282806, |
| "rewards/answer_entity_reward": 0.9881944358348846, |
| "rewards/answer_wer_reward": 0.9436545968055725, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987684786319733, |
| "step": 396 |
| }, |
| { |
| "completion_length": 244.4375, |
| "epoch": 1.2688, |
| "grad_norm": 0.6879564523696899, |
| "kl": 0.05810546875, |
| "learning_rate": 5.049999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9541516304016113, |
| "reward_std": 0.014136601239442825, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9578942954540253, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983407258987427, |
| "step": 397 |
| }, |
| { |
| "completion_length": 171.875, |
| "epoch": 1.272, |
| "grad_norm": 1.0838266611099243, |
| "kl": 0.063232421875, |
| "learning_rate": 5.0375e-07, |
| "loss": 0.0006, |
| "reward": 3.961939811706543, |
| "reward_std": 0.007458951906301081, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9619399607181549, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 398 |
| }, |
| { |
| "completion_length": 224.53125, |
| "epoch": 1.2752, |
| "grad_norm": 2.0163495540618896, |
| "kl": 0.072265625, |
| "learning_rate": 5.025e-07, |
| "loss": 0.0007, |
| "reward": 3.964465856552124, |
| "reward_std": 0.014243231620639563, |
| "rewards/answer_entity_reward": 0.9957579076290131, |
| "rewards/answer_wer_reward": 0.9695450067520142, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991629421710968, |
| "step": 399 |
| }, |
| { |
| "completion_length": 181.15625, |
| "epoch": 1.2784, |
| "grad_norm": 0.38955262303352356, |
| "kl": 0.0517578125, |
| "learning_rate": 5.0125e-07, |
| "loss": 0.0005, |
| "reward": 3.9557042121887207, |
| "reward_std": 0.005372793646529317, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9557042419910431, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 400 |
| }, |
| { |
| "completion_length": 208.3125, |
| "epoch": 1.2816, |
| "grad_norm": 3.9781861305236816, |
| "kl": 0.0716552734375, |
| "learning_rate": 5e-07, |
| "loss": 0.0007, |
| "reward": 3.8667571544647217, |
| "reward_std": 0.015388892497867346, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9693593382835388, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8998015820980072, |
| "step": 401 |
| }, |
| { |
| "completion_length": 204.375, |
| "epoch": 1.2848, |
| "grad_norm": 1.1456544399261475, |
| "kl": 0.103515625, |
| "learning_rate": 4.9875e-07, |
| "loss": 0.001, |
| "reward": 3.956982374191284, |
| "reward_std": 0.007417811662890017, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9575175940990448, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994648098945618, |
| "step": 402 |
| }, |
| { |
| "completion_length": 216.1875, |
| "epoch": 1.288, |
| "grad_norm": 1.1664754152297974, |
| "kl": 0.06396484375, |
| "learning_rate": 4.975e-07, |
| "loss": 0.0006, |
| "reward": 3.8699432611465454, |
| "reward_std": 0.02020346373319626, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9359997510910034, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.936026930809021, |
| "step": 403 |
| }, |
| { |
| "completion_length": 253.09375, |
| "epoch": 1.2912, |
| "grad_norm": 0.8103052377700806, |
| "kl": 0.0635986328125, |
| "learning_rate": 4.9625e-07, |
| "loss": 0.0006, |
| "reward": 3.937591075897217, |
| "reward_std": 0.018769525457173586, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9415221214294434, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989098608493805, |
| "step": 404 |
| }, |
| { |
| "completion_length": 215.0625, |
| "epoch": 1.2944, |
| "grad_norm": 1.4777588844299316, |
| "kl": 0.068603515625, |
| "learning_rate": 4.95e-07, |
| "loss": 0.0007, |
| "reward": 3.949555516242981, |
| "reward_std": 0.009917980059981346, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.949555516242981, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 405 |
| }, |
| { |
| "completion_length": 202.65625, |
| "epoch": 1.2976, |
| "grad_norm": 0.7443984150886536, |
| "kl": 0.106689453125, |
| "learning_rate": 4.9375e-07, |
| "loss": 0.0011, |
| "reward": 3.7686209678649902, |
| "reward_std": 0.011178261134773493, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9451543390750885, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8234666287899017, |
| "step": 406 |
| }, |
| { |
| "completion_length": 189.21875, |
| "epoch": 1.3008, |
| "grad_norm": 0.9547207951545715, |
| "kl": 0.077392578125, |
| "learning_rate": 4.924999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.9593130350112915, |
| "reward_std": 0.006907296134158969, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9597530961036682, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995598495006561, |
| "step": 407 |
| }, |
| { |
| "completion_length": 208.09375, |
| "epoch": 1.304, |
| "grad_norm": 0.8897162079811096, |
| "kl": 0.0604248046875, |
| "learning_rate": 4.9125e-07, |
| "loss": 0.0006, |
| "reward": 3.9529693126678467, |
| "reward_std": 0.0038969104643911123, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9714880287647247, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9814814925193787, |
| "step": 408 |
| }, |
| { |
| "completion_length": 199.78125, |
| "epoch": 1.3072, |
| "grad_norm": 1.1945850849151611, |
| "kl": 0.056640625, |
| "learning_rate": 4.9e-07, |
| "loss": 0.0006, |
| "reward": 3.951330065727234, |
| "reward_std": 0.0060545760206878185, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9513299763202667, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 409 |
| }, |
| { |
| "completion_length": 176.5, |
| "epoch": 1.3104, |
| "grad_norm": 1.5717577934265137, |
| "kl": 0.085205078125, |
| "learning_rate": 4.8875e-07, |
| "loss": 0.0009, |
| "reward": 3.9731186628341675, |
| "reward_std": 0.009643410099670291, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9749214053153992, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9981971085071564, |
| "step": 410 |
| }, |
| { |
| "completion_length": 209.25, |
| "epoch": 1.3136, |
| "grad_norm": 1.7357205152511597, |
| "kl": 0.05517578125, |
| "learning_rate": 4.875e-07, |
| "loss": 0.0006, |
| "reward": 3.9563956260681152, |
| "reward_std": 0.013218061067163944, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9563955068588257, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 411 |
| }, |
| { |
| "completion_length": 233.28125, |
| "epoch": 1.3168, |
| "grad_norm": 3.6717629432678223, |
| "kl": 0.070068359375, |
| "learning_rate": 4.8625e-07, |
| "loss": 0.0007, |
| "reward": 3.955284357070923, |
| "reward_std": 0.02536593284457922, |
| "rewards/answer_entity_reward": 0.9871794581413269, |
| "rewards/answer_wer_reward": 0.968104898929596, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 412 |
| }, |
| { |
| "completion_length": 205.125, |
| "epoch": 1.32, |
| "grad_norm": 1.0453362464904785, |
| "kl": 0.04473876953125, |
| "learning_rate": 4.85e-07, |
| "loss": 0.0005, |
| "reward": 3.9507482051849365, |
| "reward_std": 0.005348393111489713, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9646830558776855, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9860649704933167, |
| "step": 413 |
| }, |
| { |
| "completion_length": 197.71875, |
| "epoch": 1.3232, |
| "grad_norm": 10.967116355895996, |
| "kl": 0.4443359375, |
| "learning_rate": 4.8375e-07, |
| "loss": 0.0044, |
| "reward": 3.958775758743286, |
| "reward_std": 0.01469768793322146, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9608590006828308, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 414 |
| }, |
| { |
| "completion_length": 240.75, |
| "epoch": 1.3264, |
| "grad_norm": 1.771857738494873, |
| "kl": 0.056884765625, |
| "learning_rate": 4.824999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9307100772857666, |
| "reward_std": 0.01262786379083991, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9445989429950714, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 415 |
| }, |
| { |
| "completion_length": 184.9375, |
| "epoch": 1.3296000000000001, |
| "grad_norm": 0.5742409825325012, |
| "kl": 0.081787109375, |
| "learning_rate": 4.812499999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.965754270553589, |
| "reward_std": 0.003614649409428239, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9657542705535889, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 416 |
| }, |
| { |
| "completion_length": 173.90625, |
| "epoch": 1.3328, |
| "grad_norm": 1.4033151865005493, |
| "kl": 0.074462890625, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0007, |
| "reward": 3.9543731212615967, |
| "reward_std": 0.006403392762877047, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9728915691375732, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9814814925193787, |
| "step": 417 |
| }, |
| { |
| "completion_length": 224.0625, |
| "epoch": 1.336, |
| "grad_norm": 1.0427494049072266, |
| "kl": 0.0576171875, |
| "learning_rate": 4.7875e-07, |
| "loss": 0.0006, |
| "reward": 3.965309262275696, |
| "reward_std": 0.011804148089140654, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9667502641677856, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985590577125549, |
| "step": 418 |
| }, |
| { |
| "completion_length": 228.53125, |
| "epoch": 1.3392, |
| "grad_norm": 1.1613246202468872, |
| "kl": 0.06591796875, |
| "learning_rate": 4.775e-07, |
| "loss": 0.0007, |
| "reward": 3.948023200035095, |
| "reward_std": 0.012544674333184958, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9482711553573608, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997519850730896, |
| "step": 419 |
| }, |
| { |
| "completion_length": 197.34375, |
| "epoch": 1.3424, |
| "grad_norm": 0.8760451674461365, |
| "kl": 0.072265625, |
| "learning_rate": 4.7625e-07, |
| "loss": 0.0007, |
| "reward": 3.938261866569519, |
| "reward_std": 0.004269103752449155, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9496253132820129, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 420 |
| }, |
| { |
| "completion_length": 225.5, |
| "epoch": 1.3456000000000001, |
| "grad_norm": 2.4799275398254395, |
| "kl": 0.1290283203125, |
| "learning_rate": 4.7499999999999995e-07, |
| "loss": 0.0013, |
| "reward": 3.9379055500030518, |
| "reward_std": 0.008256121072918177, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9677021205425262, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9702034592628479, |
| "step": 421 |
| }, |
| { |
| "completion_length": 209.3125, |
| "epoch": 1.3488, |
| "grad_norm": 0.6864319443702698, |
| "kl": 0.0604248046875, |
| "learning_rate": 4.7374999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.9712308645248413, |
| "reward_std": 0.0032088530133478343, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9722216725349426, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990091919898987, |
| "step": 422 |
| }, |
| { |
| "completion_length": 187.5625, |
| "epoch": 1.3519999999999999, |
| "grad_norm": 1.9412598609924316, |
| "kl": 0.06787109375, |
| "learning_rate": 4.725e-07, |
| "loss": 0.0007, |
| "reward": 3.947052240371704, |
| "reward_std": 0.014190569054335356, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9569187760353088, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9925373196601868, |
| "step": 423 |
| }, |
| { |
| "completion_length": 225.59375, |
| "epoch": 1.3552, |
| "grad_norm": 1.4452259540557861, |
| "kl": 0.09619140625, |
| "learning_rate": 4.7125e-07, |
| "loss": 0.001, |
| "reward": 3.939266562461853, |
| "reward_std": 0.012853712774813175, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9556125402450562, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9860578179359436, |
| "step": 424 |
| }, |
| { |
| "completion_length": 261.0, |
| "epoch": 1.3584, |
| "grad_norm": 0.9420474171638489, |
| "kl": 0.054931640625, |
| "learning_rate": 4.6999999999999995e-07, |
| "loss": 0.0006, |
| "reward": 3.939144253730774, |
| "reward_std": 0.00785708031617105, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9474774897098541, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 425 |
| }, |
| { |
| "completion_length": 243.1875, |
| "epoch": 1.3616, |
| "grad_norm": 1.1776657104492188, |
| "kl": 0.078369140625, |
| "learning_rate": 4.6874999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.928247570991516, |
| "reward_std": 0.02044426929205656, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9401307106018066, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9929245114326477, |
| "step": 426 |
| }, |
| { |
| "completion_length": 204.4375, |
| "epoch": 1.3648, |
| "grad_norm": 1.6268881559371948, |
| "kl": 0.073974609375, |
| "learning_rate": 4.675e-07, |
| "loss": 0.0007, |
| "reward": 3.9266600608825684, |
| "reward_std": 0.006853222264908254, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9440751671791077, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9825847446918488, |
| "step": 427 |
| }, |
| { |
| "completion_length": 232.0625, |
| "epoch": 1.3679999999999999, |
| "grad_norm": 34.5067138671875, |
| "kl": 0.755859375, |
| "learning_rate": 4.6625e-07, |
| "loss": 0.0076, |
| "reward": 3.844196319580078, |
| "reward_std": 0.04641831433400512, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9399954378604889, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9042008221149445, |
| "step": 428 |
| }, |
| { |
| "completion_length": 253.21875, |
| "epoch": 1.3712, |
| "grad_norm": 1.4444057941436768, |
| "kl": 0.0673828125, |
| "learning_rate": 4.65e-07, |
| "loss": 0.0007, |
| "reward": 3.963658928871155, |
| "reward_std": 0.009957378264516592, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9636587798595428, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 429 |
| }, |
| { |
| "completion_length": 241.875, |
| "epoch": 1.3744, |
| "grad_norm": 0.9258720278739929, |
| "kl": 0.0687255859375, |
| "learning_rate": 4.6374999999999995e-07, |
| "loss": 0.0007, |
| "reward": 3.9617748260498047, |
| "reward_std": 0.013449362479150295, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9652469456195831, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 430 |
| }, |
| { |
| "completion_length": 204.96875, |
| "epoch": 1.3776, |
| "grad_norm": 1.6328847408294678, |
| "kl": 0.0863037109375, |
| "learning_rate": 4.625e-07, |
| "loss": 0.0009, |
| "reward": 3.8922348022460938, |
| "reward_std": 0.007920752046629786, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9477903544902802, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9444444477558136, |
| "step": 431 |
| }, |
| { |
| "completion_length": 222.375, |
| "epoch": 1.3808, |
| "grad_norm": 2.479295492172241, |
| "kl": 0.0732421875, |
| "learning_rate": 4.6125e-07, |
| "loss": 0.0007, |
| "reward": 3.9312403202056885, |
| "reward_std": 0.02260798867791891, |
| "rewards/answer_entity_reward": 0.9941239356994629, |
| "rewards/answer_wer_reward": 0.937116414308548, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 432 |
| }, |
| { |
| "completion_length": 203.28125, |
| "epoch": 1.384, |
| "grad_norm": 2.6669020652770996, |
| "kl": 0.0631103515625, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0006, |
| "reward": 3.938199043273926, |
| "reward_std": 0.014480275101959705, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9408722817897797, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997305870056152, |
| "step": 433 |
| }, |
| { |
| "completion_length": 255.1875, |
| "epoch": 1.3872, |
| "grad_norm": 1.4742846488952637, |
| "kl": 0.057373046875, |
| "learning_rate": 4.5874999999999995e-07, |
| "loss": 0.0006, |
| "reward": 3.9382212162017822, |
| "reward_std": 0.01696724910289049, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.94236820936203, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9982567131519318, |
| "step": 434 |
| }, |
| { |
| "completion_length": 211.15625, |
| "epoch": 1.3904, |
| "grad_norm": 1.795336365699768, |
| "kl": 0.0667724609375, |
| "learning_rate": 4.575e-07, |
| "loss": 0.0007, |
| "reward": 3.919999361038208, |
| "reward_std": 0.028288409113883972, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9725300371646881, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9474693238735199, |
| "step": 435 |
| }, |
| { |
| "completion_length": 208.65625, |
| "epoch": 1.3936, |
| "grad_norm": 2.1704065799713135, |
| "kl": 0.095947265625, |
| "learning_rate": 4.5624999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.857280731201172, |
| "reward_std": 0.2144411588087678, |
| "rewards/answer_entity_reward": 0.9618055820465088, |
| "rewards/answer_wer_reward": 0.949828714132309, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9768964946269989, |
| "step": 436 |
| }, |
| { |
| "completion_length": 194.9375, |
| "epoch": 1.3968, |
| "grad_norm": 3.8814220428466797, |
| "kl": 0.082275390625, |
| "learning_rate": 4.55e-07, |
| "loss": 0.0008, |
| "reward": 3.941987633705139, |
| "reward_std": 0.015088737476617098, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.94545978307724, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 437 |
| }, |
| { |
| "completion_length": 217.5, |
| "epoch": 1.4, |
| "grad_norm": 1.3024876117706299, |
| "kl": 0.0389404296875, |
| "learning_rate": 4.5374999999999994e-07, |
| "loss": 0.0004, |
| "reward": 3.950901508331299, |
| "reward_std": 0.008365771966055036, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9589883685112, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9919130802154541, |
| "step": 438 |
| }, |
| { |
| "completion_length": 159.03125, |
| "epoch": 1.4032, |
| "grad_norm": 0.272270530462265, |
| "kl": 0.0396728515625, |
| "learning_rate": 4.525e-07, |
| "loss": 0.0004, |
| "reward": 3.9221452474594116, |
| "reward_std": 0.0014547138416673988, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9875754117965698, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9429032206535339, |
| "step": 439 |
| }, |
| { |
| "completion_length": 200.28125, |
| "epoch": 1.4064, |
| "grad_norm": 5.4578399658203125, |
| "kl": 0.0828857421875, |
| "learning_rate": 4.5124999999999997e-07, |
| "loss": 0.0008, |
| "reward": 3.9259976148605347, |
| "reward_std": 0.014895747415721416, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9536634683609009, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9758064448833466, |
| "step": 440 |
| }, |
| { |
| "completion_length": 229.1875, |
| "epoch": 1.4096, |
| "grad_norm": 0.6568198800086975, |
| "kl": 0.067138671875, |
| "learning_rate": 4.5e-07, |
| "loss": 0.0007, |
| "reward": 3.9455034732818604, |
| "reward_std": 0.011267438880167902, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9479073286056519, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 441 |
| }, |
| { |
| "completion_length": 199.3125, |
| "epoch": 1.4128, |
| "grad_norm": 1.0056089162826538, |
| "kl": 0.0567626953125, |
| "learning_rate": 4.4874999999999994e-07, |
| "loss": 0.0006, |
| "reward": 3.9622955322265625, |
| "reward_std": 0.008431105175986886, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9622955024242401, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 442 |
| }, |
| { |
| "completion_length": 212.375, |
| "epoch": 1.416, |
| "grad_norm": 0.7950085997581482, |
| "kl": 0.051025390625, |
| "learning_rate": 4.475e-07, |
| "loss": 0.0005, |
| "reward": 3.9517738819122314, |
| "reward_std": 0.03710572328418493, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9708344638347626, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9809393286705017, |
| "step": 443 |
| }, |
| { |
| "completion_length": 227.71875, |
| "epoch": 1.4192, |
| "grad_norm": 0.8971355557441711, |
| "kl": 0.0460205078125, |
| "learning_rate": 4.4624999999999996e-07, |
| "loss": 0.0005, |
| "reward": 3.980188012123108, |
| "reward_std": 0.00624943315051496, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9801879525184631, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 444 |
| }, |
| { |
| "completion_length": 226.78125, |
| "epoch": 1.4224, |
| "grad_norm": 2.114032745361328, |
| "kl": 0.0791015625, |
| "learning_rate": 4.45e-07, |
| "loss": 0.0008, |
| "reward": 3.879195213317871, |
| "reward_std": 0.03936337144114077, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9502902626991272, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9307432472705841, |
| "step": 445 |
| }, |
| { |
| "completion_length": 227.875, |
| "epoch": 1.4256, |
| "grad_norm": 1.0065126419067383, |
| "kl": 0.083984375, |
| "learning_rate": 4.4374999999999993e-07, |
| "loss": 0.0009, |
| "reward": 3.939829707145691, |
| "reward_std": 0.013783617876470089, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9398296475410461, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 446 |
| }, |
| { |
| "completion_length": 202.6875, |
| "epoch": 1.4288, |
| "grad_norm": 1.7568168640136719, |
| "kl": 0.0418701171875, |
| "learning_rate": 4.425e-07, |
| "loss": 0.0004, |
| "reward": 3.943518042564392, |
| "reward_std": 0.016201740596443415, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9520406126976013, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 447 |
| }, |
| { |
| "completion_length": 172.8125, |
| "epoch": 1.432, |
| "grad_norm": 1.0688170194625854, |
| "kl": 0.0494384765625, |
| "learning_rate": 4.4124999999999996e-07, |
| "loss": 0.0005, |
| "reward": 3.7196162939071655, |
| "reward_std": 0.006592530757188797, |
| "rewards/answer_entity_reward": 0.8677884340286255, |
| "rewards/answer_wer_reward": 0.8742637634277344, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9775640964508057, |
| "step": 448 |
| }, |
| { |
| "completion_length": 168.59375, |
| "epoch": 1.4352, |
| "grad_norm": 1.7712996006011963, |
| "kl": 0.0435791015625, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0004, |
| "reward": 3.8386131525039673, |
| "reward_std": 0.011066187638789415, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8386130630970001, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 449 |
| }, |
| { |
| "completion_length": 197.40625, |
| "epoch": 1.4384000000000001, |
| "grad_norm": 0.8872710466384888, |
| "kl": 0.058349609375, |
| "learning_rate": 4.3874999999999993e-07, |
| "loss": 0.0006, |
| "reward": 3.7988067865371704, |
| "reward_std": 0.03104257071390748, |
| "rewards/answer_entity_reward": 0.9734432399272919, |
| "rewards/answer_wer_reward": 0.8270545899868011, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983089566230774, |
| "step": 450 |
| }, |
| { |
| "completion_length": 178.40625, |
| "epoch": 1.4416, |
| "grad_norm": 6.044506072998047, |
| "kl": 0.0657958984375, |
| "learning_rate": 4.375e-07, |
| "loss": 0.0007, |
| "reward": 3.9419833421707153, |
| "reward_std": 0.021156481467187405, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9676234424114227, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9778319895267487, |
| "step": 451 |
| }, |
| { |
| "completion_length": 201.8125, |
| "epoch": 1.4447999999999999, |
| "grad_norm": 0.7943681478500366, |
| "kl": 0.0511474609375, |
| "learning_rate": 4.3625e-07, |
| "loss": 0.0005, |
| "reward": 3.956661581993103, |
| "reward_std": 0.007463611662387848, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9675310552120209, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.989130437374115, |
| "step": 452 |
| }, |
| { |
| "completion_length": 219.03125, |
| "epoch": 1.448, |
| "grad_norm": 1.069403052330017, |
| "kl": 0.0570068359375, |
| "learning_rate": 4.3499999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.9562065601348877, |
| "reward_std": 0.011006501503288746, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9564736187458038, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997329115867615, |
| "step": 453 |
| }, |
| { |
| "completion_length": 206.8125, |
| "epoch": 1.4512, |
| "grad_norm": 1.0987451076507568, |
| "kl": 0.0611572265625, |
| "learning_rate": 4.3375000000000003e-07, |
| "loss": 0.0006, |
| "reward": 3.9423000812530518, |
| "reward_std": 0.01284673297777772, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9693345129489899, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9758064448833466, |
| "step": 454 |
| }, |
| { |
| "completion_length": 211.375, |
| "epoch": 1.4544000000000001, |
| "grad_norm": 3.5896220207214355, |
| "kl": 0.065673828125, |
| "learning_rate": 4.325e-07, |
| "loss": 0.0007, |
| "reward": 3.961179494857788, |
| "reward_std": 0.012267218437045813, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9640858769416809, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99709352850914, |
| "step": 455 |
| }, |
| { |
| "completion_length": 238.8125, |
| "epoch": 1.4576, |
| "grad_norm": 0.625076174736023, |
| "kl": 0.0399169921875, |
| "learning_rate": 4.3125e-07, |
| "loss": 0.0004, |
| "reward": 3.9661307334899902, |
| "reward_std": 0.013454007916152477, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9702973961830139, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 456 |
| }, |
| { |
| "completion_length": 206.9375, |
| "epoch": 1.4607999999999999, |
| "grad_norm": 0.6369054317474365, |
| "kl": 0.059814453125, |
| "learning_rate": 4.2999999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.9704521894454956, |
| "reward_std": 0.006653362594079226, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9733729660511017, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970792829990387, |
| "step": 457 |
| }, |
| { |
| "completion_length": 199.625, |
| "epoch": 1.464, |
| "grad_norm": 1.2201271057128906, |
| "kl": 0.083251953125, |
| "learning_rate": 4.2875e-07, |
| "loss": 0.0008, |
| "reward": 3.967539429664612, |
| "reward_std": 0.012669337913393974, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9747509360313416, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 458 |
| }, |
| { |
| "completion_length": 220.0, |
| "epoch": 1.4672, |
| "grad_norm": 11.574130058288574, |
| "kl": 0.2125244140625, |
| "learning_rate": 4.275e-07, |
| "loss": 0.0021, |
| "reward": 3.9735381603240967, |
| "reward_std": 0.0033322512172162533, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9737901091575623, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997479915618896, |
| "step": 459 |
| }, |
| { |
| "completion_length": 181.0625, |
| "epoch": 1.4704, |
| "grad_norm": 1.050900936126709, |
| "kl": 0.0736083984375, |
| "learning_rate": 4.2625e-07, |
| "loss": 0.0007, |
| "reward": 3.9467893838882446, |
| "reward_std": 0.00827464903704822, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9717220067977905, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9750673770904541, |
| "step": 460 |
| }, |
| { |
| "completion_length": 207.4375, |
| "epoch": 1.4736, |
| "grad_norm": 1.25560462474823, |
| "kl": 0.07861328125, |
| "learning_rate": 4.2499999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.885838508605957, |
| "reward_std": 0.012714273296296597, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9541967213153839, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9316417276859283, |
| "step": 461 |
| }, |
| { |
| "completion_length": 205.125, |
| "epoch": 1.4768, |
| "grad_norm": 2.1235697269439697, |
| "kl": 0.064208984375, |
| "learning_rate": 4.2375e-07, |
| "loss": 0.0006, |
| "reward": 3.952380895614624, |
| "reward_std": 0.013835938647389412, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9538231492042542, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985576868057251, |
| "step": 462 |
| }, |
| { |
| "completion_length": 229.25, |
| "epoch": 1.48, |
| "grad_norm": 3.838672399520874, |
| "kl": 0.09619140625, |
| "learning_rate": 4.225e-07, |
| "loss": 0.001, |
| "reward": 3.9537363052368164, |
| "reward_std": 0.014287983998656273, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9542993903160095, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994369447231293, |
| "step": 463 |
| }, |
| { |
| "completion_length": 224.71875, |
| "epoch": 1.4832, |
| "grad_norm": 0.7103460431098938, |
| "kl": 0.058837890625, |
| "learning_rate": 4.2125e-07, |
| "loss": 0.0006, |
| "reward": 3.9675354957580566, |
| "reward_std": 0.013558031525462866, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9719286262989044, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997735619544983, |
| "step": 464 |
| }, |
| { |
| "completion_length": 147.625, |
| "epoch": 1.4864, |
| "grad_norm": 2.865051031112671, |
| "kl": 0.099853515625, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": 0.001, |
| "reward": 3.958040475845337, |
| "reward_std": 0.00422883324790746, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9780724942684174, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9799679517745972, |
| "step": 465 |
| }, |
| { |
| "completion_length": 250.625, |
| "epoch": 1.4896, |
| "grad_norm": 1.115330696105957, |
| "kl": 0.062744140625, |
| "learning_rate": 4.1875e-07, |
| "loss": 0.0006, |
| "reward": 3.925747871398926, |
| "reward_std": 0.01510471198707819, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9361644089221954, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 466 |
| }, |
| { |
| "completion_length": 181.28125, |
| "epoch": 1.4928, |
| "grad_norm": 0.8615334033966064, |
| "kl": 0.095703125, |
| "learning_rate": 4.1749999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.9389272928237915, |
| "reward_std": 0.009215079713612795, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.947648286819458, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9912790656089783, |
| "step": 467 |
| }, |
| { |
| "completion_length": 201.1875, |
| "epoch": 1.496, |
| "grad_norm": 0.8399393558502197, |
| "kl": 0.067138671875, |
| "learning_rate": 4.1625e-07, |
| "loss": 0.0007, |
| "reward": 3.9645369052886963, |
| "reward_std": 0.005296911578625441, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9660760462284088, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984607994556427, |
| "step": 468 |
| }, |
| { |
| "completion_length": 181.53125, |
| "epoch": 1.4992, |
| "grad_norm": 1.692581057548523, |
| "kl": 0.116455078125, |
| "learning_rate": 4.1499999999999994e-07, |
| "loss": 0.0012, |
| "reward": 3.91774320602417, |
| "reward_std": 0.007862454745918512, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9589084982872009, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.958834707736969, |
| "step": 469 |
| }, |
| { |
| "completion_length": 208.375, |
| "epoch": 1.5024, |
| "grad_norm": 1.0280638933181763, |
| "kl": 0.0733642578125, |
| "learning_rate": 4.1375e-07, |
| "loss": 0.0007, |
| "reward": 3.963421940803528, |
| "reward_std": 0.010574808926321566, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9634219110012054, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 470 |
| }, |
| { |
| "completion_length": 194.375, |
| "epoch": 1.5056, |
| "grad_norm": 0.9556618332862854, |
| "kl": 0.04541015625, |
| "learning_rate": 4.1249999999999997e-07, |
| "loss": 0.0005, |
| "reward": 3.9483964443206787, |
| "reward_std": 0.0071337176486849785, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9483965635299683, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 471 |
| }, |
| { |
| "completion_length": 219.90625, |
| "epoch": 1.5088, |
| "grad_norm": 8.583925247192383, |
| "kl": 0.057373046875, |
| "learning_rate": 4.1125e-07, |
| "loss": 0.0006, |
| "reward": 3.9298593997955322, |
| "reward_std": 0.010127428220584989, |
| "rewards/answer_entity_reward": 0.9764957129955292, |
| "rewards/answer_wer_reward": 0.9549680352210999, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983957409858704, |
| "step": 472 |
| }, |
| { |
| "completion_length": 169.71875, |
| "epoch": 1.512, |
| "grad_norm": 1.0506740808486938, |
| "kl": 0.0703125, |
| "learning_rate": 4.0999999999999994e-07, |
| "loss": 0.0007, |
| "reward": 3.9712518453598022, |
| "reward_std": 0.004299861378967762, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9712517857551575, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 473 |
| }, |
| { |
| "completion_length": 254.0, |
| "epoch": 1.5152, |
| "grad_norm": 1.2391588687896729, |
| "kl": 0.055419921875, |
| "learning_rate": 4.0875e-07, |
| "loss": 0.0006, |
| "reward": 3.9443717002868652, |
| "reward_std": 0.007719833869487047, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9459867179393768, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983848929405212, |
| "step": 474 |
| }, |
| { |
| "completion_length": 173.15625, |
| "epoch": 1.5184, |
| "grad_norm": 21.967166900634766, |
| "kl": 0.0810546875, |
| "learning_rate": 4.0749999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.892626404762268, |
| "reward_std": 0.03193977475166321, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9627694487571716, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9372097849845886, |
| "step": 475 |
| }, |
| { |
| "completion_length": 177.96875, |
| "epoch": 1.5215999999999998, |
| "grad_norm": 2.125126838684082, |
| "kl": 0.0814208984375, |
| "learning_rate": 4.0625e-07, |
| "loss": 0.0008, |
| "reward": 3.957445502281189, |
| "reward_std": 0.016827338375151157, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9618943929672241, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990234375, |
| "step": 476 |
| }, |
| { |
| "completion_length": 259.34375, |
| "epoch": 1.5248, |
| "grad_norm": 1.144234538078308, |
| "kl": 0.0545654296875, |
| "learning_rate": 4.05e-07, |
| "loss": 0.0005, |
| "reward": 3.9333302974700928, |
| "reward_std": 0.015490441583096981, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9336776435375214, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999652773141861, |
| "step": 477 |
| }, |
| { |
| "completion_length": 223.84375, |
| "epoch": 1.528, |
| "grad_norm": 0.8379483222961426, |
| "kl": 0.0653076171875, |
| "learning_rate": 4.0375e-07, |
| "loss": 0.0007, |
| "reward": 3.9397594928741455, |
| "reward_std": 0.006189712788909674, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9652985334396362, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.981813907623291, |
| "step": 478 |
| }, |
| { |
| "completion_length": 195.15625, |
| "epoch": 1.5312000000000001, |
| "grad_norm": 1.9627622365951538, |
| "kl": 0.0709228515625, |
| "learning_rate": 4.025e-07, |
| "loss": 0.0007, |
| "reward": 3.90268337726593, |
| "reward_std": 0.022933244705200195, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9422430694103241, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9632812440395355, |
| "step": 479 |
| }, |
| { |
| "completion_length": 212.03125, |
| "epoch": 1.5344, |
| "grad_norm": 1.4353668689727783, |
| "kl": 0.0572509765625, |
| "learning_rate": 4.0124999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.955712080001831, |
| "reward_std": 0.004905138397589326, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9653275012969971, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 480 |
| }, |
| { |
| "completion_length": 238.125, |
| "epoch": 1.5375999999999999, |
| "grad_norm": 0.9400500059127808, |
| "kl": 0.0516357421875, |
| "learning_rate": 4e-07, |
| "loss": 0.0005, |
| "reward": 3.9561740159988403, |
| "reward_std": 0.004761199816130102, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9657893478870392, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 481 |
| }, |
| { |
| "completion_length": 197.125, |
| "epoch": 1.5408, |
| "grad_norm": 1.7909142971038818, |
| "kl": 0.044677734375, |
| "learning_rate": 3.9875e-07, |
| "loss": 0.0004, |
| "reward": 3.9649877548217773, |
| "reward_std": 0.008824507240206003, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9712709188461304, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.993716835975647, |
| "step": 482 |
| }, |
| { |
| "completion_length": 247.28125, |
| "epoch": 1.544, |
| "grad_norm": 1.305432915687561, |
| "kl": 0.0885009765625, |
| "learning_rate": 3.975e-07, |
| "loss": 0.0009, |
| "reward": 3.9271016120910645, |
| "reward_std": 0.010741112288087606, |
| "rewards/answer_entity_reward": 0.9867424368858337, |
| "rewards/answer_wer_reward": 0.9422920942306519, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980670213699341, |
| "step": 483 |
| }, |
| { |
| "completion_length": 183.71875, |
| "epoch": 1.5472000000000001, |
| "grad_norm": 1.2143511772155762, |
| "kl": 0.083251953125, |
| "learning_rate": 3.9624999999999996e-07, |
| "loss": 0.0008, |
| "reward": 3.961517810821533, |
| "reward_std": 0.015109732514247298, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9659819006919861, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 484 |
| }, |
| { |
| "completion_length": 190.96875, |
| "epoch": 1.5504, |
| "grad_norm": 1.3901034593582153, |
| "kl": 0.0478515625, |
| "learning_rate": 3.95e-07, |
| "loss": 0.0005, |
| "reward": 3.9620405435562134, |
| "reward_std": 0.007438812637701631, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.962040513753891, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 485 |
| }, |
| { |
| "completion_length": 236.71875, |
| "epoch": 1.5535999999999999, |
| "grad_norm": 1.005139946937561, |
| "kl": 0.064697265625, |
| "learning_rate": 3.9375e-07, |
| "loss": 0.0007, |
| "reward": 3.9681735038757324, |
| "reward_std": 0.007598390802741051, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9703975021839142, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996141791343689, |
| "step": 486 |
| }, |
| { |
| "completion_length": 167.71875, |
| "epoch": 1.5568, |
| "grad_norm": 14.769695281982422, |
| "kl": 0.088623046875, |
| "learning_rate": 3.925e-07, |
| "loss": 0.0009, |
| "reward": 3.9402579069137573, |
| "reward_std": 0.01711948262527585, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9504852592945099, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9897727370262146, |
| "step": 487 |
| }, |
| { |
| "completion_length": 245.59375, |
| "epoch": 1.56, |
| "grad_norm": 2.1311302185058594, |
| "kl": 0.0643310546875, |
| "learning_rate": 3.9124999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.965644121170044, |
| "reward_std": 0.006802293471992016, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9664610624313354, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991829991340637, |
| "step": 488 |
| }, |
| { |
| "completion_length": 228.90625, |
| "epoch": 1.5632000000000001, |
| "grad_norm": 2.194638967514038, |
| "kl": 0.07861328125, |
| "learning_rate": 3.8999999999999997e-07, |
| "loss": 0.0008, |
| "reward": 3.940732479095459, |
| "reward_std": 0.00845141801983118, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9496362805366516, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994294047355652, |
| "step": 489 |
| }, |
| { |
| "completion_length": 229.09375, |
| "epoch": 1.5664, |
| "grad_norm": 1.4338947534561157, |
| "kl": 0.067138671875, |
| "learning_rate": 3.8875e-07, |
| "loss": 0.0007, |
| "reward": 3.974826216697693, |
| "reward_std": 0.008368036011233926, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9759277105331421, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988985061645508, |
| "step": 490 |
| }, |
| { |
| "completion_length": 147.1875, |
| "epoch": 1.5695999999999999, |
| "grad_norm": 0.9500789046287537, |
| "kl": 0.055908203125, |
| "learning_rate": 3.875e-07, |
| "loss": 0.0006, |
| "reward": 3.900749683380127, |
| "reward_std": 0.004976645112037659, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.981389045715332, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9307242631912231, |
| "step": 491 |
| }, |
| { |
| "completion_length": 207.1875, |
| "epoch": 1.5728, |
| "grad_norm": 18.29888916015625, |
| "kl": 0.0787353515625, |
| "learning_rate": 3.8624999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.9231996536254883, |
| "reward_std": 0.01712162047624588, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9278469979763031, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990289807319641, |
| "step": 492 |
| }, |
| { |
| "completion_length": 215.3125, |
| "epoch": 1.576, |
| "grad_norm": 2.524644613265991, |
| "kl": 0.0682373046875, |
| "learning_rate": 3.8499999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.9182220697402954, |
| "reward_std": 0.028343133628368378, |
| "rewards/answer_entity_reward": 0.9899839758872986, |
| "rewards/answer_wer_reward": 0.9533904790878296, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9748475551605225, |
| "step": 493 |
| }, |
| { |
| "completion_length": 205.21875, |
| "epoch": 1.5792000000000002, |
| "grad_norm": 0.8041574954986572, |
| "kl": 0.0572509765625, |
| "learning_rate": 3.8375e-07, |
| "loss": 0.0006, |
| "reward": 3.9712276458740234, |
| "reward_std": 0.006993145681917667, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9721719622612, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990555644035339, |
| "step": 494 |
| }, |
| { |
| "completion_length": 245.84375, |
| "epoch": 1.5824, |
| "grad_norm": 1.4723294973373413, |
| "kl": 0.0518798828125, |
| "learning_rate": 3.825e-07, |
| "loss": 0.0005, |
| "reward": 3.9171528816223145, |
| "reward_std": 0.007540189428254962, |
| "rewards/answer_entity_reward": 0.9707792401313782, |
| "rewards/answer_wer_reward": 0.9463737607002258, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 495 |
| }, |
| { |
| "completion_length": 191.1875, |
| "epoch": 1.5856, |
| "grad_norm": 5.778710842132568, |
| "kl": 0.095703125, |
| "learning_rate": 3.8124999999999995e-07, |
| "loss": 0.001, |
| "reward": 3.7989085912704468, |
| "reward_std": 0.02309321239590645, |
| "rewards/answer_entity_reward": 0.9837072491645813, |
| "rewards/answer_wer_reward": 0.9482426345348358, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.866958737373352, |
| "step": 496 |
| }, |
| { |
| "completion_length": 164.375, |
| "epoch": 1.5888, |
| "grad_norm": 3.773331880569458, |
| "kl": 0.0452880859375, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0005, |
| "reward": 3.957179307937622, |
| "reward_std": 0.03012340608984232, |
| "rewards/answer_entity_reward": 0.995192289352417, |
| "rewards/answer_wer_reward": 0.9724558889865875, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9895310997962952, |
| "step": 497 |
| }, |
| { |
| "completion_length": 190.34375, |
| "epoch": 1.592, |
| "grad_norm": 1.7698373794555664, |
| "kl": 0.0579833984375, |
| "learning_rate": 3.7875e-07, |
| "loss": 0.0006, |
| "reward": 3.9473685026168823, |
| "reward_std": 0.009419793263077736, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9480363428592682, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993322789669037, |
| "step": 498 |
| }, |
| { |
| "completion_length": 223.03125, |
| "epoch": 1.5952, |
| "grad_norm": 1.197536587715149, |
| "kl": 0.074462890625, |
| "learning_rate": 3.775e-07, |
| "loss": 0.0007, |
| "reward": 3.9201695919036865, |
| "reward_std": 0.012398123741149902, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9409077167510986, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9931506812572479, |
| "step": 499 |
| }, |
| { |
| "completion_length": 204.46875, |
| "epoch": 1.5984, |
| "grad_norm": 1.5246530771255493, |
| "kl": 0.0849609375, |
| "learning_rate": 3.7624999999999994e-07, |
| "loss": 0.0008, |
| "reward": 3.9556870460510254, |
| "reward_std": 0.010473677422851324, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9580392241477966, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9976478517055511, |
| "step": 500 |
| }, |
| { |
| "completion_length": 230.0625, |
| "epoch": 1.6016, |
| "grad_norm": 1.1340093612670898, |
| "kl": 0.10595703125, |
| "learning_rate": 3.75e-07, |
| "loss": 0.0011, |
| "reward": 3.9659206867218018, |
| "reward_std": 0.008191006258130074, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9659207165241241, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 501 |
| }, |
| { |
| "completion_length": 185.15625, |
| "epoch": 1.6048, |
| "grad_norm": 1.2874914407730103, |
| "kl": 0.045654296875, |
| "learning_rate": 3.7375e-07, |
| "loss": 0.0005, |
| "reward": 3.9568817615509033, |
| "reward_std": 0.011238863109610975, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9603540003299713, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 502 |
| }, |
| { |
| "completion_length": 241.78125, |
| "epoch": 1.608, |
| "grad_norm": 0.9499295353889465, |
| "kl": 0.0531005859375, |
| "learning_rate": 3.725e-07, |
| "loss": 0.0005, |
| "reward": 3.9388747215270996, |
| "reward_std": 0.008348907809704542, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.9510295391082764, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9992088675498962, |
| "step": 503 |
| }, |
| { |
| "completion_length": 233.25, |
| "epoch": 1.6112, |
| "grad_norm": 1.0857101678848267, |
| "kl": 0.062744140625, |
| "learning_rate": 3.7125e-07, |
| "loss": 0.0006, |
| "reward": 3.958517551422119, |
| "reward_std": 0.0058578201569616795, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.958990752696991, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995267689228058, |
| "step": 504 |
| }, |
| { |
| "completion_length": 251.78125, |
| "epoch": 1.6143999999999998, |
| "grad_norm": 28.171039581298828, |
| "kl": 0.114013671875, |
| "learning_rate": 3.7e-07, |
| "loss": 0.0011, |
| "reward": 3.866329312324524, |
| "reward_std": 0.01942992489784956, |
| "rewards/answer_entity_reward": 0.9720904231071472, |
| "rewards/answer_wer_reward": 0.8955735862255096, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986652433872223, |
| "step": 505 |
| }, |
| { |
| "completion_length": 186.46875, |
| "epoch": 1.6176, |
| "grad_norm": 6.638906955718994, |
| "kl": 0.06884765625, |
| "learning_rate": 3.6875e-07, |
| "loss": 0.0007, |
| "reward": 3.7806142568588257, |
| "reward_std": 0.013823950197547674, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.945627748966217, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8378273248672485, |
| "step": 506 |
| }, |
| { |
| "completion_length": 225.375, |
| "epoch": 1.6208, |
| "grad_norm": 2.12021803855896, |
| "kl": 0.07177734375, |
| "learning_rate": 3.675e-07, |
| "loss": 0.0007, |
| "reward": 3.9451769590377808, |
| "reward_std": 0.013169697020202875, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9672558605670929, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.977921187877655, |
| "step": 507 |
| }, |
| { |
| "completion_length": 219.125, |
| "epoch": 1.624, |
| "grad_norm": 1.5153933763504028, |
| "kl": 0.053955078125, |
| "learning_rate": 3.6625e-07, |
| "loss": 0.0005, |
| "reward": 3.959490180015564, |
| "reward_std": 0.010949777672067285, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9636567533016205, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 508 |
| }, |
| { |
| "completion_length": 228.4375, |
| "epoch": 1.6272, |
| "grad_norm": 3.832310676574707, |
| "kl": 0.0521240234375, |
| "learning_rate": 3.65e-07, |
| "loss": 0.0005, |
| "reward": 3.953840732574463, |
| "reward_std": 0.017153040505945683, |
| "rewards/answer_entity_reward": 0.9936868846416473, |
| "rewards/answer_wer_reward": 0.9603707194328308, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997829794883728, |
| "step": 509 |
| }, |
| { |
| "completion_length": 243.46875, |
| "epoch": 1.6303999999999998, |
| "grad_norm": 1.285962462425232, |
| "kl": 0.0673828125, |
| "learning_rate": 3.6375e-07, |
| "loss": 0.0007, |
| "reward": 3.960462808609009, |
| "reward_std": 0.0062334975227713585, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9608500599861145, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996127486228943, |
| "step": 510 |
| }, |
| { |
| "completion_length": 262.65625, |
| "epoch": 1.6336, |
| "grad_norm": 1.124130368232727, |
| "kl": 0.0596923828125, |
| "learning_rate": 3.6249999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.941042900085449, |
| "reward_std": 0.01204587472602725, |
| "rewards/answer_entity_reward": 0.9970238208770752, |
| "rewards/answer_wer_reward": 0.9446144104003906, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994047582149506, |
| "step": 511 |
| }, |
| { |
| "completion_length": 182.28125, |
| "epoch": 1.6368, |
| "grad_norm": 1.9966425895690918, |
| "kl": 0.061279296875, |
| "learning_rate": 3.6125e-07, |
| "loss": 0.0006, |
| "reward": 3.9531023502349854, |
| "reward_std": 0.02773769712075591, |
| "rewards/answer_entity_reward": 0.9917200803756714, |
| "rewards/answer_wer_reward": 0.9697157144546509, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9916666746139526, |
| "step": 512 |
| }, |
| { |
| "completion_length": 218.125, |
| "epoch": 1.6400000000000001, |
| "grad_norm": 3.2862062454223633, |
| "kl": 0.04736328125, |
| "learning_rate": 3.6e-07, |
| "loss": 0.0005, |
| "reward": 3.858319878578186, |
| "reward_std": 0.07778534758836031, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9565341770648956, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.90625, |
| "step": 513 |
| }, |
| { |
| "completion_length": 235.03125, |
| "epoch": 1.6432, |
| "grad_norm": 1.14111328125, |
| "kl": 0.054443359375, |
| "learning_rate": 3.5875e-07, |
| "loss": 0.0005, |
| "reward": 3.967674970626831, |
| "reward_std": 0.0044005257077515125, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9691169261932373, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985580444335938, |
| "step": 514 |
| }, |
| { |
| "completion_length": 233.90625, |
| "epoch": 1.6463999999999999, |
| "grad_norm": 1.2006644010543823, |
| "kl": 0.06103515625, |
| "learning_rate": 3.5749999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.959411859512329, |
| "reward_std": 0.005820953520014882, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9596619009971619, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999750018119812, |
| "step": 515 |
| }, |
| { |
| "completion_length": 252.5625, |
| "epoch": 1.6496, |
| "grad_norm": 0.7272346615791321, |
| "kl": 0.0428466796875, |
| "learning_rate": 3.5625e-07, |
| "loss": 0.0004, |
| "reward": 3.963356375694275, |
| "reward_std": 0.0036240214249119163, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.964261919260025, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990943968296051, |
| "step": 516 |
| }, |
| { |
| "completion_length": 240.6875, |
| "epoch": 1.6528, |
| "grad_norm": 1.0241456031799316, |
| "kl": 0.0665283203125, |
| "learning_rate": 3.55e-07, |
| "loss": 0.0007, |
| "reward": 3.953768730163574, |
| "reward_std": 0.012724505737423897, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9555812776088715, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9981874525547028, |
| "step": 517 |
| }, |
| { |
| "completion_length": 221.5625, |
| "epoch": 1.6560000000000001, |
| "grad_norm": 0.9653159379959106, |
| "kl": 0.0732421875, |
| "learning_rate": 3.5375e-07, |
| "loss": 0.0007, |
| "reward": 3.928879141807556, |
| "reward_std": 0.03069964610040188, |
| "rewards/answer_entity_reward": 0.9769324958324432, |
| "rewards/answer_wer_reward": 0.9525844156742096, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993622303009033, |
| "step": 518 |
| }, |
| { |
| "completion_length": 186.53125, |
| "epoch": 1.6592, |
| "grad_norm": 1.616326928138733, |
| "kl": 0.0673828125, |
| "learning_rate": 3.5249999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.963484525680542, |
| "reward_std": 0.0024420777335762978, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9634844958782196, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 519 |
| }, |
| { |
| "completion_length": 197.53125, |
| "epoch": 1.6623999999999999, |
| "grad_norm": 1.1605949401855469, |
| "kl": 0.066162109375, |
| "learning_rate": 3.5124999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.871947407722473, |
| "reward_std": 0.008121895836666226, |
| "rewards/answer_entity_reward": 0.9832701981067657, |
| "rewards/answer_wer_reward": 0.9628296792507172, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9258474707603455, |
| "step": 520 |
| }, |
| { |
| "completion_length": 199.9375, |
| "epoch": 1.6656, |
| "grad_norm": 2.1799464225769043, |
| "kl": 0.098876953125, |
| "learning_rate": 3.5e-07, |
| "loss": 0.001, |
| "reward": 3.914597272872925, |
| "reward_std": 0.046278308145701885, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9440673291683197, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9705299139022827, |
| "step": 521 |
| }, |
| { |
| "completion_length": 213.78125, |
| "epoch": 1.6688, |
| "grad_norm": 1.8315109014511108, |
| "kl": 0.0609130859375, |
| "learning_rate": 3.4875e-07, |
| "loss": 0.0006, |
| "reward": 3.934143304824829, |
| "reward_std": 0.005300799617543817, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9627971351146698, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9713463485240936, |
| "step": 522 |
| }, |
| { |
| "completion_length": 233.21875, |
| "epoch": 1.6720000000000002, |
| "grad_norm": 2.7353854179382324, |
| "kl": 0.0634765625, |
| "learning_rate": 3.4749999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.940351963043213, |
| "reward_std": 0.012048345990478992, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9584531188011169, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9818988144397736, |
| "step": 523 |
| }, |
| { |
| "completion_length": 226.8125, |
| "epoch": 1.6752, |
| "grad_norm": 1.2798601388931274, |
| "kl": 0.0517578125, |
| "learning_rate": 3.4624999999999997e-07, |
| "loss": 0.0005, |
| "reward": 3.94057559967041, |
| "reward_std": 0.016422050073742867, |
| "rewards/answer_entity_reward": 0.9859203100204468, |
| "rewards/answer_wer_reward": 0.9546553492546082, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 524 |
| }, |
| { |
| "completion_length": 225.375, |
| "epoch": 1.6784, |
| "grad_norm": 2.434398651123047, |
| "kl": 0.0570068359375, |
| "learning_rate": 3.45e-07, |
| "loss": 0.0006, |
| "reward": 3.9358779191970825, |
| "reward_std": 0.02181497309356928, |
| "rewards/answer_entity_reward": 0.9961080551147461, |
| "rewards/answer_wer_reward": 0.9410910904407501, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986788034439087, |
| "step": 525 |
| }, |
| { |
| "completion_length": 181.21875, |
| "epoch": 1.6816, |
| "grad_norm": 1.322139859199524, |
| "kl": 0.116943359375, |
| "learning_rate": 3.4375e-07, |
| "loss": 0.0012, |
| "reward": 3.946447730064392, |
| "reward_std": 0.007033249130472541, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9464477598667145, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 526 |
| }, |
| { |
| "completion_length": 195.8125, |
| "epoch": 1.6848, |
| "grad_norm": 1.412061333656311, |
| "kl": 0.06640625, |
| "learning_rate": 3.425e-07, |
| "loss": 0.0007, |
| "reward": 3.936468005180359, |
| "reward_std": 0.00922114565037191, |
| "rewards/answer_entity_reward": 0.9841346144676208, |
| "rewards/answer_wer_reward": 0.952333390712738, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 527 |
| }, |
| { |
| "completion_length": 210.84375, |
| "epoch": 1.688, |
| "grad_norm": 3.695819139480591, |
| "kl": 0.056640625, |
| "learning_rate": 3.4124999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.894517421722412, |
| "reward_std": 0.015210594050586224, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9634661674499512, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9367331266403198, |
| "step": 528 |
| }, |
| { |
| "completion_length": 220.09375, |
| "epoch": 1.6912, |
| "grad_norm": 1.6299357414245605, |
| "kl": 0.0711669921875, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.0007, |
| "reward": 3.9391125440597534, |
| "reward_std": 0.014290765568148345, |
| "rewards/answer_entity_reward": 0.9847222566604614, |
| "rewards/answer_wer_reward": 0.954390287399292, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 529 |
| }, |
| { |
| "completion_length": 195.65625, |
| "epoch": 1.6944, |
| "grad_norm": 4.491413116455078, |
| "kl": 0.064453125, |
| "learning_rate": 3.3875e-07, |
| "loss": 0.0007, |
| "reward": 3.971281409263611, |
| "reward_std": 0.017785906326025724, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9796920418739319, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9944303929805756, |
| "step": 530 |
| }, |
| { |
| "completion_length": 208.34375, |
| "epoch": 1.6976, |
| "grad_norm": 4.832588195800781, |
| "kl": 0.0972900390625, |
| "learning_rate": 3.375e-07, |
| "loss": 0.001, |
| "reward": 3.9011433124542236, |
| "reward_std": 0.010198547039180994, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9640267491340637, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9371165633201599, |
| "step": 531 |
| }, |
| { |
| "completion_length": 203.40625, |
| "epoch": 1.7008, |
| "grad_norm": 3.4038021564483643, |
| "kl": 0.071044921875, |
| "learning_rate": 3.3624999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.9605783224105835, |
| "reward_std": 0.0076046837493777275, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9607688188552856, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9998094439506531, |
| "step": 532 |
| }, |
| { |
| "completion_length": 241.8125, |
| "epoch": 1.704, |
| "grad_norm": 1.0362496376037598, |
| "kl": 0.063232421875, |
| "learning_rate": 3.35e-07, |
| "loss": 0.0006, |
| "reward": 3.9339258670806885, |
| "reward_std": 0.018858356634154916, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9387494027614594, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996408224105835, |
| "step": 533 |
| }, |
| { |
| "completion_length": 235.90625, |
| "epoch": 1.7072, |
| "grad_norm": 3.604599714279175, |
| "kl": 0.0853271484375, |
| "learning_rate": 3.3375e-07, |
| "loss": 0.0009, |
| "reward": 3.861118197441101, |
| "reward_std": 0.011326078558340669, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9576848149299622, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9034333229064941, |
| "step": 534 |
| }, |
| { |
| "completion_length": 229.0, |
| "epoch": 1.7104, |
| "grad_norm": 2.319185256958008, |
| "kl": 0.052001953125, |
| "learning_rate": 3.325e-07, |
| "loss": 0.0005, |
| "reward": 3.9228227138519287, |
| "reward_std": 0.03856424614787102, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9560109972953796, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9753345847129822, |
| "step": 535 |
| }, |
| { |
| "completion_length": 224.71875, |
| "epoch": 1.7136, |
| "grad_norm": 2.444124460220337, |
| "kl": 0.080810546875, |
| "learning_rate": 3.3124999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.9688942432403564, |
| "reward_std": 0.003912239335477352, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9688942730426788, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 536 |
| }, |
| { |
| "completion_length": 224.3125, |
| "epoch": 1.7168, |
| "grad_norm": 6.20790958404541, |
| "kl": 0.064697265625, |
| "learning_rate": 3.3e-07, |
| "loss": 0.0006, |
| "reward": 3.8677161931991577, |
| "reward_std": 0.02981195878237486, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9471929371356964, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9233641624450684, |
| "step": 537 |
| }, |
| { |
| "completion_length": 150.34375, |
| "epoch": 1.72, |
| "grad_norm": 1.6208490133285522, |
| "kl": 0.03924560546875, |
| "learning_rate": 3.2875e-07, |
| "loss": 0.0004, |
| "reward": 3.9733328819274902, |
| "reward_std": 0.002679725643247366, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9733329117298126, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 538 |
| }, |
| { |
| "completion_length": 183.0, |
| "epoch": 1.7231999999999998, |
| "grad_norm": 1.2286797761917114, |
| "kl": 0.057861328125, |
| "learning_rate": 3.275e-07, |
| "loss": 0.0006, |
| "reward": 3.935777187347412, |
| "reward_std": 0.003249647794291377, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9795266687870026, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9562505781650543, |
| "step": 539 |
| }, |
| { |
| "completion_length": 234.625, |
| "epoch": 1.7264, |
| "grad_norm": 1.304764747619629, |
| "kl": 0.054931640625, |
| "learning_rate": 3.2624999999999995e-07, |
| "loss": 0.0005, |
| "reward": 3.950987696647644, |
| "reward_std": 0.00898568145930767, |
| "rewards/answer_entity_reward": 0.9958333373069763, |
| "rewards/answer_wer_reward": 0.9557509124279022, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994034171104431, |
| "step": 540 |
| }, |
| { |
| "completion_length": 183.96875, |
| "epoch": 1.7296, |
| "grad_norm": 1.3975461721420288, |
| "kl": 0.07421875, |
| "learning_rate": 3.25e-07, |
| "loss": 0.0007, |
| "reward": 3.918307065963745, |
| "reward_std": 0.01607332704588771, |
| "rewards/answer_entity_reward": 0.9720314145088196, |
| "rewards/answer_wer_reward": 0.9547825455665588, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9914930462837219, |
| "step": 541 |
| }, |
| { |
| "completion_length": 204.0625, |
| "epoch": 1.7328000000000001, |
| "grad_norm": 2.0030770301818848, |
| "kl": 0.070068359375, |
| "learning_rate": 3.2374999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.9624624252319336, |
| "reward_std": 0.011391833890229464, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9645456969738007, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 542 |
| }, |
| { |
| "completion_length": 236.8125, |
| "epoch": 1.736, |
| "grad_norm": 1.0529872179031372, |
| "kl": 0.06396484375, |
| "learning_rate": 3.225e-07, |
| "loss": 0.0006, |
| "reward": 3.9355998039245605, |
| "reward_std": 0.011712775565683842, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.946576714515686, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9890230894088745, |
| "step": 543 |
| }, |
| { |
| "completion_length": 171.03125, |
| "epoch": 1.7391999999999999, |
| "grad_norm": 1.4777579307556152, |
| "kl": 0.07861328125, |
| "learning_rate": 3.2124999999999994e-07, |
| "loss": 0.0008, |
| "reward": 3.959132194519043, |
| "reward_std": 0.007866068510338664, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9591321349143982, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 544 |
| }, |
| { |
| "completion_length": 199.03125, |
| "epoch": 1.7424, |
| "grad_norm": 1.5819900035858154, |
| "kl": 0.07666015625, |
| "learning_rate": 3.2e-07, |
| "loss": 0.0008, |
| "reward": 3.9456801414489746, |
| "reward_std": 0.01446144049987197, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9492515921592712, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9985119104385376, |
| "step": 545 |
| }, |
| { |
| "completion_length": 243.53125, |
| "epoch": 1.7456, |
| "grad_norm": 6.461181640625, |
| "kl": 0.1029052734375, |
| "learning_rate": 3.1874999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.9253257513046265, |
| "reward_std": 0.013943355064839125, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9411455988883972, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9980691075325012, |
| "step": 546 |
| }, |
| { |
| "completion_length": 190.21875, |
| "epoch": 1.7488000000000001, |
| "grad_norm": 1.5046278238296509, |
| "kl": 0.0430908203125, |
| "learning_rate": 3.175e-07, |
| "loss": 0.0004, |
| "reward": 3.946847081184387, |
| "reward_std": 0.006090850802138448, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9579125344753265, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9889345765113831, |
| "step": 547 |
| }, |
| { |
| "completion_length": 199.5625, |
| "epoch": 1.752, |
| "grad_norm": 2.7514781951904297, |
| "kl": 0.054931640625, |
| "learning_rate": 3.1624999999999994e-07, |
| "loss": 0.0006, |
| "reward": 3.9198288917541504, |
| "reward_std": 0.008053636411204934, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9198288321495056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 548 |
| }, |
| { |
| "completion_length": 244.625, |
| "epoch": 1.7551999999999999, |
| "grad_norm": 1.0448155403137207, |
| "kl": 0.0426025390625, |
| "learning_rate": 3.15e-07, |
| "loss": 0.0004, |
| "reward": 3.958520531654358, |
| "reward_std": 0.008235724177211523, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9585205316543579, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 549 |
| }, |
| { |
| "completion_length": 249.0, |
| "epoch": 1.7584, |
| "grad_norm": 128.38499450683594, |
| "kl": 17.28076171875, |
| "learning_rate": 3.1374999999999996e-07, |
| "loss": 0.172, |
| "reward": 3.932722330093384, |
| "reward_std": 0.012139817699790001, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9340447783470154, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998677521944046, |
| "step": 550 |
| }, |
| { |
| "completion_length": 202.25, |
| "epoch": 1.7616, |
| "grad_norm": 1.6289058923721313, |
| "kl": 0.0709228515625, |
| "learning_rate": 3.1249999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.931633234024048, |
| "reward_std": 0.015017563942819834, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9620243012905121, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9752906560897827, |
| "step": 551 |
| }, |
| { |
| "completion_length": 223.65625, |
| "epoch": 1.7648000000000001, |
| "grad_norm": 0.650069534778595, |
| "kl": 0.0467529296875, |
| "learning_rate": 3.1125000000000004e-07, |
| "loss": 0.0005, |
| "reward": 3.9622879028320312, |
| "reward_std": 0.004962240578606725, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9622879028320312, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 552 |
| }, |
| { |
| "completion_length": 238.65625, |
| "epoch": 1.768, |
| "grad_norm": 9.516084671020508, |
| "kl": 0.0474853515625, |
| "learning_rate": 3.1e-07, |
| "loss": 0.0005, |
| "reward": 3.9525749683380127, |
| "reward_std": 0.012759724631905556, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9610438644886017, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977810680866241, |
| "step": 553 |
| }, |
| { |
| "completion_length": 224.65625, |
| "epoch": 1.7711999999999999, |
| "grad_norm": 1.8886899948120117, |
| "kl": 0.044189453125, |
| "learning_rate": 3.0875e-07, |
| "loss": 0.0004, |
| "reward": 3.9586617946624756, |
| "reward_std": 0.01200480293482542, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9675752222537994, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996093809604645, |
| "step": 554 |
| }, |
| { |
| "completion_length": 220.9375, |
| "epoch": 1.7744, |
| "grad_norm": 5.122376918792725, |
| "kl": 0.048828125, |
| "learning_rate": 3.0749999999999997e-07, |
| "loss": 0.0005, |
| "reward": 3.9466060400009155, |
| "reward_std": 0.016119306907057762, |
| "rewards/answer_entity_reward": 0.9965170323848724, |
| "rewards/answer_wer_reward": 0.9567474722862244, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.993341475725174, |
| "step": 555 |
| }, |
| { |
| "completion_length": 198.8125, |
| "epoch": 1.7776, |
| "grad_norm": 4.916889667510986, |
| "kl": 0.068115234375, |
| "learning_rate": 3.0625000000000003e-07, |
| "loss": 0.0007, |
| "reward": 3.949711561203003, |
| "reward_std": 0.0163404387421906, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9576182961463928, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9920931458473206, |
| "step": 556 |
| }, |
| { |
| "completion_length": 180.875, |
| "epoch": 1.7808000000000002, |
| "grad_norm": 10.021855354309082, |
| "kl": 0.072021484375, |
| "learning_rate": 3.05e-07, |
| "loss": 0.0007, |
| "reward": 3.867478370666504, |
| "reward_std": 0.047242360189557076, |
| "rewards/answer_entity_reward": 0.9821428656578064, |
| "rewards/answer_wer_reward": 0.9576010704040527, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.927734375, |
| "step": 557 |
| }, |
| { |
| "completion_length": 227.8125, |
| "epoch": 1.784, |
| "grad_norm": 1.7502044439315796, |
| "kl": 0.04443359375, |
| "learning_rate": 3.0375e-07, |
| "loss": 0.0004, |
| "reward": 3.9525381326675415, |
| "reward_std": 0.013325697276741266, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9532942175865173, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999243974685669, |
| "step": 558 |
| }, |
| { |
| "completion_length": 204.15625, |
| "epoch": 1.7872, |
| "grad_norm": 5.304961681365967, |
| "kl": 0.0496826171875, |
| "learning_rate": 3.0249999999999996e-07, |
| "loss": 0.0005, |
| "reward": 3.957284450531006, |
| "reward_std": 0.005683758878149092, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9572845101356506, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 559 |
| }, |
| { |
| "completion_length": 228.34375, |
| "epoch": 1.7904, |
| "grad_norm": 1.2513984441757202, |
| "kl": 0.0577392578125, |
| "learning_rate": 3.0125000000000003e-07, |
| "loss": 0.0006, |
| "reward": 3.94599187374115, |
| "reward_std": 0.00800859834998846, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.957431435585022, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9885604083538055, |
| "step": 560 |
| }, |
| { |
| "completion_length": 211.03125, |
| "epoch": 1.7936, |
| "grad_norm": 5.97805118560791, |
| "kl": 0.1036376953125, |
| "learning_rate": 3e-07, |
| "loss": 0.001, |
| "reward": 3.9404828548431396, |
| "reward_std": 0.01265423372387886, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9433237612247467, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 561 |
| }, |
| { |
| "completion_length": 205.4375, |
| "epoch": 1.7968, |
| "grad_norm": 3.833575487136841, |
| "kl": 0.22998046875, |
| "learning_rate": 2.9875e-07, |
| "loss": 0.0023, |
| "reward": 3.909332752227783, |
| "reward_std": 0.007294894196093082, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9648370146751404, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9444957971572876, |
| "step": 562 |
| }, |
| { |
| "completion_length": 207.4375, |
| "epoch": 1.8, |
| "grad_norm": 0.8627040982246399, |
| "kl": 0.0611572265625, |
| "learning_rate": 2.9749999999999996e-07, |
| "loss": 0.0006, |
| "reward": 3.9548414945602417, |
| "reward_std": 0.006908831186592579, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9550975561141968, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997438490390778, |
| "step": 563 |
| }, |
| { |
| "completion_length": 198.15625, |
| "epoch": 1.8032, |
| "grad_norm": 0.9193502068519592, |
| "kl": 0.0518798828125, |
| "learning_rate": 2.9625e-07, |
| "loss": 0.0005, |
| "reward": 3.9462149143218994, |
| "reward_std": 0.007913234177976847, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9465437531471252, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996710419654846, |
| "step": 564 |
| }, |
| { |
| "completion_length": 198.15625, |
| "epoch": 1.8064, |
| "grad_norm": 1.9635776281356812, |
| "kl": 0.059814453125, |
| "learning_rate": 2.95e-07, |
| "loss": 0.0006, |
| "reward": 3.896806240081787, |
| "reward_std": 0.012922112364321947, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9503778219223022, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9464285671710968, |
| "step": 565 |
| }, |
| { |
| "completion_length": 164.90625, |
| "epoch": 1.8096, |
| "grad_norm": 1.2068322896957397, |
| "kl": 0.09375, |
| "learning_rate": 2.9375e-07, |
| "loss": 0.0009, |
| "reward": 3.8490008115768433, |
| "reward_std": 0.1467541428282857, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9502907395362854, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9328009486198425, |
| "step": 566 |
| }, |
| { |
| "completion_length": 206.34375, |
| "epoch": 1.8128, |
| "grad_norm": 2.1644375324249268, |
| "kl": 0.08251953125, |
| "learning_rate": 2.9249999999999995e-07, |
| "loss": 0.0008, |
| "reward": 3.970282793045044, |
| "reward_std": 0.0077400594018399715, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9728601574897766, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974226951599121, |
| "step": 567 |
| }, |
| { |
| "completion_length": 233.09375, |
| "epoch": 1.8159999999999998, |
| "grad_norm": 1.106130599975586, |
| "kl": 0.0552978515625, |
| "learning_rate": 2.9125e-07, |
| "loss": 0.0005, |
| "reward": 3.9414994716644287, |
| "reward_std": 0.011295767035335302, |
| "rewards/answer_entity_reward": 0.9848698973655701, |
| "rewards/answer_wer_reward": 0.9577165246009827, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998913049697876, |
| "step": 568 |
| }, |
| { |
| "completion_length": 206.46875, |
| "epoch": 1.8192, |
| "grad_norm": 1.2371478080749512, |
| "kl": 0.0599365234375, |
| "learning_rate": 2.9e-07, |
| "loss": 0.0006, |
| "reward": 3.9829952716827393, |
| "reward_std": 0.007155058206990361, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9829952716827393, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 569 |
| }, |
| { |
| "completion_length": 227.875, |
| "epoch": 1.8224, |
| "grad_norm": 0.9648468494415283, |
| "kl": 0.0587158203125, |
| "learning_rate": 2.8875e-07, |
| "loss": 0.0006, |
| "reward": 3.875002384185791, |
| "reward_std": 0.007613388821482658, |
| "rewards/answer_entity_reward": 0.9604166746139526, |
| "rewards/answer_wer_reward": 0.9299702048301697, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9846153855323792, |
| "step": 570 |
| }, |
| { |
| "completion_length": 242.1875, |
| "epoch": 1.8256000000000001, |
| "grad_norm": 3.7682442665100098, |
| "kl": 0.0732421875, |
| "learning_rate": 2.8749999999999995e-07, |
| "loss": 0.0007, |
| "reward": 3.790624737739563, |
| "reward_std": 0.14343099505640566, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9464230239391327, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.8754517436027527, |
| "step": 571 |
| }, |
| { |
| "completion_length": 248.28125, |
| "epoch": 1.8288, |
| "grad_norm": 0.7550325393676758, |
| "kl": 0.039794921875, |
| "learning_rate": 2.8625e-07, |
| "loss": 0.0004, |
| "reward": 3.9295032024383545, |
| "reward_std": 0.004920503590255976, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9295033514499664, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 572 |
| }, |
| { |
| "completion_length": 222.4375, |
| "epoch": 1.8319999999999999, |
| "grad_norm": 1.055333137512207, |
| "kl": 0.0567626953125, |
| "learning_rate": 2.8499999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.929059386253357, |
| "reward_std": 0.014613255392760038, |
| "rewards/answer_entity_reward": 0.9819711446762085, |
| "rewards/answer_wer_reward": 0.9496394395828247, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997448742389679, |
| "step": 573 |
| }, |
| { |
| "completion_length": 217.40625, |
| "epoch": 1.8352, |
| "grad_norm": 1.640468716621399, |
| "kl": 0.0443115234375, |
| "learning_rate": 2.8375e-07, |
| "loss": 0.0004, |
| "reward": 3.9705777168273926, |
| "reward_std": 0.013166352873668075, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9736025929450989, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9998161792755127, |
| "step": 574 |
| }, |
| { |
| "completion_length": 229.1875, |
| "epoch": 1.8384, |
| "grad_norm": 3.271684169769287, |
| "kl": 0.0567626953125, |
| "learning_rate": 2.8249999999999994e-07, |
| "loss": 0.0006, |
| "reward": 3.9389246702194214, |
| "reward_std": 0.007664299104362726, |
| "rewards/answer_entity_reward": 0.9833333492279053, |
| "rewards/answer_wer_reward": 0.9555914402008057, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 575 |
| }, |
| { |
| "completion_length": 203.28125, |
| "epoch": 1.8416000000000001, |
| "grad_norm": 1.6847234964370728, |
| "kl": 0.063232421875, |
| "learning_rate": 2.8125e-07, |
| "loss": 0.0006, |
| "reward": 3.9692747592926025, |
| "reward_std": 0.006263851770199835, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9701676964759827, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999107152223587, |
| "step": 576 |
| }, |
| { |
| "completion_length": 251.0625, |
| "epoch": 1.8448, |
| "grad_norm": 4.737148761749268, |
| "kl": 0.128173828125, |
| "learning_rate": 2.8e-07, |
| "loss": 0.0013, |
| "reward": 3.935584545135498, |
| "reward_std": 0.016471964307129383, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9418345093727112, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 577 |
| }, |
| { |
| "completion_length": 199.5, |
| "epoch": 1.8479999999999999, |
| "grad_norm": 1.7424699068069458, |
| "kl": 0.0618896484375, |
| "learning_rate": 2.7875e-07, |
| "loss": 0.0006, |
| "reward": 3.966155171394348, |
| "reward_std": 0.012047166470438242, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9755966663360596, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9905584752559662, |
| "step": 578 |
| }, |
| { |
| "completion_length": 192.96875, |
| "epoch": 1.8512, |
| "grad_norm": 0.8571773171424866, |
| "kl": 0.0526123046875, |
| "learning_rate": 2.775e-07, |
| "loss": 0.0005, |
| "reward": 3.977761387825012, |
| "reward_std": 0.0047087406273931265, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9777614176273346, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 579 |
| }, |
| { |
| "completion_length": 223.6875, |
| "epoch": 1.8544, |
| "grad_norm": 1.3312608003616333, |
| "kl": 0.050537109375, |
| "learning_rate": 2.7625e-07, |
| "loss": 0.0005, |
| "reward": 3.9508321285247803, |
| "reward_std": 0.00891483761370182, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9508320689201355, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 580 |
| }, |
| { |
| "completion_length": 241.96875, |
| "epoch": 1.8576000000000001, |
| "grad_norm": 4.553063869476318, |
| "kl": 0.19140625, |
| "learning_rate": 2.75e-07, |
| "loss": 0.0019, |
| "reward": 3.925418257713318, |
| "reward_std": 0.016543671488761902, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9290946125984192, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 581 |
| }, |
| { |
| "completion_length": 241.90625, |
| "epoch": 1.8608, |
| "grad_norm": 0.8970361948013306, |
| "kl": 0.065185546875, |
| "learning_rate": 2.7374999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.9467151165008545, |
| "reward_std": 0.007796656806021929, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9470826387405396, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996323585510254, |
| "step": 582 |
| }, |
| { |
| "completion_length": 246.96875, |
| "epoch": 1.8639999999999999, |
| "grad_norm": 1.9463343620300293, |
| "kl": 0.04547119140625, |
| "learning_rate": 2.725e-07, |
| "loss": 0.0005, |
| "reward": 3.940864324569702, |
| "reward_std": 0.011073273373767734, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9416800141334534, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991843402385712, |
| "step": 583 |
| }, |
| { |
| "completion_length": 206.625, |
| "epoch": 1.8672, |
| "grad_norm": 4.5208892822265625, |
| "kl": 0.092529296875, |
| "learning_rate": 2.7125e-07, |
| "loss": 0.0009, |
| "reward": 3.8930487632751465, |
| "reward_std": 0.032747200690209866, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9660382270812988, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9326923191547394, |
| "step": 584 |
| }, |
| { |
| "completion_length": 255.25, |
| "epoch": 1.8704, |
| "grad_norm": 2.1606805324554443, |
| "kl": 0.04736328125, |
| "learning_rate": 2.7e-07, |
| "loss": 0.0005, |
| "reward": 3.936957836151123, |
| "reward_std": 0.013339729979634285, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9393823444843292, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.997575432062149, |
| "step": 585 |
| }, |
| { |
| "completion_length": 226.3125, |
| "epoch": 1.8736000000000002, |
| "grad_norm": 0.7422674298286438, |
| "kl": 0.048095703125, |
| "learning_rate": 2.6874999999999997e-07, |
| "loss": 0.0005, |
| "reward": 3.9866139888763428, |
| "reward_std": 0.0038484669639728963, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.987176924943924, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994369745254517, |
| "step": 586 |
| }, |
| { |
| "completion_length": 214.59375, |
| "epoch": 1.8768, |
| "grad_norm": 1.313864827156067, |
| "kl": 0.0684814453125, |
| "learning_rate": 2.675e-07, |
| "loss": 0.0007, |
| "reward": 3.9567151069641113, |
| "reward_std": 0.012406408437527716, |
| "rewards/answer_entity_reward": 0.9832702279090881, |
| "rewards/answer_wer_reward": 0.9734448790550232, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 587 |
| }, |
| { |
| "completion_length": 256.46875, |
| "epoch": 1.88, |
| "grad_norm": 1.4952497482299805, |
| "kl": 0.1278076171875, |
| "learning_rate": 2.6625e-07, |
| "loss": 0.0013, |
| "reward": 3.8717525005340576, |
| "reward_std": 0.13869436737149954, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9397719204425812, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9660714268684387, |
| "step": 588 |
| }, |
| { |
| "completion_length": 217.09375, |
| "epoch": 1.8832, |
| "grad_norm": 1.3716284036636353, |
| "kl": 0.054931640625, |
| "learning_rate": 2.65e-07, |
| "loss": 0.0006, |
| "reward": 3.962627410888672, |
| "reward_std": 0.006240109680220485, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9626273214817047, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 589 |
| }, |
| { |
| "completion_length": 253.0, |
| "epoch": 1.8864, |
| "grad_norm": 1.4284135103225708, |
| "kl": 0.07080078125, |
| "learning_rate": 2.6374999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.9501919746398926, |
| "reward_std": 0.012296234723180532, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9531300067901611, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970619678497314, |
| "step": 590 |
| }, |
| { |
| "completion_length": 204.5, |
| "epoch": 1.8896, |
| "grad_norm": 3.8569161891937256, |
| "kl": 0.07421875, |
| "learning_rate": 2.625e-07, |
| "loss": 0.0007, |
| "reward": 3.9426995515823364, |
| "reward_std": 0.027584614232182503, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9779268503189087, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9647727012634277, |
| "step": 591 |
| }, |
| { |
| "completion_length": 229.0625, |
| "epoch": 1.8928, |
| "grad_norm": 2.589956760406494, |
| "kl": 0.08203125, |
| "learning_rate": 2.6125e-07, |
| "loss": 0.0008, |
| "reward": 3.9178069829940796, |
| "reward_std": 0.007971604820340872, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.95549076795578, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9623160660266876, |
| "step": 592 |
| }, |
| { |
| "completion_length": 170.65625, |
| "epoch": 1.896, |
| "grad_norm": 3.586792469024658, |
| "kl": 0.0423583984375, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0004, |
| "reward": 3.9206513166427612, |
| "reward_std": 0.023992381058633327, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9824000000953674, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9382513165473938, |
| "step": 593 |
| }, |
| { |
| "completion_length": 229.34375, |
| "epoch": 1.8992, |
| "grad_norm": 4.520889759063721, |
| "kl": 0.07421875, |
| "learning_rate": 2.5874999999999996e-07, |
| "loss": 0.0007, |
| "reward": 3.942514419555664, |
| "reward_std": 0.038696477888152, |
| "rewards/answer_entity_reward": 0.984275609254837, |
| "rewards/answer_wer_reward": 0.9582389295101166, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 594 |
| }, |
| { |
| "completion_length": 223.4375, |
| "epoch": 1.9024, |
| "grad_norm": 1.3104579448699951, |
| "kl": 0.0565185546875, |
| "learning_rate": 2.5749999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.976773500442505, |
| "reward_std": 0.0044562743860296905, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9767734706401825, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 595 |
| }, |
| { |
| "completion_length": 254.09375, |
| "epoch": 1.9056, |
| "grad_norm": 1.03975510597229, |
| "kl": 0.05322265625, |
| "learning_rate": 2.5625e-07, |
| "loss": 0.0005, |
| "reward": 3.943529725074768, |
| "reward_std": 0.009816794656217098, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9451378583908081, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9983919560909271, |
| "step": 596 |
| }, |
| { |
| "completion_length": 243.03125, |
| "epoch": 1.9088, |
| "grad_norm": 1.0213077068328857, |
| "kl": 0.0506591796875, |
| "learning_rate": 2.55e-07, |
| "loss": 0.0005, |
| "reward": 3.9278059005737305, |
| "reward_std": 0.00602961634285748, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9420903027057648, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996044337749481, |
| "step": 597 |
| }, |
| { |
| "completion_length": 182.46875, |
| "epoch": 1.912, |
| "grad_norm": 1.8683794736862183, |
| "kl": 0.065185546875, |
| "learning_rate": 2.5374999999999995e-07, |
| "loss": 0.0007, |
| "reward": 3.9624691009521484, |
| "reward_std": 0.012565109878778458, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9729967415332794, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9939365684986115, |
| "step": 598 |
| }, |
| { |
| "completion_length": 166.25, |
| "epoch": 1.9152, |
| "grad_norm": 1.716305136680603, |
| "kl": 0.0968017578125, |
| "learning_rate": 2.5249999999999996e-07, |
| "loss": 0.001, |
| "reward": 3.896498918533325, |
| "reward_std": 0.11676233587786555, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9749563038349152, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9527925550937653, |
| "step": 599 |
| }, |
| { |
| "completion_length": 199.59375, |
| "epoch": 1.9184, |
| "grad_norm": 1.2319942712783813, |
| "kl": 0.0775146484375, |
| "learning_rate": 2.5125e-07, |
| "loss": 0.0008, |
| "reward": 3.9489831924438477, |
| "reward_std": 0.010235858615487814, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9580873548984528, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9908958971500397, |
| "step": 600 |
| }, |
| { |
| "completion_length": 210.53125, |
| "epoch": 1.9216, |
| "grad_norm": 1.0385370254516602, |
| "kl": 0.0650634765625, |
| "learning_rate": 2.5e-07, |
| "loss": 0.0007, |
| "reward": 3.966851830482483, |
| "reward_std": 0.005628936691209674, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9668518006801605, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 601 |
| }, |
| { |
| "completion_length": 186.9375, |
| "epoch": 1.9247999999999998, |
| "grad_norm": 2.1772327423095703, |
| "kl": 0.11279296875, |
| "learning_rate": 2.4875e-07, |
| "loss": 0.0011, |
| "reward": 3.9322038888931274, |
| "reward_std": 0.01743672974407673, |
| "rewards/answer_entity_reward": 0.9880681931972504, |
| "rewards/answer_wer_reward": 0.9574334919452667, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9867021441459656, |
| "step": 602 |
| }, |
| { |
| "completion_length": 209.65625, |
| "epoch": 1.928, |
| "grad_norm": 0.9661850929260254, |
| "kl": 0.072998046875, |
| "learning_rate": 2.475e-07, |
| "loss": 0.0007, |
| "reward": 3.9598844051361084, |
| "reward_std": 0.009228286100551486, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.966718465089798, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994158744812012, |
| "step": 603 |
| }, |
| { |
| "completion_length": 193.65625, |
| "epoch": 1.9312, |
| "grad_norm": 2.6254851818084717, |
| "kl": 0.102294921875, |
| "learning_rate": 2.4624999999999997e-07, |
| "loss": 0.001, |
| "reward": 3.957027792930603, |
| "reward_std": 0.008546661585569382, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9570277333259583, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 604 |
| }, |
| { |
| "completion_length": 219.34375, |
| "epoch": 1.9344000000000001, |
| "grad_norm": 1.0413298606872559, |
| "kl": 0.104736328125, |
| "learning_rate": 2.45e-07, |
| "loss": 0.0011, |
| "reward": 3.9702824354171753, |
| "reward_std": 0.007483657216653228, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9702823162078857, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 605 |
| }, |
| { |
| "completion_length": 157.46875, |
| "epoch": 1.9376, |
| "grad_norm": 2.432849645614624, |
| "kl": 0.14453125, |
| "learning_rate": 2.4375e-07, |
| "loss": 0.0014, |
| "reward": 3.957343101501465, |
| "reward_std": 0.005332180997356772, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.957624614238739, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997184872627258, |
| "step": 606 |
| }, |
| { |
| "completion_length": 248.40625, |
| "epoch": 1.9407999999999999, |
| "grad_norm": 0.8216654062271118, |
| "kl": 0.071044921875, |
| "learning_rate": 2.425e-07, |
| "loss": 0.0007, |
| "reward": 3.9644582271575928, |
| "reward_std": 0.01216787239536643, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9688305556774139, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998031497001648, |
| "step": 607 |
| }, |
| { |
| "completion_length": 218.625, |
| "epoch": 1.944, |
| "grad_norm": 0.9195014834403992, |
| "kl": 0.0545654296875, |
| "learning_rate": 2.4124999999999997e-07, |
| "loss": 0.0005, |
| "reward": 3.972040057182312, |
| "reward_std": 0.004315207479521632, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9726911783218384, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993489682674408, |
| "step": 608 |
| }, |
| { |
| "completion_length": 231.84375, |
| "epoch": 1.9472, |
| "grad_norm": 1.3564932346343994, |
| "kl": 0.06103515625, |
| "learning_rate": 2.4e-07, |
| "loss": 0.0006, |
| "reward": 3.951057553291321, |
| "reward_std": 0.013061597011983395, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.9553851187229156, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993489682674408, |
| "step": 609 |
| }, |
| { |
| "completion_length": 241.4375, |
| "epoch": 1.9504000000000001, |
| "grad_norm": 0.9419238567352295, |
| "kl": 0.051513671875, |
| "learning_rate": 2.3875e-07, |
| "loss": 0.0005, |
| "reward": 3.971252202987671, |
| "reward_std": 0.006067809648811817, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9715149104595184, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997373819351196, |
| "step": 610 |
| }, |
| { |
| "completion_length": 222.09375, |
| "epoch": 1.9536, |
| "grad_norm": 1.4854899644851685, |
| "kl": 0.166748046875, |
| "learning_rate": 2.3749999999999998e-07, |
| "loss": 0.0017, |
| "reward": 3.9489357471466064, |
| "reward_std": 0.012118924409151077, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.948935866355896, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 611 |
| }, |
| { |
| "completion_length": 259.8125, |
| "epoch": 1.9567999999999999, |
| "grad_norm": 2.2286458015441895, |
| "kl": 0.0426025390625, |
| "learning_rate": 2.3625e-07, |
| "loss": 0.0004, |
| "reward": 3.96254563331604, |
| "reward_std": 0.005056597990915179, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9625457525253296, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 612 |
| }, |
| { |
| "completion_length": 209.4375, |
| "epoch": 1.96, |
| "grad_norm": 4.077661514282227, |
| "kl": 0.05615234375, |
| "learning_rate": 2.3499999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.941632628440857, |
| "reward_std": 0.01233140891417861, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9416325688362122, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 613 |
| }, |
| { |
| "completion_length": 221.71875, |
| "epoch": 1.9632, |
| "grad_norm": 0.7665371298789978, |
| "kl": 0.0555419921875, |
| "learning_rate": 2.3375e-07, |
| "loss": 0.0005, |
| "reward": 3.9698644876480103, |
| "reward_std": 0.009979546128306538, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.973064661026001, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996408224105835, |
| "step": 614 |
| }, |
| { |
| "completion_length": 219.875, |
| "epoch": 1.9664000000000001, |
| "grad_norm": 2.4666738510131836, |
| "kl": 0.0546875, |
| "learning_rate": 2.325e-07, |
| "loss": 0.0005, |
| "reward": 3.9548712968826294, |
| "reward_std": 0.011192699894309044, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9553521871566772, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995192289352417, |
| "step": 615 |
| }, |
| { |
| "completion_length": 235.0, |
| "epoch": 1.9696, |
| "grad_norm": 1.5382620096206665, |
| "kl": 0.044921875, |
| "learning_rate": 2.3125e-07, |
| "loss": 0.0005, |
| "reward": 3.9565550088882446, |
| "reward_std": 0.008881408954039216, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9740456640720367, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9825093150138855, |
| "step": 616 |
| }, |
| { |
| "completion_length": 141.09375, |
| "epoch": 1.9727999999999999, |
| "grad_norm": 2.0756258964538574, |
| "kl": 0.0631103515625, |
| "learning_rate": 2.3e-07, |
| "loss": 0.0006, |
| "reward": 3.9571491479873657, |
| "reward_std": 0.005044124089181423, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.980070561170578, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9770786762237549, |
| "step": 617 |
| }, |
| { |
| "completion_length": 222.46875, |
| "epoch": 1.976, |
| "grad_norm": 5.071360111236572, |
| "kl": 0.075927734375, |
| "learning_rate": 2.2875e-07, |
| "loss": 0.0008, |
| "reward": 3.8557703495025635, |
| "reward_std": 0.06493359804153442, |
| "rewards/answer_entity_reward": 0.9847027957439423, |
| "rewards/answer_wer_reward": 0.9706770181655884, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.900390625, |
| "step": 618 |
| }, |
| { |
| "completion_length": 231.125, |
| "epoch": 1.9792, |
| "grad_norm": 1.0749843120574951, |
| "kl": 0.050537109375, |
| "learning_rate": 2.275e-07, |
| "loss": 0.0005, |
| "reward": 3.9660208225250244, |
| "reward_std": 0.0037171735893934965, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9660208523273468, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 619 |
| }, |
| { |
| "completion_length": 252.625, |
| "epoch": 1.9824000000000002, |
| "grad_norm": 1.5367364883422852, |
| "kl": 0.070068359375, |
| "learning_rate": 2.2625e-07, |
| "loss": 0.0007, |
| "reward": 3.946213126182556, |
| "reward_std": 0.01816728012636304, |
| "rewards/answer_entity_reward": 0.9867424070835114, |
| "rewards/answer_wer_reward": 0.9616928696632385, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977777898311615, |
| "step": 620 |
| }, |
| { |
| "completion_length": 239.34375, |
| "epoch": 1.9856, |
| "grad_norm": 2.541694164276123, |
| "kl": 0.142578125, |
| "learning_rate": 2.25e-07, |
| "loss": 0.0014, |
| "reward": 3.947938561439514, |
| "reward_std": 0.009988004341721535, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9479385614395142, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 621 |
| }, |
| { |
| "completion_length": 224.65625, |
| "epoch": 1.9888, |
| "grad_norm": 1.3821133375167847, |
| "kl": 0.075927734375, |
| "learning_rate": 2.2375e-07, |
| "loss": 0.0007, |
| "reward": 3.953581690788269, |
| "reward_std": 0.006479294504970312, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.953581839799881, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 622 |
| }, |
| { |
| "completion_length": 206.3125, |
| "epoch": 1.992, |
| "grad_norm": 1.0023412704467773, |
| "kl": 0.13232421875, |
| "learning_rate": 2.225e-07, |
| "loss": 0.0013, |
| "reward": 3.8949310779571533, |
| "reward_std": 0.006026371265761554, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9634793996810913, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9314516186714172, |
| "step": 623 |
| }, |
| { |
| "completion_length": 179.96875, |
| "epoch": 1.9952, |
| "grad_norm": 1.534476637840271, |
| "kl": 0.078125, |
| "learning_rate": 2.2125e-07, |
| "loss": 0.0008, |
| "reward": 3.966533660888672, |
| "reward_std": 0.008991609327495098, |
| "rewards/answer_entity_reward": 0.9950658082962036, |
| "rewards/answer_wer_reward": 0.9756669104099274, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9958009123802185, |
| "step": 624 |
| }, |
| { |
| "completion_length": 232.75, |
| "epoch": 1.9984, |
| "grad_norm": 0.7324752807617188, |
| "kl": 0.0499267578125, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0005, |
| "reward": 3.946596384048462, |
| "reward_std": 0.011123172473162413, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9492979049682617, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997023940086365, |
| "step": 625 |
| }, |
| { |
| "completion_length": 176.0625, |
| "epoch": 2.0, |
| "grad_norm": 0.33141908049583435, |
| "kl": 0.06005859375, |
| "learning_rate": 2.1875e-07, |
| "loss": 0.0003, |
| "reward": 3.9717535972595215, |
| "reward_std": 0.012056672014296055, |
| "rewards/answer_entity_reward": 0.9963235259056091, |
| "rewards/answer_wer_reward": 0.975429892539978, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 626 |
| }, |
| { |
| "completion_length": 232.21875, |
| "epoch": 2.0032, |
| "grad_norm": 0.8334391117095947, |
| "kl": 0.0457763671875, |
| "learning_rate": 2.1749999999999998e-07, |
| "loss": 0.0004, |
| "reward": 3.970544457435608, |
| "reward_std": 0.003736199578270316, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9705445766448975, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 627 |
| }, |
| { |
| "completion_length": 173.375, |
| "epoch": 2.0064, |
| "grad_norm": 0.965114951133728, |
| "kl": 0.067626953125, |
| "learning_rate": 2.1625e-07, |
| "loss": 0.0007, |
| "reward": 3.974756956100464, |
| "reward_std": 0.004756669281050563, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9788074791431427, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9959495067596436, |
| "step": 628 |
| }, |
| { |
| "completion_length": 222.15625, |
| "epoch": 2.0096, |
| "grad_norm": 2.102520227432251, |
| "kl": 0.0474853515625, |
| "learning_rate": 2.1499999999999998e-07, |
| "loss": 0.0005, |
| "reward": 3.938779830932617, |
| "reward_std": 0.01813220279291272, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9791045486927032, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9596752524375916, |
| "step": 629 |
| }, |
| { |
| "completion_length": 206.40625, |
| "epoch": 2.0128, |
| "grad_norm": 1.3867822885513306, |
| "kl": 0.095458984375, |
| "learning_rate": 2.1375e-07, |
| "loss": 0.001, |
| "reward": 3.977003812789917, |
| "reward_std": 0.003467106493189931, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9772301912307739, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997735619544983, |
| "step": 630 |
| }, |
| { |
| "completion_length": 237.625, |
| "epoch": 2.016, |
| "grad_norm": 1.2721437215805054, |
| "kl": 0.0576171875, |
| "learning_rate": 2.1249999999999998e-07, |
| "loss": 0.0006, |
| "reward": 3.96044921875, |
| "reward_std": 0.007887857500463724, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9609974026679993, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999451756477356, |
| "step": 631 |
| }, |
| { |
| "completion_length": 190.65625, |
| "epoch": 2.0192, |
| "grad_norm": 1.6940927505493164, |
| "kl": 0.170166015625, |
| "learning_rate": 2.1125e-07, |
| "loss": 0.0017, |
| "reward": 3.92085862159729, |
| "reward_std": 0.012093114666640759, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9635953307151794, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9572633504867554, |
| "step": 632 |
| }, |
| { |
| "completion_length": 213.75, |
| "epoch": 2.0224, |
| "grad_norm": 1.3798060417175293, |
| "kl": 0.0552978515625, |
| "learning_rate": 2.0999999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.9467806816101074, |
| "reward_std": 0.00452708825469017, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9470699727535248, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997106492519379, |
| "step": 633 |
| }, |
| { |
| "completion_length": 193.5625, |
| "epoch": 2.0256, |
| "grad_norm": 1.5375889539718628, |
| "kl": 0.046875, |
| "learning_rate": 2.0874999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.9730241298675537, |
| "reward_std": 0.006102013634517789, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9743154048919678, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987086653709412, |
| "step": 634 |
| }, |
| { |
| "completion_length": 204.09375, |
| "epoch": 2.0288, |
| "grad_norm": 1.0933163166046143, |
| "kl": 0.09228515625, |
| "learning_rate": 2.0749999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.9593019485473633, |
| "reward_std": 0.008372287498787045, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9602685272693634, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999033510684967, |
| "step": 635 |
| }, |
| { |
| "completion_length": 186.875, |
| "epoch": 2.032, |
| "grad_norm": 3.5551085472106934, |
| "kl": 0.085205078125, |
| "learning_rate": 2.0624999999999998e-07, |
| "loss": 0.0008, |
| "reward": 3.937085270881653, |
| "reward_std": 0.028064538724720478, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9683353006839752, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9687500298023224, |
| "step": 636 |
| }, |
| { |
| "completion_length": 228.875, |
| "epoch": 2.0352, |
| "grad_norm": 0.9865986108779907, |
| "kl": 0.0728759765625, |
| "learning_rate": 2.0499999999999997e-07, |
| "loss": 0.0007, |
| "reward": 3.9492111206054688, |
| "reward_std": 0.007756081875413656, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9575444757938385, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 637 |
| }, |
| { |
| "completion_length": 212.28125, |
| "epoch": 2.0384, |
| "grad_norm": 3.542672872543335, |
| "kl": 0.110107421875, |
| "learning_rate": 2.0374999999999998e-07, |
| "loss": 0.0011, |
| "reward": 3.9374581575393677, |
| "reward_std": 0.009235690347850323, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9742424190044403, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9652990996837616, |
| "step": 638 |
| }, |
| { |
| "completion_length": 232.0, |
| "epoch": 2.0416, |
| "grad_norm": 1.4940472841262817, |
| "kl": 0.0565185546875, |
| "learning_rate": 2.025e-07, |
| "loss": 0.0006, |
| "reward": 3.947740077972412, |
| "reward_std": 0.006069941911846399, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9616289734840393, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 639 |
| }, |
| { |
| "completion_length": 214.46875, |
| "epoch": 2.0448, |
| "grad_norm": 1.0322229862213135, |
| "kl": 0.0865478515625, |
| "learning_rate": 2.0125e-07, |
| "loss": 0.0009, |
| "reward": 3.973870038986206, |
| "reward_std": 0.005974382860586047, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9738699197769165, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 640 |
| }, |
| { |
| "completion_length": 175.71875, |
| "epoch": 2.048, |
| "grad_norm": 2.1991164684295654, |
| "kl": 0.0986328125, |
| "learning_rate": 2e-07, |
| "loss": 0.001, |
| "reward": 3.9478849172592163, |
| "reward_std": 0.012253349646925926, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9485794901847839, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993055462837219, |
| "step": 641 |
| }, |
| { |
| "completion_length": 202.3125, |
| "epoch": 2.0512, |
| "grad_norm": 2.254936456680298, |
| "kl": 0.0758056640625, |
| "learning_rate": 1.9875e-07, |
| "loss": 0.0008, |
| "reward": 3.9462071657180786, |
| "reward_std": 0.007457165978848934, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9462071061134338, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 642 |
| }, |
| { |
| "completion_length": 205.03125, |
| "epoch": 2.0544, |
| "grad_norm": 2.473928928375244, |
| "kl": 0.079345703125, |
| "learning_rate": 1.975e-07, |
| "loss": 0.0008, |
| "reward": 3.92992103099823, |
| "reward_std": 0.014722079504281282, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9436539113521576, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9919487535953522, |
| "step": 643 |
| }, |
| { |
| "completion_length": 202.3125, |
| "epoch": 2.0576, |
| "grad_norm": 1.5329126119613647, |
| "kl": 0.03643798828125, |
| "learning_rate": 1.9625e-07, |
| "loss": 0.0004, |
| "reward": 3.944863796234131, |
| "reward_std": 0.006489667110145092, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9667904078960419, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9780733287334442, |
| "step": 644 |
| }, |
| { |
| "completion_length": 202.53125, |
| "epoch": 2.0608, |
| "grad_norm": 0.6484522223472595, |
| "kl": 0.04443359375, |
| "learning_rate": 1.9499999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.975989580154419, |
| "reward_std": 0.0032934267073869705, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9759896695613861, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 645 |
| }, |
| { |
| "completion_length": 248.65625, |
| "epoch": 2.064, |
| "grad_norm": 3.43375301361084, |
| "kl": 0.0609130859375, |
| "learning_rate": 1.9375e-07, |
| "loss": 0.0006, |
| "reward": 3.952019691467285, |
| "reward_std": 0.010596145410090685, |
| "rewards/answer_entity_reward": 0.9983552694320679, |
| "rewards/answer_wer_reward": 0.9558849632740021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9977796375751495, |
| "step": 646 |
| }, |
| { |
| "completion_length": 209.40625, |
| "epoch": 2.0672, |
| "grad_norm": 1.1015528440475464, |
| "kl": 0.057373046875, |
| "learning_rate": 1.9249999999999998e-07, |
| "loss": 0.0006, |
| "reward": 3.9535114765167236, |
| "reward_std": 0.0073295624461025, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9535112977027893, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 647 |
| }, |
| { |
| "completion_length": 247.15625, |
| "epoch": 2.0704, |
| "grad_norm": 5.493063449859619, |
| "kl": 0.052490234375, |
| "learning_rate": 1.9125e-07, |
| "loss": 0.0005, |
| "reward": 3.959768056869507, |
| "reward_std": 0.009880491998046637, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9597680270671844, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 648 |
| }, |
| { |
| "completion_length": 190.3125, |
| "epoch": 2.0736, |
| "grad_norm": 3.042928457260132, |
| "kl": 0.070556640625, |
| "learning_rate": 1.8999999999999998e-07, |
| "loss": 0.0007, |
| "reward": 3.935302972793579, |
| "reward_std": 0.008418679004535079, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9707636535167694, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9784283638000488, |
| "step": 649 |
| }, |
| { |
| "completion_length": 240.1875, |
| "epoch": 2.0768, |
| "grad_norm": 1.1801666021347046, |
| "kl": 0.068359375, |
| "learning_rate": 1.8875e-07, |
| "loss": 0.0007, |
| "reward": 3.944392442703247, |
| "reward_std": 0.008859490510076284, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9443924725055695, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 650 |
| }, |
| { |
| "completion_length": 212.0625, |
| "epoch": 2.08, |
| "grad_norm": 1.1967086791992188, |
| "kl": 0.072021484375, |
| "learning_rate": 1.875e-07, |
| "loss": 0.0007, |
| "reward": 3.96494197845459, |
| "reward_std": 0.011900570709258318, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9674758613109589, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9974662065505981, |
| "step": 651 |
| }, |
| { |
| "completion_length": 179.90625, |
| "epoch": 2.0832, |
| "grad_norm": 2.0556278228759766, |
| "kl": 0.056640625, |
| "learning_rate": 1.8625e-07, |
| "loss": 0.0006, |
| "reward": 3.925339102745056, |
| "reward_std": 0.005963671952486038, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9453259110450745, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9800131618976593, |
| "step": 652 |
| }, |
| { |
| "completion_length": 232.1875, |
| "epoch": 2.0864, |
| "grad_norm": 1.1875349283218384, |
| "kl": 0.076171875, |
| "learning_rate": 1.85e-07, |
| "loss": 0.0008, |
| "reward": 3.9718481302261353, |
| "reward_std": 0.01158686971757561, |
| "rewards/answer_entity_reward": 0.9955128133296967, |
| "rewards/answer_wer_reward": 0.9763352572917938, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 653 |
| }, |
| { |
| "completion_length": 222.65625, |
| "epoch": 2.0896, |
| "grad_norm": 2.1682872772216797, |
| "kl": 0.09423828125, |
| "learning_rate": 1.8375e-07, |
| "loss": 0.0009, |
| "reward": 3.94124174118042, |
| "reward_std": 0.008590340381488204, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9508572518825531, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 654 |
| }, |
| { |
| "completion_length": 173.03125, |
| "epoch": 2.0928, |
| "grad_norm": 2.1240601539611816, |
| "kl": 0.066162109375, |
| "learning_rate": 1.825e-07, |
| "loss": 0.0007, |
| "reward": 3.9930202960968018, |
| "reward_std": 0.0026576630771160126, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9934512376785278, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9995689690113068, |
| "step": 655 |
| }, |
| { |
| "completion_length": 177.09375, |
| "epoch": 2.096, |
| "grad_norm": 4.589439868927002, |
| "kl": 0.083984375, |
| "learning_rate": 1.8124999999999999e-07, |
| "loss": 0.0008, |
| "reward": 3.7905973196029663, |
| "reward_std": 0.05029802396893501, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9605589509010315, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8300382792949677, |
| "step": 656 |
| }, |
| { |
| "completion_length": 182.5, |
| "epoch": 2.0992, |
| "grad_norm": 2.9955060482025146, |
| "kl": 0.0601806640625, |
| "learning_rate": 1.8e-07, |
| "loss": 0.0006, |
| "reward": 3.959343194961548, |
| "reward_std": 0.010165283223614097, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9634606242179871, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.995882511138916, |
| "step": 657 |
| }, |
| { |
| "completion_length": 247.53125, |
| "epoch": 2.1024, |
| "grad_norm": 6.366602897644043, |
| "kl": 0.2166748046875, |
| "learning_rate": 1.7874999999999998e-07, |
| "loss": 0.0022, |
| "reward": 3.95376193523407, |
| "reward_std": 0.007726241368800402, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9633772671222687, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 658 |
| }, |
| { |
| "completion_length": 212.8125, |
| "epoch": 2.1056, |
| "grad_norm": 1.1973211765289307, |
| "kl": 0.0445556640625, |
| "learning_rate": 1.775e-07, |
| "loss": 0.0004, |
| "reward": 3.979708194732666, |
| "reward_std": 0.007615833543241024, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9800336956977844, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996744692325592, |
| "step": 659 |
| }, |
| { |
| "completion_length": 244.65625, |
| "epoch": 2.1088, |
| "grad_norm": 1.237342357635498, |
| "kl": 0.063232421875, |
| "learning_rate": 1.7624999999999998e-07, |
| "loss": 0.0006, |
| "reward": 3.9267531633377075, |
| "reward_std": 0.01262162160128355, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.937911719083786, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984567761421204, |
| "step": 660 |
| }, |
| { |
| "completion_length": 211.46875, |
| "epoch": 2.112, |
| "grad_norm": 1.6842882633209229, |
| "kl": 0.0623779296875, |
| "learning_rate": 1.75e-07, |
| "loss": 0.0006, |
| "reward": 3.9610049724578857, |
| "reward_std": 0.008832846768200397, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9619665145874023, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990384578704834, |
| "step": 661 |
| }, |
| { |
| "completion_length": 208.6875, |
| "epoch": 2.1152, |
| "grad_norm": 1.8498320579528809, |
| "kl": 0.0687255859375, |
| "learning_rate": 1.7374999999999998e-07, |
| "loss": 0.0007, |
| "reward": 3.908181667327881, |
| "reward_std": 0.05270358338020742, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9462520182132721, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9654017686843872, |
| "step": 662 |
| }, |
| { |
| "completion_length": 220.15625, |
| "epoch": 2.1184, |
| "grad_norm": 1.3248109817504883, |
| "kl": 0.0576171875, |
| "learning_rate": 1.725e-07, |
| "loss": 0.0006, |
| "reward": 3.977890729904175, |
| "reward_std": 0.0048680840991437435, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9778908789157867, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 663 |
| }, |
| { |
| "completion_length": 203.125, |
| "epoch": 2.1216, |
| "grad_norm": 1.2837951183319092, |
| "kl": 0.0660400390625, |
| "learning_rate": 1.7125e-07, |
| "loss": 0.0007, |
| "reward": 3.951757311820984, |
| "reward_std": 0.01306973909959197, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9517573118209839, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 664 |
| }, |
| { |
| "completion_length": 234.71875, |
| "epoch": 2.1248, |
| "grad_norm": 1.2517513036727905, |
| "kl": 0.072265625, |
| "learning_rate": 1.7000000000000001e-07, |
| "loss": 0.0007, |
| "reward": 3.932037830352783, |
| "reward_std": 0.018653371836990118, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9320378601551056, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 665 |
| }, |
| { |
| "completion_length": 154.4375, |
| "epoch": 2.128, |
| "grad_norm": 1.6812143325805664, |
| "kl": 0.057373046875, |
| "learning_rate": 1.6875e-07, |
| "loss": 0.0006, |
| "reward": 3.933722972869873, |
| "reward_std": 0.004374760144855827, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9603091180324554, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9817472994327545, |
| "step": 666 |
| }, |
| { |
| "completion_length": 194.375, |
| "epoch": 2.1312, |
| "grad_norm": 1.1369833946228027, |
| "kl": 0.10205078125, |
| "learning_rate": 1.675e-07, |
| "loss": 0.001, |
| "reward": 3.948467254638672, |
| "reward_std": 0.013669541105628014, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9588838517665863, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 667 |
| }, |
| { |
| "completion_length": 222.40625, |
| "epoch": 2.1344, |
| "grad_norm": 1.289441466331482, |
| "kl": 0.09716796875, |
| "learning_rate": 1.6625e-07, |
| "loss": 0.001, |
| "reward": 3.938557267189026, |
| "reward_std": 0.005478785838931799, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9577881693840027, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9807692170143127, |
| "step": 668 |
| }, |
| { |
| "completion_length": 185.71875, |
| "epoch": 2.1376, |
| "grad_norm": 1.9890272617340088, |
| "kl": 0.084716796875, |
| "learning_rate": 1.65e-07, |
| "loss": 0.0008, |
| "reward": 3.967849016189575, |
| "reward_std": 0.008760316297411919, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.967848926782608, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 669 |
| }, |
| { |
| "completion_length": 248.46875, |
| "epoch": 2.1408, |
| "grad_norm": 1.1813039779663086, |
| "kl": 0.074462890625, |
| "learning_rate": 1.6375e-07, |
| "loss": 0.0007, |
| "reward": 3.8907772302627563, |
| "reward_std": 0.07307082694023848, |
| "rewards/answer_entity_reward": 0.9749999940395355, |
| "rewards/answer_wer_reward": 0.915777176618576, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 670 |
| }, |
| { |
| "completion_length": 204.09375, |
| "epoch": 2.144, |
| "grad_norm": 1.4091624021530151, |
| "kl": 0.079833984375, |
| "learning_rate": 1.625e-07, |
| "loss": 0.0008, |
| "reward": 3.9357553720474243, |
| "reward_std": 0.018585966899991035, |
| "rewards/answer_entity_reward": 0.9924799501895905, |
| "rewards/answer_wer_reward": 0.9553823173046112, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9878930449485779, |
| "step": 671 |
| }, |
| { |
| "completion_length": 204.15625, |
| "epoch": 2.1471999999999998, |
| "grad_norm": 1.9349714517593384, |
| "kl": 0.0614013671875, |
| "learning_rate": 1.6125e-07, |
| "loss": 0.0006, |
| "reward": 3.963050127029419, |
| "reward_std": 0.011341096367686987, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9657188355922699, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997351765632629, |
| "step": 672 |
| }, |
| { |
| "completion_length": 183.1875, |
| "epoch": 2.1504, |
| "grad_norm": 3.866070508956909, |
| "kl": 0.1171875, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0012, |
| "reward": 3.778456449508667, |
| "reward_std": 0.1051805429160595, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9563734233379364, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8244869709014893, |
| "step": 673 |
| }, |
| { |
| "completion_length": 237.875, |
| "epoch": 2.1536, |
| "grad_norm": 1.3984158039093018, |
| "kl": 0.0478515625, |
| "learning_rate": 1.5875e-07, |
| "loss": 0.0005, |
| "reward": 3.9681609869003296, |
| "reward_std": 0.007229159120470285, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9706325232982635, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993667006492615, |
| "step": 674 |
| }, |
| { |
| "completion_length": 201.9375, |
| "epoch": 2.1568, |
| "grad_norm": 4.475615501403809, |
| "kl": 0.06640625, |
| "learning_rate": 1.575e-07, |
| "loss": 0.0007, |
| "reward": 3.8558905124664307, |
| "reward_std": 0.0662167351692915, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9515935778617859, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.904296875, |
| "step": 675 |
| }, |
| { |
| "completion_length": 199.59375, |
| "epoch": 2.16, |
| "grad_norm": 1.3850592374801636, |
| "kl": 0.042236328125, |
| "learning_rate": 1.5624999999999999e-07, |
| "loss": 0.0004, |
| "reward": 3.9729303121566772, |
| "reward_std": 0.01144796540029347, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9750137031078339, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 676 |
| }, |
| { |
| "completion_length": 198.8125, |
| "epoch": 2.1632, |
| "grad_norm": 0.8988875150680542, |
| "kl": 0.0848388671875, |
| "learning_rate": 1.55e-07, |
| "loss": 0.0008, |
| "reward": 3.9634130001068115, |
| "reward_std": 0.016308533609844744, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.969995379447937, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996675550937653, |
| "step": 677 |
| }, |
| { |
| "completion_length": 242.0, |
| "epoch": 2.1664, |
| "grad_norm": 0.886544406414032, |
| "kl": 0.057861328125, |
| "learning_rate": 1.5374999999999998e-07, |
| "loss": 0.0006, |
| "reward": 3.9666435718536377, |
| "reward_std": 0.009206962306052446, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9666436016559601, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 678 |
| }, |
| { |
| "completion_length": 208.09375, |
| "epoch": 2.1696, |
| "grad_norm": 1.2104874849319458, |
| "kl": 0.0665283203125, |
| "learning_rate": 1.525e-07, |
| "loss": 0.0007, |
| "reward": 3.956413745880127, |
| "reward_std": 0.008385751629248261, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9564136564731598, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 679 |
| }, |
| { |
| "completion_length": 205.65625, |
| "epoch": 2.1728, |
| "grad_norm": 1.4340012073516846, |
| "kl": 0.0653076171875, |
| "learning_rate": 1.5124999999999998e-07, |
| "loss": 0.0007, |
| "reward": 3.9660589694976807, |
| "reward_std": 0.007518206490203738, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9666839838027954, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993749856948853, |
| "step": 680 |
| }, |
| { |
| "completion_length": 243.875, |
| "epoch": 2.176, |
| "grad_norm": 2.6693804264068604, |
| "kl": 0.0611572265625, |
| "learning_rate": 1.5e-07, |
| "loss": 0.0006, |
| "reward": 3.9342352151870728, |
| "reward_std": 0.0278960638679564, |
| "rewards/answer_entity_reward": 0.9851190745830536, |
| "rewards/answer_wer_reward": 0.9509375989437103, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9981784820556641, |
| "step": 681 |
| }, |
| { |
| "completion_length": 247.875, |
| "epoch": 2.1792, |
| "grad_norm": 0.978139340877533, |
| "kl": 0.050537109375, |
| "learning_rate": 1.4874999999999998e-07, |
| "loss": 0.0005, |
| "reward": 3.9769967794418335, |
| "reward_std": 0.006702936254441738, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9769968390464783, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 682 |
| }, |
| { |
| "completion_length": 222.5625, |
| "epoch": 2.1824, |
| "grad_norm": 1.382318139076233, |
| "kl": 0.065185546875, |
| "learning_rate": 1.475e-07, |
| "loss": 0.0007, |
| "reward": 3.9492597579956055, |
| "reward_std": 0.008544785436242819, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9507622122764587, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984976053237915, |
| "step": 683 |
| }, |
| { |
| "completion_length": 219.71875, |
| "epoch": 2.1856, |
| "grad_norm": 2.196531057357788, |
| "kl": 0.0595703125, |
| "learning_rate": 1.4624999999999998e-07, |
| "loss": 0.0006, |
| "reward": 3.9446985721588135, |
| "reward_std": 0.014558171853423119, |
| "rewards/answer_entity_reward": 0.9813033938407898, |
| "rewards/answer_wer_reward": 0.9633950591087341, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 684 |
| }, |
| { |
| "completion_length": 219.5, |
| "epoch": 2.1888, |
| "grad_norm": 1.4868621826171875, |
| "kl": 0.07177734375, |
| "learning_rate": 1.45e-07, |
| "loss": 0.0007, |
| "reward": 3.9446860551834106, |
| "reward_std": 0.010166772175580263, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9451901018619537, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994959831237793, |
| "step": 685 |
| }, |
| { |
| "completion_length": 261.59375, |
| "epoch": 2.192, |
| "grad_norm": 0.8591821789741516, |
| "kl": 0.0595703125, |
| "learning_rate": 1.4374999999999997e-07, |
| "loss": 0.0006, |
| "reward": 3.9277877807617188, |
| "reward_std": 0.010211648885160685, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.930150032043457, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9976378083229065, |
| "step": 686 |
| }, |
| { |
| "completion_length": 205.0625, |
| "epoch": 2.1952, |
| "grad_norm": 0.924826443195343, |
| "kl": 0.0703125, |
| "learning_rate": 1.4249999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.9727468490600586, |
| "reward_std": 0.006501165917143226, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.972746878862381, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 687 |
| }, |
| { |
| "completion_length": 197.625, |
| "epoch": 2.1984, |
| "grad_norm": 1.508520483970642, |
| "kl": 0.092041015625, |
| "learning_rate": 1.4124999999999997e-07, |
| "loss": 0.0009, |
| "reward": 3.9627835750579834, |
| "reward_std": 0.010947544127702713, |
| "rewards/answer_entity_reward": 0.9930555522441864, |
| "rewards/answer_wer_reward": 0.9707047045230865, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9990234375, |
| "step": 688 |
| }, |
| { |
| "completion_length": 205.09375, |
| "epoch": 2.2016, |
| "grad_norm": 2.3478713035583496, |
| "kl": 0.0712890625, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0007, |
| "reward": 3.933359384536743, |
| "reward_std": 0.008363787084817886, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9626152515411377, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9846329689025879, |
| "step": 689 |
| }, |
| { |
| "completion_length": 225.03125, |
| "epoch": 2.2048, |
| "grad_norm": 1.3916107416152954, |
| "kl": 0.058837890625, |
| "learning_rate": 1.3875e-07, |
| "loss": 0.0006, |
| "reward": 3.9732636213302612, |
| "reward_std": 0.009609260130673647, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9732636511325836, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 690 |
| }, |
| { |
| "completion_length": 152.59375, |
| "epoch": 2.208, |
| "grad_norm": 1.322786808013916, |
| "kl": 0.0557861328125, |
| "learning_rate": 1.375e-07, |
| "loss": 0.0006, |
| "reward": 3.8575568199157715, |
| "reward_std": 0.011282142717391253, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9596264958381653, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9003343284130096, |
| "step": 691 |
| }, |
| { |
| "completion_length": 162.71875, |
| "epoch": 2.2112, |
| "grad_norm": 0.7846171855926514, |
| "kl": 0.0657958984375, |
| "learning_rate": 1.3625e-07, |
| "loss": 0.0007, |
| "reward": 3.9684951305389404, |
| "reward_std": 0.013251218944787979, |
| "rewards/answer_entity_reward": 0.9910714328289032, |
| "rewards/answer_wer_reward": 0.9774238169193268, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 692 |
| }, |
| { |
| "completion_length": 207.65625, |
| "epoch": 2.2144, |
| "grad_norm": 1.7230638265609741, |
| "kl": 0.1243896484375, |
| "learning_rate": 1.35e-07, |
| "loss": 0.0012, |
| "reward": 3.9475139379501343, |
| "reward_std": 0.00949817756190896, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9486435055732727, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998870462179184, |
| "step": 693 |
| }, |
| { |
| "completion_length": 245.96875, |
| "epoch": 2.2176, |
| "grad_norm": 1.5247471332550049, |
| "kl": 0.061767578125, |
| "learning_rate": 1.3375e-07, |
| "loss": 0.0006, |
| "reward": 3.947926878929138, |
| "reward_std": 0.014066703617572784, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9513991177082062, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 694 |
| }, |
| { |
| "completion_length": 222.5625, |
| "epoch": 2.2208, |
| "grad_norm": 1.5721601247787476, |
| "kl": 0.0782470703125, |
| "learning_rate": 1.325e-07, |
| "loss": 0.0008, |
| "reward": 3.903387188911438, |
| "reward_std": 0.005873196758329868, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9640650153160095, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9393221139907837, |
| "step": 695 |
| }, |
| { |
| "completion_length": 187.03125, |
| "epoch": 2.224, |
| "grad_norm": 1.1470870971679688, |
| "kl": 0.0457763671875, |
| "learning_rate": 1.3125e-07, |
| "loss": 0.0005, |
| "reward": 3.9857735633850098, |
| "reward_std": 0.003898413386195898, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9857736229896545, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 696 |
| }, |
| { |
| "completion_length": 202.84375, |
| "epoch": 2.2272, |
| "grad_norm": 2.00569486618042, |
| "kl": 0.077392578125, |
| "learning_rate": 1.3e-07, |
| "loss": 0.0008, |
| "reward": 3.9439765214920044, |
| "reward_std": 0.00677294097840786, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9666953980922699, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9772810935974121, |
| "step": 697 |
| }, |
| { |
| "completion_length": 200.875, |
| "epoch": 2.2304, |
| "grad_norm": 0.5203324556350708, |
| "kl": 0.0533447265625, |
| "learning_rate": 1.2874999999999998e-07, |
| "loss": 0.0005, |
| "reward": 3.981989622116089, |
| "reward_std": 0.003249130444601178, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9819895327091217, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 698 |
| }, |
| { |
| "completion_length": 229.5, |
| "epoch": 2.2336, |
| "grad_norm": 1.028457760810852, |
| "kl": 0.0615234375, |
| "learning_rate": 1.275e-07, |
| "loss": 0.0006, |
| "reward": 3.9699747562408447, |
| "reward_std": 0.007223621942102909, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9699748456478119, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 699 |
| }, |
| { |
| "completion_length": 183.375, |
| "epoch": 2.2368, |
| "grad_norm": 1.1010169982910156, |
| "kl": 0.09619140625, |
| "learning_rate": 1.2624999999999998e-07, |
| "loss": 0.001, |
| "reward": 3.9709969758987427, |
| "reward_std": 0.013876417418941855, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9763848185539246, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.994612067937851, |
| "step": 700 |
| }, |
| { |
| "completion_length": 192.6875, |
| "epoch": 2.24, |
| "grad_norm": 1.9254510402679443, |
| "kl": 0.126708984375, |
| "learning_rate": 1.25e-07, |
| "loss": 0.0013, |
| "reward": 3.9508676528930664, |
| "reward_std": 0.007698251400142908, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9661648571491241, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9943181872367859, |
| "step": 701 |
| }, |
| { |
| "completion_length": 206.6875, |
| "epoch": 2.2432, |
| "grad_norm": 4.035684108734131, |
| "kl": 0.04833984375, |
| "learning_rate": 1.2375e-07, |
| "loss": 0.0005, |
| "reward": 3.9621732234954834, |
| "reward_std": 0.007325239945203066, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.977934330701828, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9842387735843658, |
| "step": 702 |
| }, |
| { |
| "completion_length": 240.78125, |
| "epoch": 2.2464, |
| "grad_norm": 1.4605140686035156, |
| "kl": 0.0582275390625, |
| "learning_rate": 1.225e-07, |
| "loss": 0.0006, |
| "reward": 3.951379179954529, |
| "reward_std": 0.005893495166674256, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9543100893497467, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970689713954926, |
| "step": 703 |
| }, |
| { |
| "completion_length": 190.84375, |
| "epoch": 2.2496, |
| "grad_norm": 0.8877372741699219, |
| "kl": 0.064453125, |
| "learning_rate": 1.2125e-07, |
| "loss": 0.0007, |
| "reward": 3.9827821254730225, |
| "reward_std": 0.003501511411741376, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.983114629983902, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996675550937653, |
| "step": 704 |
| }, |
| { |
| "completion_length": 169.875, |
| "epoch": 2.2528, |
| "grad_norm": 4.669096946716309, |
| "kl": 0.0634765625, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0006, |
| "reward": 3.9501044750213623, |
| "reward_std": 0.00536915916018188, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9683522582054138, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9817522466182709, |
| "step": 705 |
| }, |
| { |
| "completion_length": 208.34375, |
| "epoch": 2.2560000000000002, |
| "grad_norm": 2.4436697959899902, |
| "kl": 0.072998046875, |
| "learning_rate": 1.1874999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.95159912109375, |
| "reward_std": 0.012246299302205443, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9697677791118622, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9818313717842102, |
| "step": 706 |
| }, |
| { |
| "completion_length": 253.34375, |
| "epoch": 2.2592, |
| "grad_norm": 0.6258556842803955, |
| "kl": 0.0625, |
| "learning_rate": 1.1749999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.943672776222229, |
| "reward_std": 0.004726027720607817, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9436727464199066, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 707 |
| }, |
| { |
| "completion_length": 187.96875, |
| "epoch": 2.2624, |
| "grad_norm": 2.1608188152313232, |
| "kl": 0.09521484375, |
| "learning_rate": 1.1625e-07, |
| "loss": 0.0009, |
| "reward": 3.9321788549423218, |
| "reward_std": 0.021823766641318798, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9405494034290314, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99609375, |
| "step": 708 |
| }, |
| { |
| "completion_length": 201.0625, |
| "epoch": 2.2656, |
| "grad_norm": 5.012310028076172, |
| "kl": 0.04071044921875, |
| "learning_rate": 1.15e-07, |
| "loss": 0.0004, |
| "reward": 3.9624879360198975, |
| "reward_std": 0.01549163879826665, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9760953187942505, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.989864856004715, |
| "step": 709 |
| }, |
| { |
| "completion_length": 238.71875, |
| "epoch": 2.2688, |
| "grad_norm": 1.1021510362625122, |
| "kl": 0.08154296875, |
| "learning_rate": 1.1375e-07, |
| "loss": 0.0008, |
| "reward": 3.9332664012908936, |
| "reward_std": 0.015113649424165487, |
| "rewards/answer_entity_reward": 0.9832701981067657, |
| "rewards/answer_wer_reward": 0.9499962031841278, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 710 |
| }, |
| { |
| "completion_length": 220.90625, |
| "epoch": 2.2720000000000002, |
| "grad_norm": 1.1716574430465698, |
| "kl": 0.053466796875, |
| "learning_rate": 1.125e-07, |
| "loss": 0.0005, |
| "reward": 3.9751139879226685, |
| "reward_std": 0.007001735270023346, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9751139879226685, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 711 |
| }, |
| { |
| "completion_length": 241.40625, |
| "epoch": 2.2752, |
| "grad_norm": 1.469359278678894, |
| "kl": 0.07275390625, |
| "learning_rate": 1.1125e-07, |
| "loss": 0.0007, |
| "reward": 3.898247718811035, |
| "reward_std": 0.039173625875264406, |
| "rewards/answer_entity_reward": 0.984375, |
| "rewards/answer_wer_reward": 0.9162905812263489, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9975819885730743, |
| "step": 712 |
| }, |
| { |
| "completion_length": 205.25, |
| "epoch": 2.2784, |
| "grad_norm": 0.7749589085578918, |
| "kl": 0.0621337890625, |
| "learning_rate": 1.0999999999999999e-07, |
| "loss": 0.0006, |
| "reward": 3.9739962816238403, |
| "reward_std": 0.0056007420644164085, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9743727445602417, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996234774589539, |
| "step": 713 |
| }, |
| { |
| "completion_length": 206.125, |
| "epoch": 2.2816, |
| "grad_norm": 0.5464848875999451, |
| "kl": 0.04901123046875, |
| "learning_rate": 1.0874999999999999e-07, |
| "loss": 0.0005, |
| "reward": 3.95177161693573, |
| "reward_std": 0.004434725036844611, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9615707993507385, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9998161792755127, |
| "step": 714 |
| }, |
| { |
| "completion_length": 169.09375, |
| "epoch": 2.2848, |
| "grad_norm": 3.133605480194092, |
| "kl": 0.06689453125, |
| "learning_rate": 1.0749999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.929832339286804, |
| "reward_std": 0.01732827629894018, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9689165651798248, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.960915744304657, |
| "step": 715 |
| }, |
| { |
| "completion_length": 204.59375, |
| "epoch": 2.288, |
| "grad_norm": 0.7156680822372437, |
| "kl": 0.06884765625, |
| "learning_rate": 1.0624999999999999e-07, |
| "loss": 0.0007, |
| "reward": 3.976062059402466, |
| "reward_std": 0.0025083101354539394, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9836839437484741, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9923780560493469, |
| "step": 716 |
| }, |
| { |
| "completion_length": 210.59375, |
| "epoch": 2.2912, |
| "grad_norm": 284.2210998535156, |
| "kl": 0.1416015625, |
| "learning_rate": 1.0499999999999999e-07, |
| "loss": 0.0014, |
| "reward": 3.9028064012527466, |
| "reward_std": 0.016830324195325375, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9614686369895935, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9413377344608307, |
| "step": 717 |
| }, |
| { |
| "completion_length": 232.53125, |
| "epoch": 2.2944, |
| "grad_norm": 1.077739953994751, |
| "kl": 0.08544921875, |
| "learning_rate": 1.0374999999999999e-07, |
| "loss": 0.0009, |
| "reward": 3.9475821256637573, |
| "reward_std": 0.011592368595302105, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9478915929794312, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999690592288971, |
| "step": 718 |
| }, |
| { |
| "completion_length": 217.875, |
| "epoch": 2.2976, |
| "grad_norm": 2.2114531993865967, |
| "kl": 0.195068359375, |
| "learning_rate": 1.0249999999999998e-07, |
| "loss": 0.002, |
| "reward": 3.941352367401123, |
| "reward_std": 0.00652403780259192, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9605833292007446, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9807692170143127, |
| "step": 719 |
| }, |
| { |
| "completion_length": 241.875, |
| "epoch": 2.3008, |
| "grad_norm": 2.330026865005493, |
| "kl": 0.10693359375, |
| "learning_rate": 1.0125e-07, |
| "loss": 0.0011, |
| "reward": 3.8385108709335327, |
| "reward_std": 0.0217201872728765, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9317739605903625, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9067369103431702, |
| "step": 720 |
| }, |
| { |
| "completion_length": 148.15625, |
| "epoch": 2.304, |
| "grad_norm": 6.020991802215576, |
| "kl": 0.0804443359375, |
| "learning_rate": 1e-07, |
| "loss": 0.0008, |
| "reward": 3.9653271436691284, |
| "reward_std": 0.010471278452314436, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9677309989929199, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 721 |
| }, |
| { |
| "completion_length": 242.34375, |
| "epoch": 2.3072, |
| "grad_norm": 1.3827441930770874, |
| "kl": 0.0606689453125, |
| "learning_rate": 9.875e-08, |
| "loss": 0.0006, |
| "reward": 3.9477760791778564, |
| "reward_std": 0.017027822323143482, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9524165093898773, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988317787647247, |
| "step": 722 |
| }, |
| { |
| "completion_length": 182.875, |
| "epoch": 2.3104, |
| "grad_norm": 0.6132823824882507, |
| "kl": 0.0732421875, |
| "learning_rate": 9.749999999999999e-08, |
| "loss": 0.0007, |
| "reward": 3.9824774265289307, |
| "reward_std": 0.0017756590968929231, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9837089478969574, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9987684786319733, |
| "step": 723 |
| }, |
| { |
| "completion_length": 259.21875, |
| "epoch": 2.3136, |
| "grad_norm": 1.0919182300567627, |
| "kl": 0.052001953125, |
| "learning_rate": 9.624999999999999e-08, |
| "loss": 0.0005, |
| "reward": 3.9247913360595703, |
| "reward_std": 0.0157609935849905, |
| "rewards/answer_entity_reward": 0.9692307412624359, |
| "rewards/answer_wer_reward": 0.9555604159832001, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 724 |
| }, |
| { |
| "completion_length": 243.21875, |
| "epoch": 2.3168, |
| "grad_norm": 1.7886172533035278, |
| "kl": 0.04718017578125, |
| "learning_rate": 9.499999999999999e-08, |
| "loss": 0.0005, |
| "reward": 3.9662917852401733, |
| "reward_std": 0.005910404259338975, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9665379524230957, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997539520263672, |
| "step": 725 |
| }, |
| { |
| "completion_length": 201.65625, |
| "epoch": 2.32, |
| "grad_norm": 1.3444185256958008, |
| "kl": 0.0606689453125, |
| "learning_rate": 9.375e-08, |
| "loss": 0.0006, |
| "reward": 3.9709818363189697, |
| "reward_std": 0.00892023229971528, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9738226532936096, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 726 |
| }, |
| { |
| "completion_length": 224.1875, |
| "epoch": 2.3232, |
| "grad_norm": 4.107091426849365, |
| "kl": 0.229736328125, |
| "learning_rate": 9.25e-08, |
| "loss": 0.0023, |
| "reward": 3.9483840465545654, |
| "reward_std": 0.013201091904193163, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.955822080373764, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997735619544983, |
| "step": 727 |
| }, |
| { |
| "completion_length": 189.75, |
| "epoch": 2.3264, |
| "grad_norm": 1.512626051902771, |
| "kl": 0.0589599609375, |
| "learning_rate": 9.125e-08, |
| "loss": 0.0006, |
| "reward": 3.9542768001556396, |
| "reward_std": 0.008582692593336105, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9702657759189606, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.984011173248291, |
| "step": 728 |
| }, |
| { |
| "completion_length": 172.8125, |
| "epoch": 2.3296, |
| "grad_norm": 4.1475830078125, |
| "kl": 0.110107421875, |
| "learning_rate": 9e-08, |
| "loss": 0.0011, |
| "reward": 3.9462348222732544, |
| "reward_std": 0.009323009755462408, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9772224724292755, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9690123498439789, |
| "step": 729 |
| }, |
| { |
| "completion_length": 198.84375, |
| "epoch": 2.3327999999999998, |
| "grad_norm": 1.3541475534439087, |
| "kl": 0.045166015625, |
| "learning_rate": 8.875e-08, |
| "loss": 0.0005, |
| "reward": 3.9697635173797607, |
| "reward_std": 0.00771446293219924, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9707715511322021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989919364452362, |
| "step": 730 |
| }, |
| { |
| "completion_length": 217.71875, |
| "epoch": 2.336, |
| "grad_norm": 1.2064177989959717, |
| "kl": 0.05908203125, |
| "learning_rate": 8.75e-08, |
| "loss": 0.0006, |
| "reward": 3.9431110620498657, |
| "reward_std": 0.01243708049878478, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9462102055549622, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9969007968902588, |
| "step": 731 |
| }, |
| { |
| "completion_length": 208.0, |
| "epoch": 2.3392, |
| "grad_norm": 1.1856428384780884, |
| "kl": 0.048095703125, |
| "learning_rate": 8.625e-08, |
| "loss": 0.0005, |
| "reward": 3.955425500869751, |
| "reward_std": 0.013023892883211374, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9704216420650482, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9850037693977356, |
| "step": 732 |
| }, |
| { |
| "completion_length": 230.46875, |
| "epoch": 2.3424, |
| "grad_norm": 7.96836519241333, |
| "kl": 0.0765380859375, |
| "learning_rate": 8.500000000000001e-08, |
| "loss": 0.0008, |
| "reward": 3.8350234031677246, |
| "reward_std": 0.0071187918074429035, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9640994668006897, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.8709239065647125, |
| "step": 733 |
| }, |
| { |
| "completion_length": 240.46875, |
| "epoch": 2.3456, |
| "grad_norm": 1.9817602634429932, |
| "kl": 0.067138671875, |
| "learning_rate": 8.375e-08, |
| "loss": 0.0007, |
| "reward": 3.8598886728286743, |
| "reward_std": 0.009870891459286213, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9313421249389648, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9285465180873871, |
| "step": 734 |
| }, |
| { |
| "completion_length": 232.625, |
| "epoch": 2.3487999999999998, |
| "grad_norm": 1.4039250612258911, |
| "kl": 0.05126953125, |
| "learning_rate": 8.25e-08, |
| "loss": 0.0005, |
| "reward": 3.9484113454818726, |
| "reward_std": 0.011133690131828189, |
| "rewards/answer_entity_reward": 0.9971590936183929, |
| "rewards/answer_wer_reward": 0.9577626585960388, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9934895932674408, |
| "step": 735 |
| }, |
| { |
| "completion_length": 168.15625, |
| "epoch": 2.352, |
| "grad_norm": 0.8416581153869629, |
| "kl": 0.068359375, |
| "learning_rate": 8.125e-08, |
| "loss": 0.0007, |
| "reward": 3.9322515726089478, |
| "reward_std": 0.002792949788272381, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9946084916591644, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9376430511474609, |
| "step": 736 |
| }, |
| { |
| "completion_length": 232.375, |
| "epoch": 2.3552, |
| "grad_norm": 1.3709439039230347, |
| "kl": 0.068359375, |
| "learning_rate": 8e-08, |
| "loss": 0.0007, |
| "reward": 3.9093856811523438, |
| "reward_std": 0.0034298759419471025, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.971885621547699, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9375, |
| "step": 737 |
| }, |
| { |
| "completion_length": 201.15625, |
| "epoch": 2.3584, |
| "grad_norm": 0.9587724804878235, |
| "kl": 0.0657958984375, |
| "learning_rate": 7.875e-08, |
| "loss": 0.0007, |
| "reward": 3.960189461708069, |
| "reward_std": 0.017379604279994965, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9656778275966644, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9979838728904724, |
| "step": 738 |
| }, |
| { |
| "completion_length": 204.3125, |
| "epoch": 2.3616, |
| "grad_norm": 1.5729237794876099, |
| "kl": 0.075439453125, |
| "learning_rate": 7.75e-08, |
| "loss": 0.0007, |
| "reward": 3.9626389741897583, |
| "reward_std": 0.01823890022933483, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9661112725734711, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 739 |
| }, |
| { |
| "completion_length": 239.875, |
| "epoch": 2.3648, |
| "grad_norm": 0.9296643733978271, |
| "kl": 0.064208984375, |
| "learning_rate": 7.625e-08, |
| "loss": 0.0006, |
| "reward": 3.968054413795471, |
| "reward_std": 0.0051011774921789765, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9680543541908264, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 740 |
| }, |
| { |
| "completion_length": 242.71875, |
| "epoch": 2.368, |
| "grad_norm": 0.9536841511726379, |
| "kl": 0.0606689453125, |
| "learning_rate": 7.5e-08, |
| "loss": 0.0006, |
| "reward": 3.9280422925949097, |
| "reward_std": 0.005676981760188937, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9430340826511383, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988970458507538, |
| "step": 741 |
| }, |
| { |
| "completion_length": 239.59375, |
| "epoch": 2.3712, |
| "grad_norm": 1.1191787719726562, |
| "kl": 0.0565185546875, |
| "learning_rate": 7.375e-08, |
| "loss": 0.0006, |
| "reward": 3.9627801179885864, |
| "reward_std": 0.004723543883301318, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9627801775932312, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 742 |
| }, |
| { |
| "completion_length": 198.125, |
| "epoch": 2.3744, |
| "grad_norm": 19.45572280883789, |
| "kl": 0.0677490234375, |
| "learning_rate": 7.25e-08, |
| "loss": 0.0007, |
| "reward": 3.8835959434509277, |
| "reward_std": 0.0259452061727643, |
| "rewards/answer_entity_reward": 0.9975961446762085, |
| "rewards/answer_wer_reward": 0.9579322040081024, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9280676245689392, |
| "step": 743 |
| }, |
| { |
| "completion_length": 176.875, |
| "epoch": 2.3776, |
| "grad_norm": 2.2377281188964844, |
| "kl": 0.090087890625, |
| "learning_rate": 7.124999999999999e-08, |
| "loss": 0.0009, |
| "reward": 3.9422539472579956, |
| "reward_std": 0.039653101935982704, |
| "rewards/answer_entity_reward": 0.9895833134651184, |
| "rewards/answer_wer_reward": 0.9664814472198486, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.986189216375351, |
| "step": 744 |
| }, |
| { |
| "completion_length": 229.1875, |
| "epoch": 2.3808, |
| "grad_norm": 1.561314344406128, |
| "kl": 0.0491943359375, |
| "learning_rate": 7e-08, |
| "loss": 0.0005, |
| "reward": 3.8669506311416626, |
| "reward_std": 0.19146580225788057, |
| "rewards/answer_entity_reward": 0.96875, |
| "rewards/answer_wer_reward": 0.9294506311416626, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 745 |
| }, |
| { |
| "completion_length": 192.875, |
| "epoch": 2.384, |
| "grad_norm": 1.9305033683776855, |
| "kl": 0.078857421875, |
| "learning_rate": 6.875e-08, |
| "loss": 0.0008, |
| "reward": 3.944983959197998, |
| "reward_std": 0.012190061155706644, |
| "rewards/answer_entity_reward": 0.9979166686534882, |
| "rewards/answer_wer_reward": 0.9473004341125488, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997667968273163, |
| "step": 746 |
| }, |
| { |
| "completion_length": 214.75, |
| "epoch": 2.3872, |
| "grad_norm": 13.16278076171875, |
| "kl": 0.0552978515625, |
| "learning_rate": 6.75e-08, |
| "loss": 0.0006, |
| "reward": 3.981534004211426, |
| "reward_std": 0.016841471777297556, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.989596426486969, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.991937518119812, |
| "step": 747 |
| }, |
| { |
| "completion_length": 202.65625, |
| "epoch": 2.3904, |
| "grad_norm": 1.269473671913147, |
| "kl": 0.0595703125, |
| "learning_rate": 6.625e-08, |
| "loss": 0.0006, |
| "reward": 3.9539172649383545, |
| "reward_std": 0.006352424388751388, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9545792937278748, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993380010128021, |
| "step": 748 |
| }, |
| { |
| "completion_length": 241.9375, |
| "epoch": 2.3936, |
| "grad_norm": 0.799062192440033, |
| "kl": 0.08447265625, |
| "learning_rate": 6.5e-08, |
| "loss": 0.0008, |
| "reward": 3.968814492225647, |
| "reward_std": 0.0058513006661087275, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9696769118309021, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9991375803947449, |
| "step": 749 |
| }, |
| { |
| "completion_length": 175.90625, |
| "epoch": 2.3968, |
| "grad_norm": 1.7988041639328003, |
| "kl": 0.06201171875, |
| "learning_rate": 6.375e-08, |
| "loss": 0.0006, |
| "reward": 3.9838857650756836, |
| "reward_std": 0.0046576057793572545, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9841121137142181, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997735619544983, |
| "step": 750 |
| }, |
| { |
| "completion_length": 215.59375, |
| "epoch": 2.4, |
| "grad_norm": 2.852858781814575, |
| "kl": 0.0533447265625, |
| "learning_rate": 6.25e-08, |
| "loss": 0.0005, |
| "reward": 3.943244457244873, |
| "reward_std": 0.03492546791676432, |
| "rewards/answer_entity_reward": 0.9943181872367859, |
| "rewards/answer_wer_reward": 0.9835853576660156, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9653409123420715, |
| "step": 751 |
| }, |
| { |
| "completion_length": 238.5, |
| "epoch": 2.4032, |
| "grad_norm": 12.164900779724121, |
| "kl": 0.0615234375, |
| "learning_rate": 6.125e-08, |
| "loss": 0.0006, |
| "reward": 3.9755419492721558, |
| "reward_std": 0.010625506052747369, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9782145917415619, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9973272979259491, |
| "step": 752 |
| }, |
| { |
| "completion_length": 179.15625, |
| "epoch": 2.4064, |
| "grad_norm": 0.9550566077232361, |
| "kl": 0.0693359375, |
| "learning_rate": 6e-08, |
| "loss": 0.0007, |
| "reward": 3.954240560531616, |
| "reward_std": 0.011055386741645634, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9720976054668427, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9821428656578064, |
| "step": 753 |
| }, |
| { |
| "completion_length": 216.0625, |
| "epoch": 2.4096, |
| "grad_norm": 1.3647923469543457, |
| "kl": 0.0582275390625, |
| "learning_rate": 5.8749999999999993e-08, |
| "loss": 0.0006, |
| "reward": 3.962032198905945, |
| "reward_std": 0.008129856083542109, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9623997509479523, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996323585510254, |
| "step": 754 |
| }, |
| { |
| "completion_length": 221.8125, |
| "epoch": 2.4128, |
| "grad_norm": 1.9497917890548706, |
| "kl": 0.0604248046875, |
| "learning_rate": 5.75e-08, |
| "loss": 0.0006, |
| "reward": 3.9653851985931396, |
| "reward_std": 0.02012356440536678, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9722139835357666, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9994212985038757, |
| "step": 755 |
| }, |
| { |
| "completion_length": 198.4375, |
| "epoch": 2.416, |
| "grad_norm": 0.6684221029281616, |
| "kl": 0.07568359375, |
| "learning_rate": 5.625e-08, |
| "loss": 0.0008, |
| "reward": 3.942944049835205, |
| "reward_std": 0.008921493077650666, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9674927294254303, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9754513502120972, |
| "step": 756 |
| }, |
| { |
| "completion_length": 234.875, |
| "epoch": 2.4192, |
| "grad_norm": 1.097367525100708, |
| "kl": 0.1142578125, |
| "learning_rate": 5.4999999999999996e-08, |
| "loss": 0.0011, |
| "reward": 3.9485758543014526, |
| "reward_std": 0.01669642748311162, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9544399976730347, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9986002445220947, |
| "step": 757 |
| }, |
| { |
| "completion_length": 150.46875, |
| "epoch": 2.4224, |
| "grad_norm": 0.21660760045051575, |
| "kl": 0.0321044921875, |
| "learning_rate": 5.3749999999999995e-08, |
| "loss": 0.0003, |
| "reward": 3.978167176246643, |
| "reward_std": 0.0010678768157958984, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9781671762466431, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 758 |
| }, |
| { |
| "completion_length": 231.25, |
| "epoch": 2.4256, |
| "grad_norm": 3.330300807952881, |
| "kl": 0.078857421875, |
| "learning_rate": 5.2499999999999994e-08, |
| "loss": 0.0008, |
| "reward": 3.9418994188308716, |
| "reward_std": 0.007436740444973111, |
| "rewards/answer_entity_reward": 0.9861111044883728, |
| "rewards/answer_wer_reward": 0.9557883143424988, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 759 |
| }, |
| { |
| "completion_length": 212.5, |
| "epoch": 2.4288, |
| "grad_norm": 3.427900791168213, |
| "kl": 0.13525390625, |
| "learning_rate": 5.124999999999999e-08, |
| "loss": 0.0014, |
| "reward": 3.9013478755950928, |
| "reward_std": 0.030906156171113253, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9625242948532104, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9388234913349152, |
| "step": 760 |
| }, |
| { |
| "completion_length": 218.90625, |
| "epoch": 2.432, |
| "grad_norm": 1.3307231664657593, |
| "kl": 0.0567626953125, |
| "learning_rate": 5e-08, |
| "loss": 0.0006, |
| "reward": 3.9774084091186523, |
| "reward_std": 0.0034683155827224255, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9774083495140076, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 761 |
| }, |
| { |
| "completion_length": 207.875, |
| "epoch": 2.4352, |
| "grad_norm": 0.7475162148475647, |
| "kl": 0.057373046875, |
| "learning_rate": 4.8749999999999996e-08, |
| "loss": 0.0006, |
| "reward": 3.9419760704040527, |
| "reward_std": 0.004616708727553487, |
| "rewards/answer_entity_reward": 0.9788995385169983, |
| "rewards/answer_wer_reward": 0.9634398818016052, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996366202831268, |
| "step": 762 |
| }, |
| { |
| "completion_length": 195.28125, |
| "epoch": 2.4384, |
| "grad_norm": 2.0728979110717773, |
| "kl": 0.0966796875, |
| "learning_rate": 4.7499999999999995e-08, |
| "loss": 0.001, |
| "reward": 3.944322109222412, |
| "reward_std": 0.017246471252292395, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9587452709674835, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9855769276618958, |
| "step": 763 |
| }, |
| { |
| "completion_length": 224.9375, |
| "epoch": 2.4416, |
| "grad_norm": 1.2122951745986938, |
| "kl": 0.1226806640625, |
| "learning_rate": 4.625e-08, |
| "loss": 0.0012, |
| "reward": 3.9620940685272217, |
| "reward_std": 0.007986569311469793, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9652903079986572, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.996803879737854, |
| "step": 764 |
| }, |
| { |
| "completion_length": 249.03125, |
| "epoch": 2.4448, |
| "grad_norm": 1.21713125705719, |
| "kl": 0.065673828125, |
| "learning_rate": 4.5e-08, |
| "loss": 0.0007, |
| "reward": 3.9346585273742676, |
| "reward_std": 0.006481441203504801, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9346585869789124, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 765 |
| }, |
| { |
| "completion_length": 239.875, |
| "epoch": 2.448, |
| "grad_norm": 5.105895519256592, |
| "kl": 0.0665283203125, |
| "learning_rate": 4.375e-08, |
| "loss": 0.0007, |
| "reward": 3.916127324104309, |
| "reward_std": 0.02047336893156171, |
| "rewards/answer_entity_reward": 0.9910714626312256, |
| "rewards/answer_wer_reward": 0.9319192171096802, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9931367635726929, |
| "step": 766 |
| }, |
| { |
| "completion_length": 216.84375, |
| "epoch": 2.4512, |
| "grad_norm": 3.230001449584961, |
| "kl": 0.0445556640625, |
| "learning_rate": 4.2500000000000003e-08, |
| "loss": 0.0004, |
| "reward": 3.9800050258636475, |
| "reward_std": 0.004955247277393937, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9800049960613251, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 767 |
| }, |
| { |
| "completion_length": 229.8125, |
| "epoch": 2.4544, |
| "grad_norm": 1.2354313135147095, |
| "kl": 0.0478515625, |
| "learning_rate": 4.125e-08, |
| "loss": 0.0005, |
| "reward": 3.9553003311157227, |
| "reward_std": 0.013880819431506097, |
| "rewards/answer_entity_reward": 0.9826388955116272, |
| "rewards/answer_wer_reward": 0.9739912152290344, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.998670220375061, |
| "step": 768 |
| }, |
| { |
| "completion_length": 248.34375, |
| "epoch": 2.4576000000000002, |
| "grad_norm": 0.8089145421981812, |
| "kl": 0.06005859375, |
| "learning_rate": 4e-08, |
| "loss": 0.0006, |
| "reward": 3.9643748998641968, |
| "reward_std": 0.007618119474500418, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9654783606529236, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9988966286182404, |
| "step": 769 |
| }, |
| { |
| "completion_length": 233.53125, |
| "epoch": 2.4608, |
| "grad_norm": 1.2253531217575073, |
| "kl": 0.0540771484375, |
| "learning_rate": 3.875e-08, |
| "loss": 0.0005, |
| "reward": 3.955801010131836, |
| "reward_std": 0.007193901808932424, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9558009505271912, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 770 |
| }, |
| { |
| "completion_length": 246.25, |
| "epoch": 2.464, |
| "grad_norm": 0.8907082080841064, |
| "kl": 0.0740966796875, |
| "learning_rate": 3.75e-08, |
| "loss": 0.0007, |
| "reward": 3.9567649364471436, |
| "reward_std": 0.007558103417977691, |
| "rewards/answer_entity_reward": 0.9926470518112183, |
| "rewards/answer_wer_reward": 0.9644212424755096, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996966123580933, |
| "step": 771 |
| }, |
| { |
| "completion_length": 157.84375, |
| "epoch": 2.4672, |
| "grad_norm": 0.6787045001983643, |
| "kl": 0.080078125, |
| "learning_rate": 3.625e-08, |
| "loss": 0.0008, |
| "reward": 3.989119529724121, |
| "reward_std": 0.0026377947069704533, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9893985092639923, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997209906578064, |
| "step": 772 |
| }, |
| { |
| "completion_length": 228.75, |
| "epoch": 2.4704, |
| "grad_norm": 0.6448482275009155, |
| "kl": 0.0562744140625, |
| "learning_rate": 3.5e-08, |
| "loss": 0.0006, |
| "reward": 3.960241913795471, |
| "reward_std": 0.005235916236415505, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.960241824388504, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 773 |
| }, |
| { |
| "completion_length": 226.5, |
| "epoch": 2.4736000000000002, |
| "grad_norm": 0.9646191596984863, |
| "kl": 0.05224609375, |
| "learning_rate": 3.375e-08, |
| "loss": 0.0005, |
| "reward": 3.9351943731307983, |
| "reward_std": 0.015792422462254763, |
| "rewards/answer_entity_reward": 0.9866071343421936, |
| "rewards/answer_wer_reward": 0.9691915214061737, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9793955981731415, |
| "step": 774 |
| }, |
| { |
| "completion_length": 249.53125, |
| "epoch": 2.4768, |
| "grad_norm": 2.9048826694488525, |
| "kl": 0.0540771484375, |
| "learning_rate": 3.25e-08, |
| "loss": 0.0005, |
| "reward": 3.952099561691284, |
| "reward_std": 0.00629690324421972, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9520994424819946, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 775 |
| }, |
| { |
| "completion_length": 222.21875, |
| "epoch": 2.48, |
| "grad_norm": 1.1555320024490356, |
| "kl": 0.0548095703125, |
| "learning_rate": 3.125e-08, |
| "loss": 0.0005, |
| "reward": 3.9609912633895874, |
| "reward_std": 0.017560790292918682, |
| "rewards/answer_entity_reward": 0.9927884340286255, |
| "rewards/answer_wer_reward": 0.9682029485702515, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 776 |
| }, |
| { |
| "completion_length": 245.46875, |
| "epoch": 2.4832, |
| "grad_norm": 2.5107345581054688, |
| "kl": 0.098388671875, |
| "learning_rate": 3e-08, |
| "loss": 0.001, |
| "reward": 3.9294867515563965, |
| "reward_std": 0.009384696371853352, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.932422935962677, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9970638751983643, |
| "step": 777 |
| }, |
| { |
| "completion_length": 175.1875, |
| "epoch": 2.4864, |
| "grad_norm": 3.319678783416748, |
| "kl": 0.06640625, |
| "learning_rate": 2.875e-08, |
| "loss": 0.0007, |
| "reward": 3.9766006469726562, |
| "reward_std": 0.005283091915771365, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9770888686180115, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.99951171875, |
| "step": 778 |
| }, |
| { |
| "completion_length": 215.8125, |
| "epoch": 2.4896, |
| "grad_norm": 1.7188315391540527, |
| "kl": 0.058837890625, |
| "learning_rate": 2.7499999999999998e-08, |
| "loss": 0.0006, |
| "reward": 3.945501208305359, |
| "reward_std": 0.006351021584123373, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9457343518733978, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9997667968273163, |
| "step": 779 |
| }, |
| { |
| "completion_length": 204.28125, |
| "epoch": 2.4928, |
| "grad_norm": 1.284071683883667, |
| "kl": 0.0640869140625, |
| "learning_rate": 2.6249999999999997e-08, |
| "loss": 0.0006, |
| "reward": 3.9768584966659546, |
| "reward_std": 0.003316762624308467, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9779550433158875, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989035129547119, |
| "step": 780 |
| }, |
| { |
| "completion_length": 216.71875, |
| "epoch": 2.496, |
| "grad_norm": 1.442418098449707, |
| "kl": 0.067138671875, |
| "learning_rate": 2.5e-08, |
| "loss": 0.0007, |
| "reward": 3.945361614227295, |
| "reward_std": 0.03020885493606329, |
| "rewards/answer_entity_reward": 0.9867424070835114, |
| "rewards/answer_wer_reward": 0.9586191177368164, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 781 |
| }, |
| { |
| "completion_length": 199.28125, |
| "epoch": 2.4992, |
| "grad_norm": 2.220127582550049, |
| "kl": 0.071533203125, |
| "learning_rate": 2.3749999999999998e-08, |
| "loss": 0.0007, |
| "reward": 3.945390462875366, |
| "reward_std": 0.012143698055297136, |
| "rewards/answer_entity_reward": 0.9965277910232544, |
| "rewards/answer_wer_reward": 0.9488627314567566, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 782 |
| }, |
| { |
| "completion_length": 220.6875, |
| "epoch": 2.5023999999999997, |
| "grad_norm": 2.2362775802612305, |
| "kl": 0.0634765625, |
| "learning_rate": 2.25e-08, |
| "loss": 0.0006, |
| "reward": 3.960192322731018, |
| "reward_std": 0.006831311853602529, |
| "rewards/answer_entity_reward": 0.9916666746139526, |
| "rewards/answer_wer_reward": 0.9691977500915527, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9993279576301575, |
| "step": 783 |
| }, |
| { |
| "completion_length": 235.5, |
| "epoch": 2.5056000000000003, |
| "grad_norm": 0.9817630052566528, |
| "kl": 0.05224609375, |
| "learning_rate": 2.1250000000000002e-08, |
| "loss": 0.0005, |
| "reward": 3.9702308177948, |
| "reward_std": 0.006825624033808708, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9809376895427704, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9892931282520294, |
| "step": 784 |
| }, |
| { |
| "completion_length": 203.78125, |
| "epoch": 2.5088, |
| "grad_norm": 2.859792947769165, |
| "kl": 0.053955078125, |
| "learning_rate": 2e-08, |
| "loss": 0.0005, |
| "reward": 3.9142426252365112, |
| "reward_std": 0.01792304962873459, |
| "rewards/answer_entity_reward": 0.9944852888584137, |
| "rewards/answer_wer_reward": 0.9792338609695435, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9405233263969421, |
| "step": 785 |
| }, |
| { |
| "completion_length": 224.28125, |
| "epoch": 2.512, |
| "grad_norm": 3.7338051795959473, |
| "kl": 0.060791015625, |
| "learning_rate": 1.875e-08, |
| "loss": 0.0006, |
| "reward": 3.948864221572876, |
| "reward_std": 0.01559874601662159, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9604960083961487, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9883682429790497, |
| "step": 786 |
| }, |
| { |
| "completion_length": 175.1875, |
| "epoch": 2.5152, |
| "grad_norm": 4.41845703125, |
| "kl": 0.083740234375, |
| "learning_rate": 1.75e-08, |
| "loss": 0.0008, |
| "reward": 3.949966311454773, |
| "reward_std": 0.01157908933237195, |
| "rewards/answer_entity_reward": 0.9914772808551788, |
| "rewards/answer_wer_reward": 0.9717868566513062, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9867021441459656, |
| "step": 787 |
| }, |
| { |
| "completion_length": 259.25, |
| "epoch": 2.5183999999999997, |
| "grad_norm": 0.9571487903594971, |
| "kl": 0.0584716796875, |
| "learning_rate": 1.625e-08, |
| "loss": 0.0006, |
| "reward": 3.853899836540222, |
| "reward_std": 0.1917457883246243, |
| "rewards/answer_entity_reward": 0.9654605388641357, |
| "rewards/answer_wer_reward": 0.9225141406059265, |
| "rewards/format_reward": 0.96875, |
| "rewards/think_ocr_reward": 0.9971751868724823, |
| "step": 788 |
| }, |
| { |
| "completion_length": 249.375, |
| "epoch": 2.5216, |
| "grad_norm": 2.86120867729187, |
| "kl": 0.1368408203125, |
| "learning_rate": 1.5e-08, |
| "loss": 0.0014, |
| "reward": 3.9423060417175293, |
| "reward_std": 0.01874951831996441, |
| "rewards/answer_entity_reward": 0.9955357313156128, |
| "rewards/answer_wer_reward": 0.9478386044502258, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9989316165447235, |
| "step": 789 |
| }, |
| { |
| "completion_length": 198.78125, |
| "epoch": 2.5248, |
| "grad_norm": 4.95521879196167, |
| "kl": 0.0611572265625, |
| "learning_rate": 1.3749999999999999e-08, |
| "loss": 0.0006, |
| "reward": 3.915849447250366, |
| "reward_std": 0.016107629984617233, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9824348092079163, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9334145784378052, |
| "step": 790 |
| }, |
| { |
| "completion_length": 184.1875, |
| "epoch": 2.528, |
| "grad_norm": 0.8447386622428894, |
| "kl": 0.0634765625, |
| "learning_rate": 1.25e-08, |
| "loss": 0.0006, |
| "reward": 3.929018259048462, |
| "reward_std": 0.009709671430755407, |
| "rewards/answer_entity_reward": 0.9886363744735718, |
| "rewards/answer_wer_reward": 0.940733015537262, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9996488690376282, |
| "step": 791 |
| }, |
| { |
| "completion_length": 185.59375, |
| "epoch": 2.5312, |
| "grad_norm": 2.6198718547821045, |
| "kl": 0.0439453125, |
| "learning_rate": 1.125e-08, |
| "loss": 0.0004, |
| "reward": 3.9582111835479736, |
| "reward_std": 0.007002702914178371, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9582110941410065, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 792 |
| }, |
| { |
| "completion_length": 197.8125, |
| "epoch": 2.5343999999999998, |
| "grad_norm": 1.3550831079483032, |
| "kl": 0.065185546875, |
| "learning_rate": 1e-08, |
| "loss": 0.0007, |
| "reward": 3.8907723426818848, |
| "reward_std": 0.005525397136807442, |
| "rewards/answer_entity_reward": 0.9903846085071564, |
| "rewards/answer_wer_reward": 0.9687470197677612, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.931640625, |
| "step": 793 |
| }, |
| { |
| "completion_length": 187.375, |
| "epoch": 2.5376, |
| "grad_norm": 1.0252914428710938, |
| "kl": 0.086181640625, |
| "learning_rate": 8.75e-09, |
| "loss": 0.0009, |
| "reward": 3.86617374420166, |
| "reward_std": 0.011230799835175276, |
| "rewards/answer_entity_reward": 0.9981617629528046, |
| "rewards/answer_wer_reward": 0.9432033002376556, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9248086512088776, |
| "step": 794 |
| }, |
| { |
| "completion_length": 220.84375, |
| "epoch": 2.5408, |
| "grad_norm": 3.189028739929199, |
| "kl": 0.05078125, |
| "learning_rate": 7.5e-09, |
| "loss": 0.0005, |
| "reward": 3.9672648906707764, |
| "reward_std": 0.006707400782033801, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.968046098947525, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.999218761920929, |
| "step": 795 |
| }, |
| { |
| "completion_length": 148.875, |
| "epoch": 2.544, |
| "grad_norm": 0.518578052520752, |
| "kl": 0.085693359375, |
| "learning_rate": 6.25e-09, |
| "loss": 0.0009, |
| "reward": 3.8482353687286377, |
| "reward_std": 0.0038536423817276955, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.8497678339481354, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9984675645828247, |
| "step": 796 |
| }, |
| { |
| "completion_length": 193.40625, |
| "epoch": 2.5472, |
| "grad_norm": 0.928065299987793, |
| "kl": 0.081298828125, |
| "learning_rate": 5e-09, |
| "loss": 0.0008, |
| "reward": 3.9669394493103027, |
| "reward_std": 0.013519858941435814, |
| "rewards/answer_entity_reward": 0.9937500059604645, |
| "rewards/answer_wer_reward": 0.9760889112949371, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9971005320549011, |
| "step": 797 |
| }, |
| { |
| "completion_length": 220.0625, |
| "epoch": 2.5504, |
| "grad_norm": 2.7394306659698486, |
| "kl": 0.050537109375, |
| "learning_rate": 3.75e-09, |
| "loss": 0.0005, |
| "reward": 3.972287654876709, |
| "reward_std": 0.0053059973288327456, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9722877740859985, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 798 |
| }, |
| { |
| "completion_length": 221.4375, |
| "epoch": 2.5536, |
| "grad_norm": 3.9942383766174316, |
| "kl": 0.0673828125, |
| "learning_rate": 2.5e-09, |
| "loss": 0.0007, |
| "reward": 3.9344537258148193, |
| "reward_std": 0.01906409254297614, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9436750113964081, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 0.9907786846160889, |
| "step": 799 |
| }, |
| { |
| "completion_length": 231.1875, |
| "epoch": 2.5568, |
| "grad_norm": 2.3216702938079834, |
| "kl": 0.0462646484375, |
| "learning_rate": 1.25e-09, |
| "loss": 0.0005, |
| "reward": 3.959131956100464, |
| "reward_std": 0.005453485995531082, |
| "rewards/answer_entity_reward": 1.0, |
| "rewards/answer_wer_reward": 0.9591320157051086, |
| "rewards/format_reward": 1.0, |
| "rewards/think_ocr_reward": 1.0, |
| "step": 800 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 800, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|