VAPO-7B / trainer_state.json
RUIH's picture
Upload folder using huggingface_hub
b3bcdde verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.5568,
"eval_steps": 500,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 175.78125,
"epoch": 0.0032,
"grad_norm": 5.3713698387146,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"reward": 2.691648483276367,
"reward_std": 0.9842272102832794,
"rewards/answer_entity_reward": 0.8998827934265137,
"rewards/answer_wer_reward": 0.6144023239612579,
"rewards/format_reward": 0.65625,
"rewards/think_ocr_reward": 0.5211134254932404,
"step": 1
},
{
"completion_length": 205.1875,
"epoch": 0.0064,
"grad_norm": 12.984394073486328,
"kl": 0.000339508056640625,
"learning_rate": 9.9875e-07,
"loss": 0.0,
"reward": 2.8287014961242676,
"reward_std": 1.0050830841064453,
"rewards/answer_entity_reward": 0.7303222715854645,
"rewards/answer_wer_reward": 0.47497838735580444,
"rewards/format_reward": 0.875,
"rewards/think_ocr_reward": 0.7484009563922882,
"step": 2
},
{
"completion_length": 203.09375,
"epoch": 0.0096,
"grad_norm": 5.166553497314453,
"kl": 0.00044536590576171875,
"learning_rate": 9.975e-07,
"loss": 0.0,
"reward": 3.498788595199585,
"reward_std": 0.2545953020453453,
"rewards/answer_entity_reward": 0.9527146220207214,
"rewards/answer_wer_reward": 0.7393675744533539,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8067062795162201,
"step": 3
},
{
"completion_length": 206.1875,
"epoch": 0.0128,
"grad_norm": 2.356685161590576,
"kl": 0.0009002685546875,
"learning_rate": 9.9625e-07,
"loss": 0.0,
"reward": 3.299022078514099,
"reward_std": 0.5456227362155914,
"rewards/answer_entity_reward": 0.8519714176654816,
"rewards/answer_wer_reward": 0.6592651903629303,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8190353512763977,
"step": 4
},
{
"completion_length": 223.28125,
"epoch": 0.016,
"grad_norm": 3.5642409324645996,
"kl": 0.001827239990234375,
"learning_rate": 9.95e-07,
"loss": 0.0,
"reward": 2.8498330116271973,
"reward_std": 0.6001743674278259,
"rewards/answer_entity_reward": 0.8803278803825378,
"rewards/answer_wer_reward": 0.45287495851516724,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.5478802025318146,
"step": 5
},
{
"completion_length": 210.28125,
"epoch": 0.0192,
"grad_norm": 2.062991142272949,
"kl": 0.004608154296875,
"learning_rate": 9.9375e-07,
"loss": 0.0,
"reward": 3.345002055168152,
"reward_std": 0.5891430526971817,
"rewards/answer_entity_reward": 0.8334160447120667,
"rewards/answer_wer_reward": 0.7313504219055176,
"rewards/format_reward": 0.875,
"rewards/think_ocr_reward": 0.9052354693412781,
"step": 6
},
{
"completion_length": 204.9375,
"epoch": 0.0224,
"grad_norm": 2.77138090133667,
"kl": 0.01922607421875,
"learning_rate": 9.925e-07,
"loss": 0.0002,
"reward": 3.3531779050827026,
"reward_std": 0.7286678552627563,
"rewards/answer_entity_reward": 0.8474657833576202,
"rewards/answer_wer_reward": 0.7306987345218658,
"rewards/format_reward": 0.90625,
"rewards/think_ocr_reward": 0.8687634468078613,
"step": 7
},
{
"completion_length": 242.0,
"epoch": 0.0256,
"grad_norm": 1.9377678632736206,
"kl": 0.00897216796875,
"learning_rate": 9.912499999999998e-07,
"loss": 0.0001,
"reward": 3.538244366645813,
"reward_std": 0.26357416808605194,
"rewards/answer_entity_reward": 0.8956374526023865,
"rewards/answer_wer_reward": 0.795194149017334,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8786628246307373,
"step": 8
},
{
"completion_length": 181.28125,
"epoch": 0.0288,
"grad_norm": 2.9018149375915527,
"kl": 0.0250244140625,
"learning_rate": 9.9e-07,
"loss": 0.0002,
"reward": 3.6827263832092285,
"reward_std": 0.21120695769786835,
"rewards/answer_entity_reward": 0.9178647994995117,
"rewards/answer_wer_reward": 0.8329994082450867,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9318622648715973,
"step": 9
},
{
"completion_length": 211.1875,
"epoch": 0.032,
"grad_norm": 3.4354376792907715,
"kl": 0.02166748046875,
"learning_rate": 9.8875e-07,
"loss": 0.0002,
"reward": 3.6928374767303467,
"reward_std": 0.21010804921388626,
"rewards/answer_entity_reward": 0.8995116055011749,
"rewards/answer_wer_reward": 0.8549435138702393,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9383824467658997,
"step": 10
},
{
"completion_length": 165.40625,
"epoch": 0.0352,
"grad_norm": 5.1537652015686035,
"kl": 0.0521240234375,
"learning_rate": 9.875e-07,
"loss": 0.0005,
"reward": 3.500484824180603,
"reward_std": 0.5196337550878525,
"rewards/answer_entity_reward": 0.9380581974983215,
"rewards/answer_wer_reward": 0.7917109727859497,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.833215594291687,
"step": 11
},
{
"completion_length": 223.8125,
"epoch": 0.0384,
"grad_norm": 3.7026002407073975,
"kl": 0.02813720703125,
"learning_rate": 9.862499999999999e-07,
"loss": 0.0003,
"reward": 3.7366983890533447,
"reward_std": 0.19402557611465454,
"rewards/answer_entity_reward": 0.9315968751907349,
"rewards/answer_wer_reward": 0.836162269115448,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9689393639564514,
"step": 12
},
{
"completion_length": 201.34375,
"epoch": 0.0416,
"grad_norm": 4.624758243560791,
"kl": 0.0487060546875,
"learning_rate": 9.849999999999999e-07,
"loss": 0.0005,
"reward": 3.6485583782196045,
"reward_std": 0.19490989297628403,
"rewards/answer_entity_reward": 0.9538419842720032,
"rewards/answer_wer_reward": 0.8439803719520569,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8819859325885773,
"step": 13
},
{
"completion_length": 197.53125,
"epoch": 0.0448,
"grad_norm": 5.349609375,
"kl": 0.03363037109375,
"learning_rate": 9.8375e-07,
"loss": 0.0003,
"reward": 3.579698920249939,
"reward_std": 0.12941206991672516,
"rewards/answer_entity_reward": 0.9086007177829742,
"rewards/answer_wer_reward": 0.8474478721618652,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8236501812934875,
"step": 14
},
{
"completion_length": 180.5625,
"epoch": 0.048,
"grad_norm": 5.51423454284668,
"kl": 0.0633544921875,
"learning_rate": 9.825e-07,
"loss": 0.0006,
"reward": 3.6973917484283447,
"reward_std": 0.15208109095692635,
"rewards/answer_entity_reward": 0.9153402149677277,
"rewards/answer_wer_reward": 0.8323444426059723,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9497069418430328,
"step": 15
},
{
"completion_length": 205.03125,
"epoch": 0.0512,
"grad_norm": 3.2830357551574707,
"kl": 0.059326171875,
"learning_rate": 9.8125e-07,
"loss": 0.0006,
"reward": 3.477460026741028,
"reward_std": 0.43340209126472473,
"rewards/answer_entity_reward": 0.8780590891838074,
"rewards/answer_wer_reward": 0.7556597292423248,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9062411189079285,
"step": 16
},
{
"completion_length": 243.84375,
"epoch": 0.0544,
"grad_norm": 2.257538080215454,
"kl": 0.03240966796875,
"learning_rate": 9.8e-07,
"loss": 0.0003,
"reward": 3.6340386867523193,
"reward_std": 0.15337160229682922,
"rewards/answer_entity_reward": 0.8995862305164337,
"rewards/answer_wer_reward": 0.7731227576732635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9613295793533325,
"step": 17
},
{
"completion_length": 236.125,
"epoch": 0.0576,
"grad_norm": 2.133462429046631,
"kl": 0.0579833984375,
"learning_rate": 9.7875e-07,
"loss": 0.0006,
"reward": 3.730382204055786,
"reward_std": 0.1639438048005104,
"rewards/answer_entity_reward": 0.9158936738967896,
"rewards/answer_wer_reward": 0.8535431623458862,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9609452486038208,
"step": 18
},
{
"completion_length": 253.84375,
"epoch": 0.0608,
"grad_norm": 2.6911232471466064,
"kl": 0.042236328125,
"learning_rate": 9.775e-07,
"loss": 0.0004,
"reward": 3.6918214559555054,
"reward_std": 0.24240515753626823,
"rewards/answer_entity_reward": 0.908495306968689,
"rewards/answer_wer_reward": 0.8162411749362946,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9983349442481995,
"step": 19
},
{
"completion_length": 195.3125,
"epoch": 0.064,
"grad_norm": 2.856860876083374,
"kl": 0.0548095703125,
"learning_rate": 9.7625e-07,
"loss": 0.0005,
"reward": 3.570927858352661,
"reward_std": 0.38515634275972843,
"rewards/answer_entity_reward": 0.885971337556839,
"rewards/answer_wer_reward": 0.7937527894973755,
"rewards/format_reward": 0.9375,
"rewards/think_ocr_reward": 0.9537037014961243,
"step": 20
},
{
"completion_length": 200.21875,
"epoch": 0.0672,
"grad_norm": 2.869398355484009,
"kl": 0.059814453125,
"learning_rate": 9.75e-07,
"loss": 0.0006,
"reward": 3.7599644660949707,
"reward_std": 0.13445724919438362,
"rewards/answer_entity_reward": 0.9744762480258942,
"rewards/answer_wer_reward": 0.8406906425952911,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9447975754737854,
"step": 21
},
{
"completion_length": 228.9375,
"epoch": 0.0704,
"grad_norm": 2.2584221363067627,
"kl": 0.03387451171875,
"learning_rate": 9.7375e-07,
"loss": 0.0003,
"reward": 3.5859320163726807,
"reward_std": 0.14986564964056015,
"rewards/answer_entity_reward": 0.9357894659042358,
"rewards/answer_wer_reward": 0.8099571466445923,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8401854038238525,
"step": 22
},
{
"completion_length": 219.78125,
"epoch": 0.0736,
"grad_norm": 2.140197277069092,
"kl": 0.0499267578125,
"learning_rate": 9.725e-07,
"loss": 0.0005,
"reward": 3.755205750465393,
"reward_std": 0.09474575892090797,
"rewards/answer_entity_reward": 0.9487689137458801,
"rewards/answer_wer_reward": 0.871625155210495,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9348115921020508,
"step": 23
},
{
"completion_length": 206.28125,
"epoch": 0.0768,
"grad_norm": 3.823035478591919,
"kl": 0.13916015625,
"learning_rate": 9.712499999999998e-07,
"loss": 0.0014,
"reward": 3.7580984830856323,
"reward_std": 0.07033384963870049,
"rewards/answer_entity_reward": 0.9635280966758728,
"rewards/answer_wer_reward": 0.8670244812965393,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9275458753108978,
"step": 24
},
{
"completion_length": 141.875,
"epoch": 0.08,
"grad_norm": 3.9088714122772217,
"kl": 0.10791015625,
"learning_rate": 9.7e-07,
"loss": 0.0011,
"reward": 3.7762891054153442,
"reward_std": 0.04259665124118328,
"rewards/answer_entity_reward": 0.9848519563674927,
"rewards/answer_wer_reward": 0.8006402850151062,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9907970130443573,
"step": 25
},
{
"completion_length": 205.21875,
"epoch": 0.0832,
"grad_norm": 2.103792905807495,
"kl": 0.065185546875,
"learning_rate": 9.6875e-07,
"loss": 0.0007,
"reward": 3.811550498008728,
"reward_std": 0.11633584462106228,
"rewards/answer_entity_reward": 0.9553370177745819,
"rewards/answer_wer_reward": 0.9040265679359436,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9834368824958801,
"step": 26
},
{
"completion_length": 228.78125,
"epoch": 0.0864,
"grad_norm": 2.7897403240203857,
"kl": 0.0435791015625,
"learning_rate": 9.675e-07,
"loss": 0.0004,
"reward": 3.788088798522949,
"reward_std": 0.10910476744174957,
"rewards/answer_entity_reward": 0.9546680450439453,
"rewards/answer_wer_reward": 0.872740238904953,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9606806039810181,
"step": 27
},
{
"completion_length": 210.5,
"epoch": 0.0896,
"grad_norm": 1.2101320028305054,
"kl": 0.0552978515625,
"learning_rate": 9.6625e-07,
"loss": 0.0006,
"reward": 3.8938169479370117,
"reward_std": 0.04485907219350338,
"rewards/answer_entity_reward": 0.974581778049469,
"rewards/answer_wer_reward": 0.9207929372787476,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984423518180847,
"step": 28
},
{
"completion_length": 233.78125,
"epoch": 0.0928,
"grad_norm": 2.7460684776306152,
"kl": 0.035400390625,
"learning_rate": 9.649999999999999e-07,
"loss": 0.0004,
"reward": 3.662728428840637,
"reward_std": 0.20339616388082504,
"rewards/answer_entity_reward": 0.8774791359901428,
"rewards/answer_wer_reward": 0.8000176846981049,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9852316677570343,
"step": 29
},
{
"completion_length": 199.59375,
"epoch": 0.096,
"grad_norm": 1.8316643238067627,
"kl": 0.0596923828125,
"learning_rate": 9.637499999999999e-07,
"loss": 0.0006,
"reward": 3.890167713165283,
"reward_std": 0.037449197843670845,
"rewards/answer_entity_reward": 0.96912881731987,
"rewards/answer_wer_reward": 0.9220606982707977,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989781975746155,
"step": 30
},
{
"completion_length": 226.125,
"epoch": 0.0992,
"grad_norm": 2.0417702198028564,
"kl": 0.0440673828125,
"learning_rate": 9.624999999999999e-07,
"loss": 0.0004,
"reward": 3.8260613679885864,
"reward_std": 0.07994803786277771,
"rewards/answer_entity_reward": 0.9577426314353943,
"rewards/answer_wer_reward": 0.902205765247345,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9661130309104919,
"step": 31
},
{
"completion_length": 214.5,
"epoch": 0.1024,
"grad_norm": 4.027645111083984,
"kl": 0.1015625,
"learning_rate": 9.6125e-07,
"loss": 0.001,
"reward": 3.7394936084747314,
"reward_std": 0.10389792174100876,
"rewards/answer_entity_reward": 0.9218434691429138,
"rewards/answer_wer_reward": 0.8621510863304138,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9554989635944366,
"step": 32
},
{
"completion_length": 255.28125,
"epoch": 0.1056,
"grad_norm": 1.527213454246521,
"kl": 0.046875,
"learning_rate": 9.6e-07,
"loss": 0.0005,
"reward": 3.8307132720947266,
"reward_std": 0.0552691500633955,
"rewards/answer_entity_reward": 0.9554121494293213,
"rewards/answer_wer_reward": 0.8765550553798676,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987460970878601,
"step": 33
},
{
"completion_length": 226.0,
"epoch": 0.1088,
"grad_norm": 1.822529673576355,
"kl": 0.0372314453125,
"learning_rate": 9.5875e-07,
"loss": 0.0004,
"reward": 3.8188695907592773,
"reward_std": 0.07392234448343515,
"rewards/answer_entity_reward": 0.9491736888885498,
"rewards/answer_wer_reward": 0.8781739175319672,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.991521954536438,
"step": 34
},
{
"completion_length": 230.71875,
"epoch": 0.112,
"grad_norm": 1.96689772605896,
"kl": 0.05322265625,
"learning_rate": 9.575e-07,
"loss": 0.0005,
"reward": 3.839812397956848,
"reward_std": 0.04108080454170704,
"rewards/answer_entity_reward": 0.9491481184959412,
"rewards/answer_wer_reward": 0.8918017745018005,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988624751567841,
"step": 35
},
{
"completion_length": 181.75,
"epoch": 0.1152,
"grad_norm": 25.535808563232422,
"kl": 0.100830078125,
"learning_rate": 9.5625e-07,
"loss": 0.001,
"reward": 3.8188287019729614,
"reward_std": 0.1601814702153206,
"rewards/answer_entity_reward": 0.9457894563674927,
"rewards/answer_wer_reward": 0.9093815982341766,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9636577069759369,
"step": 36
},
{
"completion_length": 165.375,
"epoch": 0.1184,
"grad_norm": 2.886183738708496,
"kl": 0.0692138671875,
"learning_rate": 9.55e-07,
"loss": 0.0007,
"reward": 3.8752315044403076,
"reward_std": 0.04815678671002388,
"rewards/answer_entity_reward": 0.994689553976059,
"rewards/answer_wer_reward": 0.9401271045207977,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9404149055480957,
"step": 37
},
{
"completion_length": 250.40625,
"epoch": 0.1216,
"grad_norm": 2.9052975177764893,
"kl": 0.0467529296875,
"learning_rate": 9.5375e-07,
"loss": 0.0005,
"reward": 3.8545405864715576,
"reward_std": 0.04892056295648217,
"rewards/answer_entity_reward": 0.9534467458724976,
"rewards/answer_wer_reward": 0.9035276472568512,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975661933422089,
"step": 38
},
{
"completion_length": 234.125,
"epoch": 0.1248,
"grad_norm": 1.5214505195617676,
"kl": 0.04010009765625,
"learning_rate": 9.525e-07,
"loss": 0.0004,
"reward": 3.7642624378204346,
"reward_std": 0.06860890984535217,
"rewards/answer_entity_reward": 0.9330369234085083,
"rewards/answer_wer_reward": 0.8348780572414398,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963473677635193,
"step": 39
},
{
"completion_length": 222.5,
"epoch": 0.128,
"grad_norm": 1.4751359224319458,
"kl": 0.0521240234375,
"learning_rate": 9.5125e-07,
"loss": 0.0005,
"reward": 3.8170441389083862,
"reward_std": 0.06563596054911613,
"rewards/answer_entity_reward": 0.9340721964836121,
"rewards/answer_wer_reward": 0.8999682068824768,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9830037355422974,
"step": 40
},
{
"completion_length": 201.84375,
"epoch": 0.1312,
"grad_norm": 20.2832088470459,
"kl": 0.038818359375,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0004,
"reward": 3.7008172273635864,
"reward_std": 0.039744822308421135,
"rewards/answer_entity_reward": 0.9294143319129944,
"rewards/answer_wer_reward": 0.890234112739563,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8811687231063843,
"step": 41
},
{
"completion_length": 192.09375,
"epoch": 0.1344,
"grad_norm": 3.430189609527588,
"kl": 0.0523681640625,
"learning_rate": 9.487499999999999e-07,
"loss": 0.0005,
"reward": 3.8015908002853394,
"reward_std": 0.057819752022624016,
"rewards/answer_entity_reward": 0.9672390222549438,
"rewards/answer_wer_reward": 0.8474858105182648,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9868659377098083,
"step": 42
},
{
"completion_length": 215.53125,
"epoch": 0.1376,
"grad_norm": 16.041494369506836,
"kl": 0.0418701171875,
"learning_rate": 9.474999999999999e-07,
"loss": 0.0004,
"reward": 3.730579137802124,
"reward_std": 0.11731705069541931,
"rewards/answer_entity_reward": 0.9560448527336121,
"rewards/answer_wer_reward": 0.8699329495429993,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9046012759208679,
"step": 43
},
{
"completion_length": 236.78125,
"epoch": 0.1408,
"grad_norm": 1.6949574947357178,
"kl": 0.0352783203125,
"learning_rate": 9.462499999999999e-07,
"loss": 0.0004,
"reward": 3.899806261062622,
"reward_std": 0.018219145480543375,
"rewards/answer_entity_reward": 0.9738267660140991,
"rewards/answer_wer_reward": 0.9316939115524292,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9942856431007385,
"step": 44
},
{
"completion_length": 246.4375,
"epoch": 0.144,
"grad_norm": 1.3507007360458374,
"kl": 0.0330810546875,
"learning_rate": 9.45e-07,
"loss": 0.0003,
"reward": 3.8328453302383423,
"reward_std": 0.06314087565988302,
"rewards/answer_entity_reward": 0.9711392819881439,
"rewards/answer_wer_reward": 0.8670938909053802,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.994612067937851,
"step": 45
},
{
"completion_length": 170.28125,
"epoch": 0.1472,
"grad_norm": 2.2585864067077637,
"kl": 0.077392578125,
"learning_rate": 9.4375e-07,
"loss": 0.0008,
"reward": 3.902386784553528,
"reward_std": 0.035709235817193985,
"rewards/answer_entity_reward": 0.9873873591423035,
"rewards/answer_wer_reward": 0.9353838264942169,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9796155691146851,
"step": 46
},
{
"completion_length": 149.8125,
"epoch": 0.1504,
"grad_norm": 4.581851005554199,
"kl": 0.0452880859375,
"learning_rate": 9.425e-07,
"loss": 0.0005,
"reward": 3.6548960208892822,
"reward_std": 0.06261088512837887,
"rewards/answer_entity_reward": 0.9477430880069733,
"rewards/answer_wer_reward": 0.8129006922245026,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8942522406578064,
"step": 47
},
{
"completion_length": 216.75,
"epoch": 0.1536,
"grad_norm": 47.897464752197266,
"kl": 0.3621826171875,
"learning_rate": 9.4125e-07,
"loss": 0.0036,
"reward": 3.906231164932251,
"reward_std": 0.034966002218425274,
"rewards/answer_entity_reward": 0.9823353588581085,
"rewards/answer_wer_reward": 0.9293725490570068,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9945231974124908,
"step": 48
},
{
"completion_length": 196.9375,
"epoch": 0.1568,
"grad_norm": 2.257028579711914,
"kl": 0.0465087890625,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0005,
"reward": 3.8652477264404297,
"reward_std": 0.03087481390684843,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9092975854873657,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.958791047334671,
"step": 49
},
{
"completion_length": 196.9375,
"epoch": 0.16,
"grad_norm": 4.950622081756592,
"kl": 0.0345458984375,
"learning_rate": 9.387499999999999e-07,
"loss": 0.0003,
"reward": 3.824746251106262,
"reward_std": 0.058218397200107574,
"rewards/answer_entity_reward": 0.9825757443904877,
"rewards/answer_wer_reward": 0.9601459503173828,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8820245563983917,
"step": 50
},
{
"completion_length": 174.8125,
"epoch": 0.1632,
"grad_norm": 7.211401462554932,
"kl": 0.0582275390625,
"learning_rate": 9.374999999999999e-07,
"loss": 0.0006,
"reward": 3.8160147666931152,
"reward_std": 0.04299969598650932,
"rewards/answer_entity_reward": 0.9790209829807281,
"rewards/answer_wer_reward": 0.9350173771381378,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.901976466178894,
"step": 51
},
{
"completion_length": 248.125,
"epoch": 0.1664,
"grad_norm": 0.9922041893005371,
"kl": 0.0201416015625,
"learning_rate": 9.3625e-07,
"loss": 0.0002,
"reward": 3.8918874263763428,
"reward_std": 0.029974642675369978,
"rewards/answer_entity_reward": 0.9869123697280884,
"rewards/answer_wer_reward": 0.9067046940326691,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982704818248749,
"step": 52
},
{
"completion_length": 251.59375,
"epoch": 0.1696,
"grad_norm": 0.9144994020462036,
"kl": 0.02001953125,
"learning_rate": 9.35e-07,
"loss": 0.0002,
"reward": 3.782878875732422,
"reward_std": 0.04338405467569828,
"rewards/answer_entity_reward": 0.9685876965522766,
"rewards/answer_wer_reward": 0.8232664167881012,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9910247623920441,
"step": 53
},
{
"completion_length": 224.5625,
"epoch": 0.1728,
"grad_norm": 0.8014624118804932,
"kl": 0.01904296875,
"learning_rate": 9.3375e-07,
"loss": 0.0002,
"reward": 3.804163098335266,
"reward_std": 0.02029208466410637,
"rewards/answer_entity_reward": 0.9539299309253693,
"rewards/answer_wer_reward": 0.8539278209209442,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9963052868843079,
"step": 54
},
{
"completion_length": 174.59375,
"epoch": 0.176,
"grad_norm": 2.5315935611724854,
"kl": 0.02862548828125,
"learning_rate": 9.325e-07,
"loss": 0.0003,
"reward": 3.8737215995788574,
"reward_std": 0.06625958904623985,
"rewards/answer_entity_reward": 0.9887503385543823,
"rewards/answer_wer_reward": 0.9215180277824402,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9634531438350677,
"step": 55
},
{
"completion_length": 239.4375,
"epoch": 0.1792,
"grad_norm": 1.3654975891113281,
"kl": 0.0283203125,
"learning_rate": 9.3125e-07,
"loss": 0.0003,
"reward": 3.8753963708877563,
"reward_std": 0.04764867387712002,
"rewards/answer_entity_reward": 0.9810132682323456,
"rewards/answer_wer_reward": 0.8943831324577332,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 56
},
{
"completion_length": 214.75,
"epoch": 0.1824,
"grad_norm": 1.4159584045410156,
"kl": 0.02081298828125,
"learning_rate": 9.3e-07,
"loss": 0.0002,
"reward": 3.8986427783966064,
"reward_std": 0.031265249475836754,
"rewards/answer_entity_reward": 0.9880681931972504,
"rewards/answer_wer_reward": 0.9130412340164185,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99753338098526,
"step": 57
},
{
"completion_length": 240.46875,
"epoch": 0.1856,
"grad_norm": 1.1824144124984741,
"kl": 0.015960693359375,
"learning_rate": 9.287499999999999e-07,
"loss": 0.0002,
"reward": 3.90795361995697,
"reward_std": 0.02096135076135397,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9095984101295471,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 58
},
{
"completion_length": 238.09375,
"epoch": 0.1888,
"grad_norm": 1.165099024772644,
"kl": 0.026123046875,
"learning_rate": 9.274999999999999e-07,
"loss": 0.0003,
"reward": 3.9033310413360596,
"reward_std": 0.03423699922859669,
"rewards/answer_entity_reward": 0.9810605943202972,
"rewards/answer_wer_reward": 0.9234386384487152,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988317787647247,
"step": 59
},
{
"completion_length": 221.84375,
"epoch": 0.192,
"grad_norm": 2.964642286300659,
"kl": 0.02587890625,
"learning_rate": 9.2625e-07,
"loss": 0.0003,
"reward": 3.9065024852752686,
"reward_std": 0.022342820651829243,
"rewards/answer_entity_reward": 0.978426069021225,
"rewards/answer_wer_reward": 0.9289742708206177,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991020858287811,
"step": 60
},
{
"completion_length": 211.1875,
"epoch": 0.1952,
"grad_norm": 2.225137233734131,
"kl": 0.0374755859375,
"learning_rate": 9.25e-07,
"loss": 0.0004,
"reward": 3.6701877117156982,
"reward_std": 0.03641202859580517,
"rewards/answer_entity_reward": 0.9796620309352875,
"rewards/answer_wer_reward": 0.7723922729492188,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9181334376335144,
"step": 61
},
{
"completion_length": 150.0,
"epoch": 0.1984,
"grad_norm": 4.289616584777832,
"kl": 0.062744140625,
"learning_rate": 9.237499999999999e-07,
"loss": 0.0006,
"reward": 3.769058585166931,
"reward_std": 0.060237159952521324,
"rewards/answer_entity_reward": 0.842234879732132,
"rewards/answer_wer_reward": 0.9324747323989868,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9943490326404572,
"step": 62
},
{
"completion_length": 172.59375,
"epoch": 0.2016,
"grad_norm": 0.9226670861244202,
"kl": 0.04541015625,
"learning_rate": 9.225e-07,
"loss": 0.0005,
"reward": 3.9475854635238647,
"reward_std": 0.009972278494387865,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9488748908042908,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987106323242188,
"step": 63
},
{
"completion_length": 186.28125,
"epoch": 0.2048,
"grad_norm": 2.8787524700164795,
"kl": 0.02923583984375,
"learning_rate": 9.2125e-07,
"loss": 0.0003,
"reward": 3.8407578468322754,
"reward_std": 0.04633911233395338,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9414158165454865,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8993421196937561,
"step": 64
},
{
"completion_length": 235.21875,
"epoch": 0.208,
"grad_norm": 3.289802074432373,
"kl": 0.02203369140625,
"learning_rate": 9.2e-07,
"loss": 0.0002,
"reward": 3.8516111373901367,
"reward_std": 0.05013709142804146,
"rewards/answer_entity_reward": 0.9782106876373291,
"rewards/answer_wer_reward": 0.8967941999435425,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9766062498092651,
"step": 65
},
{
"completion_length": 182.9375,
"epoch": 0.2112,
"grad_norm": 15.17410659790039,
"kl": 0.079833984375,
"learning_rate": 9.187499999999999e-07,
"loss": 0.0008,
"reward": 3.7952799797058105,
"reward_std": 0.08191402442753315,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9461319446563721,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8543927371501923,
"step": 66
},
{
"completion_length": 195.3125,
"epoch": 0.2144,
"grad_norm": 1.6663379669189453,
"kl": 0.0638427734375,
"learning_rate": 9.174999999999999e-07,
"loss": 0.0006,
"reward": 3.916337490081787,
"reward_std": 0.018936872947961092,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9398471117019653,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.980977475643158,
"step": 67
},
{
"completion_length": 211.84375,
"epoch": 0.2176,
"grad_norm": 2.6255111694335938,
"kl": 0.05126953125,
"learning_rate": 9.1625e-07,
"loss": 0.0005,
"reward": 3.9224915504455566,
"reward_std": 0.01644316827878356,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9280897378921509,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997242659330368,
"step": 68
},
{
"completion_length": 170.65625,
"epoch": 0.2208,
"grad_norm": 3.3114447593688965,
"kl": 0.0849609375,
"learning_rate": 9.15e-07,
"loss": 0.0009,
"reward": 3.801788806915283,
"reward_std": 0.07587217539548874,
"rewards/answer_entity_reward": 0.9663097262382507,
"rewards/answer_wer_reward": 0.9007239937782288,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9347550868988037,
"step": 69
},
{
"completion_length": 194.0,
"epoch": 0.224,
"grad_norm": 0.908227264881134,
"kl": 0.0428466796875,
"learning_rate": 9.137499999999999e-07,
"loss": 0.0004,
"reward": 3.908014178276062,
"reward_std": 0.015611772891134024,
"rewards/answer_entity_reward": 0.9866071343421936,
"rewards/answer_wer_reward": 0.9214071035385132,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 70
},
{
"completion_length": 235.15625,
"epoch": 0.2272,
"grad_norm": 6.288023471832275,
"kl": 0.0377197265625,
"learning_rate": 9.124999999999999e-07,
"loss": 0.0004,
"reward": 3.8232322931289673,
"reward_std": 0.019494441337883472,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9413564205169678,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8853481709957123,
"step": 71
},
{
"completion_length": 202.84375,
"epoch": 0.2304,
"grad_norm": 3.666252374649048,
"kl": 0.02703857421875,
"learning_rate": 9.1125e-07,
"loss": 0.0003,
"reward": 3.8724911212921143,
"reward_std": 0.036418632604181767,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9379763305187225,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.934514731168747,
"step": 72
},
{
"completion_length": 192.59375,
"epoch": 0.2336,
"grad_norm": 2.5703845024108887,
"kl": 0.04815673828125,
"learning_rate": 9.1e-07,
"loss": 0.0005,
"reward": 3.819400668144226,
"reward_std": 0.09702013805508614,
"rewards/answer_entity_reward": 0.9749708473682404,
"rewards/answer_wer_reward": 0.8958881497383118,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9485417604446411,
"step": 73
},
{
"completion_length": 233.96875,
"epoch": 0.2368,
"grad_norm": 5.079833030700684,
"kl": 0.03594970703125,
"learning_rate": 9.087499999999999e-07,
"loss": 0.0004,
"reward": 3.87298047542572,
"reward_std": 0.04117333237081766,
"rewards/answer_entity_reward": 0.979208379983902,
"rewards/answer_wer_reward": 0.8985798060894012,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.995192289352417,
"step": 74
},
{
"completion_length": 232.09375,
"epoch": 0.24,
"grad_norm": 1.3709529638290405,
"kl": 0.0469970703125,
"learning_rate": 9.074999999999999e-07,
"loss": 0.0005,
"reward": 3.8842471837997437,
"reward_std": 0.02406489010900259,
"rewards/answer_entity_reward": 0.976262629032135,
"rewards/answer_wer_reward": 0.9083134233951569,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996710419654846,
"step": 75
},
{
"completion_length": 134.46875,
"epoch": 0.2432,
"grad_norm": 1.7917073965072632,
"kl": 0.04345703125,
"learning_rate": 9.0625e-07,
"loss": 0.0004,
"reward": 3.9434739351272583,
"reward_std": 0.03165043890476227,
"rewards/answer_entity_reward": 0.9853896200656891,
"rewards/answer_wer_reward": 0.960752934217453,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973313212394714,
"step": 76
},
{
"completion_length": 260.75,
"epoch": 0.2464,
"grad_norm": 2.487206220626831,
"kl": 0.02789306640625,
"learning_rate": 9.05e-07,
"loss": 0.0003,
"reward": 3.8149930238723755,
"reward_std": 0.04638839513063431,
"rewards/answer_entity_reward": 0.9494674503803253,
"rewards/answer_wer_reward": 0.8663396835327148,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991858303546906,
"step": 77
},
{
"completion_length": 221.3125,
"epoch": 0.2496,
"grad_norm": 1.8767852783203125,
"kl": 0.017425537109375,
"learning_rate": 9.0375e-07,
"loss": 0.0002,
"reward": 3.8600170612335205,
"reward_std": 0.04895954905077815,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.8933806419372559,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9805253744125366,
"step": 78
},
{
"completion_length": 230.71875,
"epoch": 0.2528,
"grad_norm": 3.712688684463501,
"kl": 0.054931640625,
"learning_rate": 9.024999999999999e-07,
"loss": 0.0005,
"reward": 3.8847248554229736,
"reward_std": 0.012873267754912376,
"rewards/answer_entity_reward": 0.9855768978595734,
"rewards/answer_wer_reward": 0.9019420742988586,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9972057938575745,
"step": 79
},
{
"completion_length": 199.3125,
"epoch": 0.256,
"grad_norm": 1.9246958494186401,
"kl": 0.054931640625,
"learning_rate": 9.0125e-07,
"loss": 0.0005,
"reward": 3.8006842136383057,
"reward_std": 0.052133604884147644,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9017785787582397,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9033928513526917,
"step": 80
},
{
"completion_length": 250.21875,
"epoch": 0.2592,
"grad_norm": 1.160876989364624,
"kl": 0.0220947265625,
"learning_rate": 9e-07,
"loss": 0.0002,
"reward": 3.8708144426345825,
"reward_std": 0.030466954689472914,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.8790038824081421,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980604648590088,
"step": 81
},
{
"completion_length": 237.125,
"epoch": 0.2624,
"grad_norm": 5.024093151092529,
"kl": 0.0382080078125,
"learning_rate": 8.9875e-07,
"loss": 0.0004,
"reward": 3.9048351049423218,
"reward_std": 0.03107828088104725,
"rewards/answer_entity_reward": 0.9851398468017578,
"rewards/answer_wer_reward": 0.9344828426837921,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9852123558521271,
"step": 82
},
{
"completion_length": 222.5,
"epoch": 0.2656,
"grad_norm": 1.6519030332565308,
"kl": 0.0380859375,
"learning_rate": 8.974999999999999e-07,
"loss": 0.0004,
"reward": 3.863801956176758,
"reward_std": 0.030243747401982546,
"rewards/answer_entity_reward": 0.9727078676223755,
"rewards/answer_wer_reward": 0.9002127051353455,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9908813536167145,
"step": 83
},
{
"completion_length": 225.53125,
"epoch": 0.2688,
"grad_norm": 1.4793689250946045,
"kl": 0.0517578125,
"learning_rate": 8.9625e-07,
"loss": 0.0005,
"reward": 3.8814769983291626,
"reward_std": 0.029270809143781662,
"rewards/answer_entity_reward": 0.9880681931972504,
"rewards/answer_wer_reward": 0.8934087753295898,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 84
},
{
"completion_length": 235.9375,
"epoch": 0.272,
"grad_norm": 1.597517728805542,
"kl": 0.1016845703125,
"learning_rate": 8.95e-07,
"loss": 0.001,
"reward": 3.8768863677978516,
"reward_std": 0.03502520266920328,
"rewards/answer_entity_reward": 0.9798878133296967,
"rewards/answer_wer_reward": 0.8985857367515564,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984128177165985,
"step": 85
},
{
"completion_length": 214.4375,
"epoch": 0.2752,
"grad_norm": 4.483051300048828,
"kl": 0.04150390625,
"learning_rate": 8.9375e-07,
"loss": 0.0004,
"reward": 3.903320074081421,
"reward_std": 0.019831405603326857,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9384645223617554,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9648554623126984,
"step": 86
},
{
"completion_length": 217.3125,
"epoch": 0.2784,
"grad_norm": 2.5979843139648438,
"kl": 0.0279541015625,
"learning_rate": 8.924999999999999e-07,
"loss": 0.0003,
"reward": 3.8643628358840942,
"reward_std": 0.07706086616963148,
"rewards/answer_entity_reward": 0.9751845002174377,
"rewards/answer_wer_reward": 0.9189748764038086,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9702034890651703,
"step": 87
},
{
"completion_length": 209.0625,
"epoch": 0.2816,
"grad_norm": 2.134483575820923,
"kl": 0.0654296875,
"learning_rate": 8.912499999999999e-07,
"loss": 0.0007,
"reward": 3.829586148262024,
"reward_std": 0.11678730137646198,
"rewards/answer_entity_reward": 0.9327990114688873,
"rewards/answer_wer_reward": 0.9185277223587036,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9782594740390778,
"step": 88
},
{
"completion_length": 202.5625,
"epoch": 0.2848,
"grad_norm": 2.750098943710327,
"kl": 0.0386962890625,
"learning_rate": 8.9e-07,
"loss": 0.0004,
"reward": 3.813106060028076,
"reward_std": 0.013170521473512053,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8155100047588348,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975961446762085,
"step": 89
},
{
"completion_length": 208.21875,
"epoch": 0.288,
"grad_norm": 1.0419001579284668,
"kl": 0.02874755859375,
"learning_rate": 8.8875e-07,
"loss": 0.0003,
"reward": 3.7984471321105957,
"reward_std": 0.046625567600131035,
"rewards/answer_entity_reward": 0.9813492298126221,
"rewards/answer_wer_reward": 0.908283531665802,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9088144898414612,
"step": 90
},
{
"completion_length": 240.875,
"epoch": 0.2912,
"grad_norm": 1.406315565109253,
"kl": 0.0322265625,
"learning_rate": 8.874999999999999e-07,
"loss": 0.0003,
"reward": 3.917527914047241,
"reward_std": 0.018682857509702444,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.919611245393753,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 91
},
{
"completion_length": 248.28125,
"epoch": 0.2944,
"grad_norm": 0.9986963868141174,
"kl": 0.034912109375,
"learning_rate": 8.8625e-07,
"loss": 0.0003,
"reward": 3.8824074268341064,
"reward_std": 0.027639332227408886,
"rewards/answer_entity_reward": 0.9829497039318085,
"rewards/answer_wer_reward": 0.8998689651489258,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999588817358017,
"step": 92
},
{
"completion_length": 166.84375,
"epoch": 0.2976,
"grad_norm": 1.9086061716079712,
"kl": 0.03448486328125,
"learning_rate": 8.85e-07,
"loss": 0.0003,
"reward": 3.9501060247421265,
"reward_std": 0.012802016455680132,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9628694355487823,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9872365295886993,
"step": 93
},
{
"completion_length": 256.1875,
"epoch": 0.3008,
"grad_norm": 3.4043421745300293,
"kl": 0.049072265625,
"learning_rate": 8.8375e-07,
"loss": 0.0005,
"reward": 3.814915657043457,
"reward_std": 0.03222915716469288,
"rewards/answer_entity_reward": 0.9890734255313873,
"rewards/answer_wer_reward": 0.8261894881725311,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999652773141861,
"step": 94
},
{
"completion_length": 253.4375,
"epoch": 0.304,
"grad_norm": 0.9184324741363525,
"kl": 0.03564453125,
"learning_rate": 8.824999999999999e-07,
"loss": 0.0004,
"reward": 3.8896020650863647,
"reward_std": 0.02269437536597252,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.8971993029117584,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9952436983585358,
"step": 95
},
{
"completion_length": 202.15625,
"epoch": 0.3072,
"grad_norm": 12.922323226928711,
"kl": 0.05908203125,
"learning_rate": 8.812499999999999e-07,
"loss": 0.0006,
"reward": 3.9009629487991333,
"reward_std": 0.0202713580802083,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9189554452896118,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9820075929164886,
"step": 96
},
{
"completion_length": 224.53125,
"epoch": 0.3104,
"grad_norm": 4.217601299285889,
"kl": 0.0465087890625,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0005,
"reward": 3.8913207054138184,
"reward_std": 0.014381649438291788,
"rewards/answer_entity_reward": 0.9821428656578064,
"rewards/answer_wer_reward": 0.9095685184001923,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996093809604645,
"step": 97
},
{
"completion_length": 206.40625,
"epoch": 0.3136,
"grad_norm": 2.168041706085205,
"kl": 0.0323486328125,
"learning_rate": 8.7875e-07,
"loss": 0.0003,
"reward": 3.8137295246124268,
"reward_std": 0.06389336660504341,
"rewards/answer_entity_reward": 0.9776169061660767,
"rewards/answer_wer_reward": 0.8989610075950623,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9371516108512878,
"step": 98
},
{
"completion_length": 209.0625,
"epoch": 0.3168,
"grad_norm": 1.6052436828613281,
"kl": 0.0345458984375,
"learning_rate": 8.774999999999999e-07,
"loss": 0.0003,
"reward": 3.828700304031372,
"reward_std": 0.019330056384205818,
"rewards/answer_entity_reward": 0.9850388169288635,
"rewards/answer_wer_reward": 0.846589595079422,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970719814300537,
"step": 99
},
{
"completion_length": 210.6875,
"epoch": 0.32,
"grad_norm": 0.9548845887184143,
"kl": 0.0341796875,
"learning_rate": 8.7625e-07,
"loss": 0.0003,
"reward": 3.9469358921051025,
"reward_std": 0.021241382230073214,
"rewards/answer_entity_reward": 0.9851641654968262,
"rewards/answer_wer_reward": 0.961771547794342,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 100
},
{
"completion_length": 214.28125,
"epoch": 0.3232,
"grad_norm": 2.8610620498657227,
"kl": 0.052734375,
"learning_rate": 8.75e-07,
"loss": 0.0005,
"reward": 3.806527853012085,
"reward_std": 0.04471902176737785,
"rewards/answer_entity_reward": 0.9853896200656891,
"rewards/answer_wer_reward": 0.8547504544258118,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.966387927532196,
"step": 101
},
{
"completion_length": 223.4375,
"epoch": 0.3264,
"grad_norm": 0.7780336141586304,
"kl": 0.034912109375,
"learning_rate": 8.7375e-07,
"loss": 0.0003,
"reward": 3.880792260169983,
"reward_std": 0.022754055447876453,
"rewards/answer_entity_reward": 0.989393949508667,
"rewards/answer_wer_reward": 0.8913983702659607,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 102
},
{
"completion_length": 231.0625,
"epoch": 0.3296,
"grad_norm": 1.3763070106506348,
"kl": 0.024444580078125,
"learning_rate": 8.725e-07,
"loss": 0.0003,
"reward": 3.929618239402771,
"reward_std": 0.012849014718085527,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9325020015239716,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987609684467316,
"step": 103
},
{
"completion_length": 268.96875,
"epoch": 0.3328,
"grad_norm": 1.7985624074935913,
"kl": 0.0289306640625,
"learning_rate": 8.712499999999999e-07,
"loss": 0.0003,
"reward": 3.888875961303711,
"reward_std": 0.027541114017367363,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.8925231993198395,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999193549156189,
"step": 104
},
{
"completion_length": 254.1875,
"epoch": 0.336,
"grad_norm": 18.920978546142578,
"kl": 0.027099609375,
"learning_rate": 8.699999999999999e-07,
"loss": 0.0003,
"reward": 3.860435366630554,
"reward_std": 0.030950906220823526,
"rewards/answer_entity_reward": 0.9734883308410645,
"rewards/answer_wer_reward": 0.8872724771499634,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996744692325592,
"step": 105
},
{
"completion_length": 163.53125,
"epoch": 0.3392,
"grad_norm": 2.867141008377075,
"kl": 0.03399658203125,
"learning_rate": 8.687499999999999e-07,
"loss": 0.0003,
"reward": 3.9226391315460205,
"reward_std": 0.023416020907461643,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9473121762275696,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9866906106472015,
"step": 106
},
{
"completion_length": 230.5625,
"epoch": 0.3424,
"grad_norm": 1.7444649934768677,
"kl": 0.03302001953125,
"learning_rate": 8.675000000000001e-07,
"loss": 0.0003,
"reward": 3.9037901163101196,
"reward_std": 0.013123108074069023,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9062368869781494,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996366202831268,
"step": 107
},
{
"completion_length": 196.96875,
"epoch": 0.3456,
"grad_norm": 1.4596710205078125,
"kl": 0.0565185546875,
"learning_rate": 8.6625e-07,
"loss": 0.0006,
"reward": 3.927306890487671,
"reward_std": 0.017726238816976547,
"rewards/answer_entity_reward": 0.9847221970558167,
"rewards/answer_wer_reward": 0.9435714483261108,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990131855010986,
"step": 108
},
{
"completion_length": 204.03125,
"epoch": 0.3488,
"grad_norm": 21.111600875854492,
"kl": 0.259765625,
"learning_rate": 8.65e-07,
"loss": 0.0026,
"reward": 3.878751039505005,
"reward_std": 0.09589649271219969,
"rewards/answer_entity_reward": 0.9957579076290131,
"rewards/answer_wer_reward": 0.9333003461360931,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9496928453445435,
"step": 109
},
{
"completion_length": 213.25,
"epoch": 0.352,
"grad_norm": 5.349282264709473,
"kl": 0.0455322265625,
"learning_rate": 8.6375e-07,
"loss": 0.0005,
"reward": 3.862163782119751,
"reward_std": 0.031207362189888954,
"rewards/answer_entity_reward": 0.9892857372760773,
"rewards/answer_wer_reward": 0.9074709117412567,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9654072225093842,
"step": 110
},
{
"completion_length": 220.3125,
"epoch": 0.3552,
"grad_norm": 3.316596746444702,
"kl": 0.03369140625,
"learning_rate": 8.625e-07,
"loss": 0.0003,
"reward": 3.8875255584716797,
"reward_std": 0.03998068626970053,
"rewards/answer_entity_reward": 0.9902909696102142,
"rewards/answer_wer_reward": 0.9038136303424835,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9934210479259491,
"step": 111
},
{
"completion_length": 250.78125,
"epoch": 0.3584,
"grad_norm": 2.525360107421875,
"kl": 0.03515625,
"learning_rate": 8.612499999999999e-07,
"loss": 0.0003,
"reward": 3.8880720138549805,
"reward_std": 0.025330569595098495,
"rewards/answer_entity_reward": 0.9918486475944519,
"rewards/answer_wer_reward": 0.8981437385082245,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980796277523041,
"step": 112
},
{
"completion_length": 220.09375,
"epoch": 0.3616,
"grad_norm": 5.7261433601379395,
"kl": 0.038330078125,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0004,
"reward": 3.873054027557373,
"reward_std": 0.018459735438227654,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.8848404586315155,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978289306163788,
"step": 113
},
{
"completion_length": 233.625,
"epoch": 0.3648,
"grad_norm": 2.1468665599823,
"kl": 0.0286865234375,
"learning_rate": 8.587499999999999e-07,
"loss": 0.0003,
"reward": 3.9267923831939697,
"reward_std": 0.026638164184987545,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.9341540634632111,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989514350891113,
"step": 114
},
{
"completion_length": 237.375,
"epoch": 0.368,
"grad_norm": 14.322599411010742,
"kl": 0.04052734375,
"learning_rate": 8.575e-07,
"loss": 0.0004,
"reward": 3.9121710062026978,
"reward_std": 0.02902364358305931,
"rewards/answer_entity_reward": 0.9908459782600403,
"rewards/answer_wer_reward": 0.922933429479599,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983916878700256,
"step": 115
},
{
"completion_length": 240.15625,
"epoch": 0.3712,
"grad_norm": 2.0209200382232666,
"kl": 0.06103515625,
"learning_rate": 8.5625e-07,
"loss": 0.0006,
"reward": 3.888006567955017,
"reward_std": 0.023146681487560272,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.895849883556366,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.994560569524765,
"step": 116
},
{
"completion_length": 222.4375,
"epoch": 0.3744,
"grad_norm": 2.933910608291626,
"kl": 0.0419921875,
"learning_rate": 8.55e-07,
"loss": 0.0004,
"reward": 3.8359127044677734,
"reward_std": 0.058022117242217064,
"rewards/answer_entity_reward": 0.9440500438213348,
"rewards/answer_wer_reward": 0.894202709197998,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9976600110530853,
"step": 117
},
{
"completion_length": 214.5625,
"epoch": 0.3776,
"grad_norm": 7.493628025054932,
"kl": 0.064453125,
"learning_rate": 8.5375e-07,
"loss": 0.0006,
"reward": 3.799570918083191,
"reward_std": 0.06657508388161659,
"rewards/answer_entity_reward": 0.9727430641651154,
"rewards/answer_wer_reward": 0.871229887008667,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9555979073047638,
"step": 118
},
{
"completion_length": 212.625,
"epoch": 0.3808,
"grad_norm": 2.1899421215057373,
"kl": 0.0570068359375,
"learning_rate": 8.525e-07,
"loss": 0.0006,
"reward": 3.9054840803146362,
"reward_std": 0.027329989708960056,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9344967901706696,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9795099198818207,
"step": 119
},
{
"completion_length": 249.8125,
"epoch": 0.384,
"grad_norm": 2.4804491996765137,
"kl": 0.035888671875,
"learning_rate": 8.512499999999999e-07,
"loss": 0.0004,
"reward": 3.8948739767074585,
"reward_std": 0.028746116440743208,
"rewards/answer_entity_reward": 0.9953208565711975,
"rewards/answer_wer_reward": 0.9002179205417633,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993351101875305,
"step": 120
},
{
"completion_length": 185.34375,
"epoch": 0.3872,
"grad_norm": 2.305140256881714,
"kl": 0.102783203125,
"learning_rate": 8.499999999999999e-07,
"loss": 0.001,
"reward": 3.9010980129241943,
"reward_std": 0.021339962724596262,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9222235083580017,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9788744449615479,
"step": 121
},
{
"completion_length": 204.65625,
"epoch": 0.3904,
"grad_norm": 1.5420470237731934,
"kl": 0.0313720703125,
"learning_rate": 8.487499999999999e-07,
"loss": 0.0003,
"reward": 3.927214741706848,
"reward_std": 0.019817203283309937,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.92842698097229,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987878203392029,
"step": 122
},
{
"completion_length": 216.90625,
"epoch": 0.3936,
"grad_norm": 8.852909088134766,
"kl": 0.0716552734375,
"learning_rate": 8.475e-07,
"loss": 0.0007,
"reward": 3.811018466949463,
"reward_std": 0.010543343145400286,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.938366711139679,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8726518452167511,
"step": 123
},
{
"completion_length": 257.75,
"epoch": 0.3968,
"grad_norm": 1.4971685409545898,
"kl": 0.0330810546875,
"learning_rate": 8.462499999999999e-07,
"loss": 0.0003,
"reward": 3.9272462129592896,
"reward_std": 0.01983210165053606,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9303403496742249,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989891648292542,
"step": 124
},
{
"completion_length": 207.4375,
"epoch": 0.4,
"grad_norm": 1.9963277578353882,
"kl": 0.056396484375,
"learning_rate": 8.45e-07,
"loss": 0.0006,
"reward": 3.9006247520446777,
"reward_std": 0.030232679098844528,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.9261119067668915,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9803889393806458,
"step": 125
},
{
"completion_length": 246.875,
"epoch": 0.4032,
"grad_norm": 1.1950430870056152,
"kl": 0.03369140625,
"learning_rate": 8.4375e-07,
"loss": 0.0003,
"reward": 3.881152391433716,
"reward_std": 0.03120280895382166,
"rewards/answer_entity_reward": 0.9683753550052643,
"rewards/answer_wer_reward": 0.9131445586681366,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996323585510254,
"step": 126
},
{
"completion_length": 212.15625,
"epoch": 0.4064,
"grad_norm": 4.167364120483398,
"kl": 0.257568359375,
"learning_rate": 8.425e-07,
"loss": 0.0026,
"reward": 3.891525626182556,
"reward_std": 0.03758985735476017,
"rewards/answer_entity_reward": 0.9853896200656891,
"rewards/answer_wer_reward": 0.9100889563560486,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9960470795631409,
"step": 127
},
{
"completion_length": 215.5625,
"epoch": 0.4096,
"grad_norm": 1.2758169174194336,
"kl": 0.059326171875,
"learning_rate": 8.4125e-07,
"loss": 0.0006,
"reward": 3.8984569311141968,
"reward_std": 0.02103353524580598,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.9310561716556549,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9799006283283234,
"step": 128
},
{
"completion_length": 221.0,
"epoch": 0.4128,
"grad_norm": 1.6011369228363037,
"kl": 0.02734375,
"learning_rate": 8.399999999999999e-07,
"loss": 0.0003,
"reward": 3.907585859298706,
"reward_std": 0.024174046237021685,
"rewards/answer_entity_reward": 0.9887152910232544,
"rewards/answer_wer_reward": 0.9191110134124756,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997596144676208,
"step": 129
},
{
"completion_length": 189.375,
"epoch": 0.416,
"grad_norm": 2.7846839427948,
"kl": 0.0413818359375,
"learning_rate": 8.387499999999999e-07,
"loss": 0.0004,
"reward": 3.8641178607940674,
"reward_std": 0.03212345764040947,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9255104064941406,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9438523054122925,
"step": 130
},
{
"completion_length": 209.5,
"epoch": 0.4192,
"grad_norm": 4.144553184509277,
"kl": 0.0548095703125,
"learning_rate": 8.375e-07,
"loss": 0.0006,
"reward": 3.8618308305740356,
"reward_std": 0.07612445950508118,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9306082725524902,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9312225580215454,
"step": 131
},
{
"completion_length": 198.5,
"epoch": 0.4224,
"grad_norm": 2.663985013961792,
"kl": 0.04052734375,
"learning_rate": 8.3625e-07,
"loss": 0.0004,
"reward": 3.897012948989868,
"reward_std": 0.030758653301745653,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9326047897338867,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9782971143722534,
"step": 132
},
{
"completion_length": 180.78125,
"epoch": 0.4256,
"grad_norm": 2.2100954055786133,
"kl": 0.0439453125,
"learning_rate": 8.349999999999999e-07,
"loss": 0.0004,
"reward": 3.923304557800293,
"reward_std": 0.025213422253727913,
"rewards/answer_entity_reward": 0.9882478713989258,
"rewards/answer_wer_reward": 0.9360361397266388,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999020516872406,
"step": 133
},
{
"completion_length": 219.59375,
"epoch": 0.4288,
"grad_norm": 15.98015022277832,
"kl": 0.0645751953125,
"learning_rate": 8.3375e-07,
"loss": 0.0006,
"reward": 3.8721258640289307,
"reward_std": 0.02985560242086649,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9070867002010345,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9650392830371857,
"step": 134
},
{
"completion_length": 239.90625,
"epoch": 0.432,
"grad_norm": 3.754002332687378,
"kl": 0.0419921875,
"learning_rate": 8.325e-07,
"loss": 0.0004,
"reward": 3.8614091873168945,
"reward_std": 0.0724228248000145,
"rewards/answer_entity_reward": 0.9794008135795593,
"rewards/answer_wer_reward": 0.9043296277523041,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9776785373687744,
"step": 135
},
{
"completion_length": 228.09375,
"epoch": 0.4352,
"grad_norm": 2.609844207763672,
"kl": 0.037841796875,
"learning_rate": 8.3125e-07,
"loss": 0.0004,
"reward": 3.8617947101593018,
"reward_std": 0.021692313253879547,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.8795575797557831,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996126115322113,
"step": 136
},
{
"completion_length": 158.8125,
"epoch": 0.4384,
"grad_norm": 1.6180543899536133,
"kl": 0.055419921875,
"learning_rate": 8.299999999999999e-07,
"loss": 0.0005,
"reward": 3.9137951135635376,
"reward_std": 0.020158007740974426,
"rewards/answer_entity_reward": 0.970695972442627,
"rewards/answer_wer_reward": 0.9480262100696564,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9950730204582214,
"step": 137
},
{
"completion_length": 231.25,
"epoch": 0.4416,
"grad_norm": 0.9336134195327759,
"kl": 0.03399658203125,
"learning_rate": 8.287499999999999e-07,
"loss": 0.0003,
"reward": 3.9351539611816406,
"reward_std": 0.014509289292618632,
"rewards/answer_entity_reward": 0.9934294819831848,
"rewards/answer_wer_reward": 0.9442258775234222,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974986016750336,
"step": 138
},
{
"completion_length": 220.34375,
"epoch": 0.4448,
"grad_norm": 21.355905532836914,
"kl": 0.059814453125,
"learning_rate": 8.275e-07,
"loss": 0.0006,
"reward": 3.863122820854187,
"reward_std": 0.060401469469070435,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9233364760875702,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9397862255573273,
"step": 139
},
{
"completion_length": 214.90625,
"epoch": 0.448,
"grad_norm": 1.280321478843689,
"kl": 0.052490234375,
"learning_rate": 8.2625e-07,
"loss": 0.0005,
"reward": 3.9231661558151245,
"reward_std": 0.009715312160551548,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9245247840881348,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998641312122345,
"step": 140
},
{
"completion_length": 211.375,
"epoch": 0.4512,
"grad_norm": 1.7492412328720093,
"kl": 0.062744140625,
"learning_rate": 8.249999999999999e-07,
"loss": 0.0006,
"reward": 3.88791024684906,
"reward_std": 0.011862037936225533,
"rewards/answer_entity_reward": 0.9832702279090881,
"rewards/answer_wer_reward": 0.957579493522644,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9470604658126831,
"step": 141
},
{
"completion_length": 246.3125,
"epoch": 0.4544,
"grad_norm": 2.37640118598938,
"kl": 0.0369873046875,
"learning_rate": 8.2375e-07,
"loss": 0.0004,
"reward": 3.944279909133911,
"reward_std": 0.011443465016782284,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9472803771495819,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969995319843292,
"step": 142
},
{
"completion_length": 199.90625,
"epoch": 0.4576,
"grad_norm": 2.8359158039093018,
"kl": 0.0540771484375,
"learning_rate": 8.225e-07,
"loss": 0.0005,
"reward": 3.93644380569458,
"reward_std": 0.023367811925709248,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9554752707481384,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9893018305301666,
"step": 143
},
{
"completion_length": 195.28125,
"epoch": 0.4608,
"grad_norm": 1.723976731300354,
"kl": 0.031982421875,
"learning_rate": 8.2125e-07,
"loss": 0.0003,
"reward": 3.9411680698394775,
"reward_std": 0.007689078338444233,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.941936582326889,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992315769195557,
"step": 144
},
{
"completion_length": 223.375,
"epoch": 0.464,
"grad_norm": 1.08156418800354,
"kl": 0.02874755859375,
"learning_rate": 8.199999999999999e-07,
"loss": 0.0003,
"reward": 3.9059054851531982,
"reward_std": 0.007867377484217286,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9531411230564117,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9623797535896301,
"step": 145
},
{
"completion_length": 184.75,
"epoch": 0.4672,
"grad_norm": 1.7059741020202637,
"kl": 0.0400390625,
"learning_rate": 8.187499999999999e-07,
"loss": 0.0004,
"reward": 3.939697027206421,
"reward_std": 0.0070332614704966545,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9535529613494873,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9861441254615784,
"step": 146
},
{
"completion_length": 222.84375,
"epoch": 0.4704,
"grad_norm": 1.5283204317092896,
"kl": 0.072998046875,
"learning_rate": 8.175e-07,
"loss": 0.0007,
"reward": 3.843386173248291,
"reward_std": 0.02895416272804141,
"rewards/answer_entity_reward": 0.9304008483886719,
"rewards/answer_wer_reward": 0.9129853844642639,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 147
},
{
"completion_length": 165.25,
"epoch": 0.4736,
"grad_norm": 2.885890245437622,
"kl": 0.04193115234375,
"learning_rate": 8.1625e-07,
"loss": 0.0004,
"reward": 3.8639066219329834,
"reward_std": 0.01842296402901411,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9352113604545593,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9339398741722107,
"step": 148
},
{
"completion_length": 225.8125,
"epoch": 0.4768,
"grad_norm": 1.5893429517745972,
"kl": 0.0615234375,
"learning_rate": 8.149999999999999e-07,
"loss": 0.0006,
"reward": 3.9009220600128174,
"reward_std": 0.022383708506822586,
"rewards/answer_entity_reward": 0.9967105388641357,
"rewards/answer_wer_reward": 0.9052460193634033,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989655911922455,
"step": 149
},
{
"completion_length": 236.21875,
"epoch": 0.48,
"grad_norm": 2.1324307918548584,
"kl": 0.0377197265625,
"learning_rate": 8.137499999999999e-07,
"loss": 0.0004,
"reward": 3.8904128074645996,
"reward_std": 0.02841739635914564,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.8976494371891022,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997079372406006,
"step": 150
},
{
"completion_length": 213.15625,
"epoch": 0.4832,
"grad_norm": 0.9698525667190552,
"kl": 0.034423828125,
"learning_rate": 8.125e-07,
"loss": 0.0003,
"reward": 3.890373468399048,
"reward_std": 0.009532647207379341,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9459290206432343,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9444444477558136,
"step": 151
},
{
"completion_length": 250.21875,
"epoch": 0.4864,
"grad_norm": 4.16625452041626,
"kl": 0.198486328125,
"learning_rate": 8.1125e-07,
"loss": 0.002,
"reward": 3.8978230953216553,
"reward_std": 0.024048997554928064,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.9117782711982727,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985446929931641,
"step": 152
},
{
"completion_length": 174.15625,
"epoch": 0.4896,
"grad_norm": 2.9183833599090576,
"kl": 0.0716552734375,
"learning_rate": 8.1e-07,
"loss": 0.0007,
"reward": 3.908216118812561,
"reward_std": 0.032137976959347725,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.9441157281398773,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9745170772075653,
"step": 153
},
{
"completion_length": 187.03125,
"epoch": 0.4928,
"grad_norm": 1.039563536643982,
"kl": 0.0535888671875,
"learning_rate": 8.087499999999999e-07,
"loss": 0.0005,
"reward": 3.940076231956482,
"reward_std": 0.014994107652455568,
"rewards/answer_entity_reward": 0.9910714626312256,
"rewards/answer_wer_reward": 0.9499542117118835,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990506172180176,
"step": 154
},
{
"completion_length": 214.125,
"epoch": 0.496,
"grad_norm": 2.49003267288208,
"kl": 0.0635986328125,
"learning_rate": 8.075e-07,
"loss": 0.0006,
"reward": 3.850375175476074,
"reward_std": 0.026249381713569164,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8511867821216583,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991883039474487,
"step": 155
},
{
"completion_length": 214.8125,
"epoch": 0.4992,
"grad_norm": 2.7330820560455322,
"kl": 0.03717041015625,
"learning_rate": 8.0625e-07,
"loss": 0.0004,
"reward": 3.9070980548858643,
"reward_std": 0.04327901639044285,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9249836802482605,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9934781193733215,
"step": 156
},
{
"completion_length": 195.0625,
"epoch": 0.5024,
"grad_norm": 2.878744602203369,
"kl": 0.0828857421875,
"learning_rate": 8.05e-07,
"loss": 0.0008,
"reward": 3.9139277935028076,
"reward_std": 0.022999857552349567,
"rewards/answer_entity_reward": 0.9947916567325592,
"rewards/answer_wer_reward": 0.9313595592975616,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9877763986587524,
"step": 157
},
{
"completion_length": 216.625,
"epoch": 0.5056,
"grad_norm": 1.1287983655929565,
"kl": 0.049072265625,
"learning_rate": 8.037499999999999e-07,
"loss": 0.0005,
"reward": 3.9037948846817017,
"reward_std": 0.011531218886375427,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9081907570362091,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9956042170524597,
"step": 158
},
{
"completion_length": 200.21875,
"epoch": 0.5088,
"grad_norm": 1.5555959939956665,
"kl": 0.0369873046875,
"learning_rate": 8.024999999999999e-07,
"loss": 0.0004,
"reward": 3.9110556840896606,
"reward_std": 0.019422957440838218,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.9354503750801086,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9814814925193787,
"step": 159
},
{
"completion_length": 202.875,
"epoch": 0.512,
"grad_norm": 13.22675895690918,
"kl": 0.084228515625,
"learning_rate": 8.0125e-07,
"loss": 0.0008,
"reward": 3.8508609533309937,
"reward_std": 0.037849435582756996,
"rewards/answer_entity_reward": 0.9867424070835114,
"rewards/answer_wer_reward": 0.9194300472736359,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9446885287761688,
"step": 160
},
{
"completion_length": 187.5625,
"epoch": 0.5152,
"grad_norm": 1.9724727869033813,
"kl": 0.05126953125,
"learning_rate": 8e-07,
"loss": 0.0005,
"reward": 3.9261248111724854,
"reward_std": 0.02531399577856064,
"rewards/answer_entity_reward": 0.9882478713989258,
"rewards/answer_wer_reward": 0.9410728812217712,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9968039691448212,
"step": 161
},
{
"completion_length": 254.84375,
"epoch": 0.5184,
"grad_norm": 2.3500356674194336,
"kl": 0.05340576171875,
"learning_rate": 7.9875e-07,
"loss": 0.0005,
"reward": 3.910772919654846,
"reward_std": 0.04009111411869526,
"rewards/answer_entity_reward": 0.9747862815856934,
"rewards/answer_wer_reward": 0.9362366199493408,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999750018119812,
"step": 162
},
{
"completion_length": 206.625,
"epoch": 0.5216,
"grad_norm": 6.3654890060424805,
"kl": 0.069580078125,
"learning_rate": 7.975e-07,
"loss": 0.0007,
"reward": 3.805917978286743,
"reward_std": 0.052407728508114815,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9451808631420135,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8642092943191528,
"step": 163
},
{
"completion_length": 212.71875,
"epoch": 0.5248,
"grad_norm": 1.921622633934021,
"kl": 0.09283447265625,
"learning_rate": 7.9625e-07,
"loss": 0.0009,
"reward": 3.9235308170318604,
"reward_std": 0.022881922777742147,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.9401760995388031,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9896678924560547,
"step": 164
},
{
"completion_length": 234.5625,
"epoch": 0.528,
"grad_norm": 1.4160696268081665,
"kl": 0.061767578125,
"learning_rate": 7.95e-07,
"loss": 0.0006,
"reward": 3.890324354171753,
"reward_std": 0.014382836874574423,
"rewards/answer_entity_reward": 0.9653846025466919,
"rewards/answer_wer_reward": 0.9249398708343506,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 165
},
{
"completion_length": 223.0,
"epoch": 0.5312,
"grad_norm": 1.2775448560714722,
"kl": 0.0582275390625,
"learning_rate": 7.937499999999999e-07,
"loss": 0.0006,
"reward": 3.9478421211242676,
"reward_std": 0.011931413784623146,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9481260776519775,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997159242630005,
"step": 166
},
{
"completion_length": 214.65625,
"epoch": 0.5344,
"grad_norm": 1.287255883216858,
"kl": 0.052734375,
"learning_rate": 7.924999999999999e-07,
"loss": 0.0005,
"reward": 3.9042768478393555,
"reward_std": 0.02827941346913576,
"rewards/answer_entity_reward": 0.9787962138652802,
"rewards/answer_wer_reward": 0.925747811794281,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997329115867615,
"step": 167
},
{
"completion_length": 224.59375,
"epoch": 0.5376,
"grad_norm": 1.7952959537506104,
"kl": 0.0364990234375,
"learning_rate": 7.912499999999999e-07,
"loss": 0.0004,
"reward": 3.935611605644226,
"reward_std": 0.027386673726141453,
"rewards/answer_entity_reward": 0.9919143319129944,
"rewards/answer_wer_reward": 0.9439473152160645,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999750018119812,
"step": 168
},
{
"completion_length": 183.28125,
"epoch": 0.5408,
"grad_norm": 8.36503791809082,
"kl": 0.0848388671875,
"learning_rate": 7.9e-07,
"loss": 0.0008,
"reward": 3.8025405406951904,
"reward_std": 0.04630524106323719,
"rewards/answer_entity_reward": 0.9862637221813202,
"rewards/answer_wer_reward": 0.8270655274391174,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9892113208770752,
"step": 169
},
{
"completion_length": 235.15625,
"epoch": 0.544,
"grad_norm": 2.2816457748413086,
"kl": 0.0296630859375,
"learning_rate": 7.8875e-07,
"loss": 0.0003,
"reward": 3.934034824371338,
"reward_std": 0.009957955218851566,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9344717264175415,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995629191398621,
"step": 170
},
{
"completion_length": 247.53125,
"epoch": 0.5472,
"grad_norm": 1.6856052875518799,
"kl": 0.13134765625,
"learning_rate": 7.875e-07,
"loss": 0.0013,
"reward": 3.896223545074463,
"reward_std": 0.015339810401201248,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9109295010566711,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991828203201294,
"step": 171
},
{
"completion_length": 245.03125,
"epoch": 0.5504,
"grad_norm": 4.956347465515137,
"kl": 0.044921875,
"learning_rate": 7.8625e-07,
"loss": 0.0005,
"reward": 3.7271645069122314,
"reward_std": 0.21888091787695885,
"rewards/answer_entity_reward": 0.9630681872367859,
"rewards/answer_wer_reward": 0.8937070369720459,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9016393423080444,
"step": 172
},
{
"completion_length": 211.3125,
"epoch": 0.5536,
"grad_norm": 1.1714370250701904,
"kl": 0.0323486328125,
"learning_rate": 7.85e-07,
"loss": 0.0003,
"reward": 3.913045883178711,
"reward_std": 0.04143238253891468,
"rewards/answer_entity_reward": 0.9870130121707916,
"rewards/answer_wer_reward": 0.9331351518630981,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9928977191448212,
"step": 173
},
{
"completion_length": 272.1875,
"epoch": 0.5568,
"grad_norm": 1.2012341022491455,
"kl": 0.0413818359375,
"learning_rate": 7.837499999999999e-07,
"loss": 0.0004,
"reward": 3.876948356628418,
"reward_std": 0.03149130195379257,
"rewards/answer_entity_reward": 0.9889954328536987,
"rewards/answer_wer_reward": 0.9271560311317444,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9607969224452972,
"step": 174
},
{
"completion_length": 200.3125,
"epoch": 0.56,
"grad_norm": 2.998842477798462,
"kl": 0.067138671875,
"learning_rate": 7.824999999999999e-07,
"loss": 0.0007,
"reward": 3.8472641706466675,
"reward_std": 0.04471721313893795,
"rewards/answer_entity_reward": 0.9902146458625793,
"rewards/answer_wer_reward": 0.9358225166797638,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.921226978302002,
"step": 175
},
{
"completion_length": 207.03125,
"epoch": 0.5632,
"grad_norm": 10.961363792419434,
"kl": 0.0789794921875,
"learning_rate": 7.812499999999999e-07,
"loss": 0.0008,
"reward": 3.9478721618652344,
"reward_std": 0.027662259992212057,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9600406885147095,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9902353584766388,
"step": 176
},
{
"completion_length": 221.59375,
"epoch": 0.5664,
"grad_norm": 1.341109275817871,
"kl": 0.065185546875,
"learning_rate": 7.799999999999999e-07,
"loss": 0.0006,
"reward": 3.8582847118377686,
"reward_std": 0.041704089380800724,
"rewards/answer_entity_reward": 0.9775640964508057,
"rewards/answer_wer_reward": 0.9368657767772675,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9438548386096954,
"step": 177
},
{
"completion_length": 239.90625,
"epoch": 0.5696,
"grad_norm": 1.4057974815368652,
"kl": 0.045166015625,
"learning_rate": 7.787500000000001e-07,
"loss": 0.0005,
"reward": 3.9274110794067383,
"reward_std": 0.02352920500561595,
"rewards/answer_entity_reward": 0.9946895241737366,
"rewards/answer_wer_reward": 0.9349404275417328,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977810680866241,
"step": 178
},
{
"completion_length": 211.78125,
"epoch": 0.5728,
"grad_norm": 2.9184887409210205,
"kl": 0.031982421875,
"learning_rate": 7.775e-07,
"loss": 0.0003,
"reward": 3.945718765258789,
"reward_std": 0.01779081765562296,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9512039721012115,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973557591438293,
"step": 179
},
{
"completion_length": 204.375,
"epoch": 0.576,
"grad_norm": 113.12403869628906,
"kl": 0.05322265625,
"learning_rate": 7.7625e-07,
"loss": 0.0005,
"reward": 3.8825124502182007,
"reward_std": 0.07031127344816923,
"rewards/answer_entity_reward": 0.9926734566688538,
"rewards/answer_wer_reward": 0.9367940425872803,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9530448913574219,
"step": 180
},
{
"completion_length": 214.75,
"epoch": 0.5792,
"grad_norm": 1.3515021800994873,
"kl": 0.0609130859375,
"learning_rate": 7.75e-07,
"loss": 0.0006,
"reward": 3.920071840286255,
"reward_std": 0.011316743912175298,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9225669503211975,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975048303604126,
"step": 181
},
{
"completion_length": 205.0625,
"epoch": 0.5824,
"grad_norm": 1.5749711990356445,
"kl": 0.054443359375,
"learning_rate": 7.7375e-07,
"loss": 0.0005,
"reward": 3.921678900718689,
"reward_std": 0.013327162247151136,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9460242688655853,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9780585169792175,
"step": 182
},
{
"completion_length": 217.75,
"epoch": 0.5856,
"grad_norm": 0.7737219929695129,
"kl": 0.0469970703125,
"learning_rate": 7.724999999999999e-07,
"loss": 0.0005,
"reward": 3.9334832429885864,
"reward_std": 0.020406807772815228,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.938728004693985,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 183
},
{
"completion_length": 231.59375,
"epoch": 0.5888,
"grad_norm": 1.6825175285339355,
"kl": 0.0543212890625,
"learning_rate": 7.712499999999999e-07,
"loss": 0.0005,
"reward": 3.938681125640869,
"reward_std": 0.017365658190101385,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9413779377937317,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991414844989777,
"step": 184
},
{
"completion_length": 239.71875,
"epoch": 0.592,
"grad_norm": 1.3427449464797974,
"kl": 0.058837890625,
"learning_rate": 7.699999999999999e-07,
"loss": 0.0006,
"reward": 3.9066988229751587,
"reward_std": 0.020341036841273308,
"rewards/answer_entity_reward": 0.9776557087898254,
"rewards/answer_wer_reward": 0.929761528968811,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992816150188446,
"step": 185
},
{
"completion_length": 133.90625,
"epoch": 0.5952,
"grad_norm": 4.991705417633057,
"kl": 0.0623779296875,
"learning_rate": 7.6875e-07,
"loss": 0.0006,
"reward": 3.926753878593445,
"reward_std": 0.023914007004350424,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9629489779472351,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9638049006462097,
"step": 186
},
{
"completion_length": 234.625,
"epoch": 0.5984,
"grad_norm": 2.8712401390075684,
"kl": 0.096435546875,
"learning_rate": 7.675e-07,
"loss": 0.001,
"reward": 3.872377395629883,
"reward_std": 0.06525835767388344,
"rewards/answer_entity_reward": 0.9841803908348083,
"rewards/answer_wer_reward": 0.9093597233295441,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9788371920585632,
"step": 187
},
{
"completion_length": 225.4375,
"epoch": 0.6016,
"grad_norm": 2.3115170001983643,
"kl": 0.055419921875,
"learning_rate": 7.6625e-07,
"loss": 0.0006,
"reward": 3.9362770318984985,
"reward_std": 0.019690027460455894,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9422614872455597,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997487872838974,
"step": 188
},
{
"completion_length": 214.15625,
"epoch": 0.6048,
"grad_norm": 3.583329677581787,
"kl": 0.0550537109375,
"learning_rate": 7.65e-07,
"loss": 0.0005,
"reward": 3.9327969551086426,
"reward_std": 0.014218965079635382,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9424121379852295,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 189
},
{
"completion_length": 249.15625,
"epoch": 0.608,
"grad_norm": 1.4651848077774048,
"kl": 0.052001953125,
"learning_rate": 7.6375e-07,
"loss": 0.0005,
"reward": 3.941069722175598,
"reward_std": 0.009663278236985207,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9507163166999817,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977063536643982,
"step": 190
},
{
"completion_length": 197.84375,
"epoch": 0.6112,
"grad_norm": 1.4688224792480469,
"kl": 0.0577392578125,
"learning_rate": 7.624999999999999e-07,
"loss": 0.0006,
"reward": 3.9300395250320435,
"reward_std": 0.014806594932451844,
"rewards/answer_entity_reward": 0.984722226858139,
"rewards/answer_wer_reward": 0.9455022215843201,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9998151063919067,
"step": 191
},
{
"completion_length": 254.6875,
"epoch": 0.6144,
"grad_norm": 1.1648938655853271,
"kl": 0.0589599609375,
"learning_rate": 7.612499999999999e-07,
"loss": 0.0006,
"reward": 3.9228453636169434,
"reward_std": 0.026355463080108166,
"rewards/answer_entity_reward": 0.9819444715976715,
"rewards/answer_wer_reward": 0.9418983161449432,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990026652812958,
"step": 192
},
{
"completion_length": 264.34375,
"epoch": 0.6176,
"grad_norm": 1.2595146894454956,
"kl": 0.0635986328125,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0006,
"reward": 3.9068782329559326,
"reward_std": 0.02374061942100525,
"rewards/answer_entity_reward": 0.9758522510528564,
"rewards/answer_wer_reward": 0.9392839670181274,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9917418956756592,
"step": 193
},
{
"completion_length": 226.875,
"epoch": 0.6208,
"grad_norm": 3.0049514770507812,
"kl": 0.065185546875,
"learning_rate": 7.5875e-07,
"loss": 0.0007,
"reward": 3.9182554483413696,
"reward_std": 0.028174775652587414,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9239371716976166,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 194
},
{
"completion_length": 233.90625,
"epoch": 0.624,
"grad_norm": 3.6226987838745117,
"kl": 0.14013671875,
"learning_rate": 7.575e-07,
"loss": 0.0014,
"reward": 3.917691946029663,
"reward_std": 0.015854593832045794,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9359965324401855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99558424949646,
"step": 195
},
{
"completion_length": 228.96875,
"epoch": 0.6272,
"grad_norm": 3.1564576625823975,
"kl": 0.03131103515625,
"learning_rate": 7.5625e-07,
"loss": 0.0003,
"reward": 3.8988983631134033,
"reward_std": 0.04383570794016123,
"rewards/answer_entity_reward": 0.980654776096344,
"rewards/answer_wer_reward": 0.9372455775737762,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9809979796409607,
"step": 196
},
{
"completion_length": 235.875,
"epoch": 0.6304,
"grad_norm": 1.3267861604690552,
"kl": 0.052978515625,
"learning_rate": 7.55e-07,
"loss": 0.0005,
"reward": 3.9319225549697876,
"reward_std": 0.02372880419716239,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9346356689929962,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999690592288971,
"step": 197
},
{
"completion_length": 162.34375,
"epoch": 0.6336,
"grad_norm": 1.4438445568084717,
"kl": 0.065185546875,
"learning_rate": 7.5375e-07,
"loss": 0.0006,
"reward": 3.8535887002944946,
"reward_std": 0.041104525327682495,
"rewards/answer_entity_reward": 0.9681412279605865,
"rewards/answer_wer_reward": 0.9683326184749603,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9171150028705597,
"step": 198
},
{
"completion_length": 203.875,
"epoch": 0.6368,
"grad_norm": 4.674152374267578,
"kl": 0.050048828125,
"learning_rate": 7.524999999999999e-07,
"loss": 0.0005,
"reward": 3.938958764076233,
"reward_std": 0.01455747289583087,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9664872884750366,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.974875271320343,
"step": 199
},
{
"completion_length": 230.625,
"epoch": 0.64,
"grad_norm": 1.899129867553711,
"kl": 0.0535888671875,
"learning_rate": 7.512499999999999e-07,
"loss": 0.0005,
"reward": 3.9438642263412476,
"reward_std": 0.014077516738325357,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.952812910079956,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938922822475433,
"step": 200
},
{
"completion_length": 212.4375,
"epoch": 0.6432,
"grad_norm": 1.8970869779586792,
"kl": 0.0460205078125,
"learning_rate": 7.5e-07,
"loss": 0.0005,
"reward": 3.9026511907577515,
"reward_std": 0.038714910857379436,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.911726325750351,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992582499980927,
"step": 201
},
{
"completion_length": 203.875,
"epoch": 0.6464,
"grad_norm": 2.5214030742645264,
"kl": 0.083251953125,
"learning_rate": 7.4875e-07,
"loss": 0.0008,
"reward": 3.9040462970733643,
"reward_std": 0.016587836667895317,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9761527180671692,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9278934299945831,
"step": 202
},
{
"completion_length": 216.375,
"epoch": 0.6496,
"grad_norm": 4.072224140167236,
"kl": 0.053955078125,
"learning_rate": 7.475e-07,
"loss": 0.0005,
"reward": 3.9431036710739136,
"reward_std": 0.020094456151127815,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.949131965637207,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996812641620636,
"step": 203
},
{
"completion_length": 221.0,
"epoch": 0.6528,
"grad_norm": 3.3709828853607178,
"kl": 0.070556640625,
"learning_rate": 7.4625e-07,
"loss": 0.0007,
"reward": 3.8844679594039917,
"reward_std": 0.05386691028252244,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.934579610824585,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9498883783817291,
"step": 204
},
{
"completion_length": 195.9375,
"epoch": 0.656,
"grad_norm": 2.4978103637695312,
"kl": 0.0775146484375,
"learning_rate": 7.45e-07,
"loss": 0.0008,
"reward": 3.9303336143493652,
"reward_std": 0.04689153959043324,
"rewards/answer_entity_reward": 0.9804924428462982,
"rewards/answer_wer_reward": 0.9526000618934631,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9972410500049591,
"step": 205
},
{
"completion_length": 256.875,
"epoch": 0.6592,
"grad_norm": 2.3422584533691406,
"kl": 0.1229248046875,
"learning_rate": 7.4375e-07,
"loss": 0.0012,
"reward": 3.9243087768554688,
"reward_std": 0.019790570251643658,
"rewards/answer_entity_reward": 0.9764957129955292,
"rewards/answer_wer_reward": 0.9478131830692291,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 206
},
{
"completion_length": 204.15625,
"epoch": 0.6624,
"grad_norm": 2.19623064994812,
"kl": 0.0550537109375,
"learning_rate": 7.425e-07,
"loss": 0.0006,
"reward": 3.936911940574646,
"reward_std": 0.02031032182276249,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9463189840316772,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9905929565429688,
"step": 207
},
{
"completion_length": 225.21875,
"epoch": 0.6656,
"grad_norm": 5.279341220855713,
"kl": 0.0498046875,
"learning_rate": 7.412499999999999e-07,
"loss": 0.0005,
"reward": 3.915460228919983,
"reward_std": 0.015285669825971127,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9175935089588165,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978667497634888,
"step": 208
},
{
"completion_length": 188.78125,
"epoch": 0.6688,
"grad_norm": 3.7716915607452393,
"kl": 0.0576171875,
"learning_rate": 7.4e-07,
"loss": 0.0006,
"reward": 3.8296241760253906,
"reward_std": 0.017440371215343475,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9421272277832031,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8899007737636566,
"step": 209
},
{
"completion_length": 203.28125,
"epoch": 0.672,
"grad_norm": 1.2790639400482178,
"kl": 0.0582275390625,
"learning_rate": 7.3875e-07,
"loss": 0.0006,
"reward": 3.952346086502075,
"reward_std": 0.007349871098995209,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.969746857881546,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9825991690158844,
"step": 210
},
{
"completion_length": 196.1875,
"epoch": 0.6752,
"grad_norm": 14.005128860473633,
"kl": 0.0604248046875,
"learning_rate": 7.375e-07,
"loss": 0.0006,
"reward": 3.8537105321884155,
"reward_std": 0.012695960700511932,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9704558551311493,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8832548260688782,
"step": 211
},
{
"completion_length": 159.3125,
"epoch": 0.6784,
"grad_norm": 4.394070625305176,
"kl": 0.068115234375,
"learning_rate": 7.362499999999999e-07,
"loss": 0.0007,
"reward": 3.9123398065567017,
"reward_std": 0.02882718201726675,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9466139674186707,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.965725839138031,
"step": 212
},
{
"completion_length": 238.75,
"epoch": 0.6816,
"grad_norm": 5.395397663116455,
"kl": 0.041748046875,
"learning_rate": 7.35e-07,
"loss": 0.0004,
"reward": 3.89706289768219,
"reward_std": 0.0131816565990448,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9062366485595703,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993489682674408,
"step": 213
},
{
"completion_length": 255.65625,
"epoch": 0.6848,
"grad_norm": 1.9760891199111938,
"kl": 0.03961181640625,
"learning_rate": 7.3375e-07,
"loss": 0.0004,
"reward": 3.917116641998291,
"reward_std": 0.04898790689185262,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9182944297790527,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988220930099487,
"step": 214
},
{
"completion_length": 165.75,
"epoch": 0.688,
"grad_norm": 2.763314723968506,
"kl": 0.0577392578125,
"learning_rate": 7.325e-07,
"loss": 0.0006,
"reward": 3.952502489089966,
"reward_std": 0.016542275436222553,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9569029808044434,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990717768669128,
"step": 215
},
{
"completion_length": 215.625,
"epoch": 0.6912,
"grad_norm": 7.516313552856445,
"kl": 0.0439453125,
"learning_rate": 7.312499999999999e-07,
"loss": 0.0004,
"reward": 3.9650633335113525,
"reward_std": 0.015061032958328724,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9679040908813477,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 216
},
{
"completion_length": 227.84375,
"epoch": 0.6944,
"grad_norm": 1.8075324296951294,
"kl": 0.0511474609375,
"learning_rate": 7.3e-07,
"loss": 0.0005,
"reward": 3.9209293127059937,
"reward_std": 0.01800437457859516,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9266109764575958,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 217
},
{
"completion_length": 213.625,
"epoch": 0.6976,
"grad_norm": 5.917069911956787,
"kl": 0.0426025390625,
"learning_rate": 7.2875e-07,
"loss": 0.0004,
"reward": 3.9082109928131104,
"reward_std": 0.07417950965464115,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9089923202991486,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999218761920929,
"step": 218
},
{
"completion_length": 228.6875,
"epoch": 0.7008,
"grad_norm": 1.1044409275054932,
"kl": 0.0531005859375,
"learning_rate": 7.275e-07,
"loss": 0.0005,
"reward": 3.908870220184326,
"reward_std": 0.016815255396068096,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9117993116378784,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994747638702393,
"step": 219
},
{
"completion_length": 199.125,
"epoch": 0.704,
"grad_norm": 3.019407272338867,
"kl": 0.058837890625,
"learning_rate": 7.262499999999999e-07,
"loss": 0.0006,
"reward": 3.925763249397278,
"reward_std": 0.01313594076782465,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9272693395614624,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984939694404602,
"step": 220
},
{
"completion_length": 210.65625,
"epoch": 0.7072,
"grad_norm": 2.7719058990478516,
"kl": 0.0377197265625,
"learning_rate": 7.249999999999999e-07,
"loss": 0.0004,
"reward": 3.8708763122558594,
"reward_std": 0.028095172019675374,
"rewards/answer_entity_reward": 0.9812500178813934,
"rewards/answer_wer_reward": 0.9290285110473633,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.960597813129425,
"step": 221
},
{
"completion_length": 199.6875,
"epoch": 0.7104,
"grad_norm": 2.267350435256958,
"kl": 0.0660400390625,
"learning_rate": 7.2375e-07,
"loss": 0.0006,
"reward": 3.9580957889556885,
"reward_std": 0.03087126836180687,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9705802798271179,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9899193644523621,
"step": 222
},
{
"completion_length": 181.8125,
"epoch": 0.7136,
"grad_norm": 8.685694694519043,
"kl": 0.081787109375,
"learning_rate": 7.225e-07,
"loss": 0.0008,
"reward": 3.8902955055236816,
"reward_std": 0.011068197898566723,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9720200002193451,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9182755053043365,
"step": 223
},
{
"completion_length": 185.4375,
"epoch": 0.7168,
"grad_norm": 2.514770746231079,
"kl": 0.0609130859375,
"learning_rate": 7.212499999999999e-07,
"loss": 0.0006,
"reward": 3.9320486783981323,
"reward_std": 0.033941914327442646,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9598598778247833,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.972188800573349,
"step": 224
},
{
"completion_length": 250.84375,
"epoch": 0.72,
"grad_norm": 1.7914812564849854,
"kl": 0.03045654296875,
"learning_rate": 7.2e-07,
"loss": 0.0003,
"reward": 3.8908780813217163,
"reward_std": 0.03203156217932701,
"rewards/answer_entity_reward": 0.9678819179534912,
"rewards/answer_wer_reward": 0.9238358736038208,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991601407527924,
"step": 225
},
{
"completion_length": 249.125,
"epoch": 0.7232,
"grad_norm": 4.627202987670898,
"kl": 0.0531005859375,
"learning_rate": 7.1875e-07,
"loss": 0.0005,
"reward": 3.899629235267639,
"reward_std": 0.06726673897355795,
"rewards/answer_entity_reward": 0.9953208565711975,
"rewards/answer_wer_reward": 0.9247469902038574,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9795613586902618,
"step": 226
},
{
"completion_length": 214.40625,
"epoch": 0.7264,
"grad_norm": 1.942586064338684,
"kl": 0.0352783203125,
"learning_rate": 7.175e-07,
"loss": 0.0003,
"reward": 3.959649443626404,
"reward_std": 0.01394367078319192,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9649502038955688,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975401163101196,
"step": 227
},
{
"completion_length": 182.59375,
"epoch": 0.7296,
"grad_norm": 3.191298246383667,
"kl": 0.055419921875,
"learning_rate": 7.1625e-07,
"loss": 0.0005,
"reward": 3.9260960817337036,
"reward_std": 0.021659906953573227,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9576999247074127,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9712370932102203,
"step": 228
},
{
"completion_length": 212.53125,
"epoch": 0.7328,
"grad_norm": 1.0323834419250488,
"kl": 0.0533447265625,
"learning_rate": 7.149999999999999e-07,
"loss": 0.0005,
"reward": 3.939168095588684,
"reward_std": 0.009458722081035376,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9402457773685455,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989224076271057,
"step": 229
},
{
"completion_length": 187.8125,
"epoch": 0.736,
"grad_norm": 4.53863000869751,
"kl": 0.050537109375,
"learning_rate": 7.137499999999999e-07,
"loss": 0.0005,
"reward": 3.893386960029602,
"reward_std": 0.03008814249187708,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.9532185792922974,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.946044385433197,
"step": 230
},
{
"completion_length": 235.5,
"epoch": 0.7392,
"grad_norm": 2.1737990379333496,
"kl": 0.0477294921875,
"learning_rate": 7.125e-07,
"loss": 0.0005,
"reward": 3.8995944261550903,
"reward_std": 0.021292359568178654,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9127146005630493,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9868797659873962,
"step": 231
},
{
"completion_length": 230.625,
"epoch": 0.7424,
"grad_norm": 0.8920266628265381,
"kl": 0.02874755859375,
"learning_rate": 7.1125e-07,
"loss": 0.0003,
"reward": 3.9383678436279297,
"reward_std": 0.008275180356577039,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9394271969795227,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989406764507294,
"step": 232
},
{
"completion_length": 196.125,
"epoch": 0.7456,
"grad_norm": 2.1836190223693848,
"kl": 0.06640625,
"learning_rate": 7.1e-07,
"loss": 0.0007,
"reward": 3.9469913244247437,
"reward_std": 0.01094681373797357,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9498908519744873,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9971005320549011,
"step": 233
},
{
"completion_length": 200.6875,
"epoch": 0.7488,
"grad_norm": 1.5529507398605347,
"kl": 0.041748046875,
"learning_rate": 7.0875e-07,
"loss": 0.0004,
"reward": 3.8839221000671387,
"reward_std": 0.02069476176984608,
"rewards/answer_entity_reward": 0.9841346144676208,
"rewards/answer_wer_reward": 0.9540095031261444,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.945777952671051,
"step": 234
},
{
"completion_length": 222.90625,
"epoch": 0.752,
"grad_norm": 17.55677604675293,
"kl": 0.061767578125,
"learning_rate": 7.075e-07,
"loss": 0.0006,
"reward": 3.92560076713562,
"reward_std": 0.03323593852110207,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9402145445346832,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.989062488079071,
"step": 235
},
{
"completion_length": 195.0625,
"epoch": 0.7552,
"grad_norm": 1.7806612253189087,
"kl": 0.056640625,
"learning_rate": 7.0625e-07,
"loss": 0.0006,
"reward": 3.9366722106933594,
"reward_std": 0.02212852332741022,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9515082538127899,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9886362552642822,
"step": 236
},
{
"completion_length": 224.34375,
"epoch": 0.7584,
"grad_norm": 3.0402088165283203,
"kl": 0.0352783203125,
"learning_rate": 7.049999999999999e-07,
"loss": 0.0004,
"reward": 3.947329044342041,
"reward_std": 0.011976622510701418,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.961329847574234,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.988840252161026,
"step": 237
},
{
"completion_length": 223.53125,
"epoch": 0.7616,
"grad_norm": 2.889293670654297,
"kl": 0.0616455078125,
"learning_rate": 7.037499999999999e-07,
"loss": 0.0006,
"reward": 3.9246891736984253,
"reward_std": 0.05990536604076624,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9530621469020844,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9750992059707642,
"step": 238
},
{
"completion_length": 184.78125,
"epoch": 0.7648,
"grad_norm": 1.2427425384521484,
"kl": 0.0623779296875,
"learning_rate": 7.024999999999999e-07,
"loss": 0.0006,
"reward": 3.957573890686035,
"reward_std": 0.005278389900922775,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9649269282817841,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 239
},
{
"completion_length": 236.28125,
"epoch": 0.768,
"grad_norm": 2.361463785171509,
"kl": 0.0545654296875,
"learning_rate": 7.0125e-07,
"loss": 0.0005,
"reward": 3.9197674989700317,
"reward_std": 0.02553732506930828,
"rewards/answer_entity_reward": 0.9834134578704834,
"rewards/answer_wer_reward": 0.9363541007041931,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 240
},
{
"completion_length": 174.0,
"epoch": 0.7712,
"grad_norm": 2.3930962085723877,
"kl": 0.05926513671875,
"learning_rate": 7e-07,
"loss": 0.0006,
"reward": 3.9211114645004272,
"reward_std": 0.008784215082414448,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9724419414997101,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9486694633960724,
"step": 241
},
{
"completion_length": 254.59375,
"epoch": 0.7744,
"grad_norm": 1.6553773880004883,
"kl": 0.0389404296875,
"learning_rate": 6.9875e-07,
"loss": 0.0004,
"reward": 3.929746985435486,
"reward_std": 0.012057055719196796,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9313917756080627,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983552694320679,
"step": 242
},
{
"completion_length": 235.375,
"epoch": 0.7776,
"grad_norm": 0.8029008507728577,
"kl": 0.04083251953125,
"learning_rate": 6.975e-07,
"loss": 0.0004,
"reward": 3.9153066873550415,
"reward_std": 0.005760843865573406,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9309280216693878,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9927119016647339,
"step": 243
},
{
"completion_length": 186.78125,
"epoch": 0.7808,
"grad_norm": 3.1181294918060303,
"kl": 0.0732421875,
"learning_rate": 6.9625e-07,
"loss": 0.0007,
"reward": 3.9115726947784424,
"reward_std": 0.007224578293971717,
"rewards/answer_entity_reward": 0.9707792401313782,
"rewards/answer_wer_reward": 0.940793514251709,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 244
},
{
"completion_length": 223.6875,
"epoch": 0.784,
"grad_norm": 1.3839703798294067,
"kl": 0.0380859375,
"learning_rate": 6.949999999999999e-07,
"loss": 0.0004,
"reward": 3.9361883401870728,
"reward_std": 0.012964933644980192,
"rewards/answer_entity_reward": 0.9818618893623352,
"rewards/answer_wer_reward": 0.9550732672214508,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999253123998642,
"step": 245
},
{
"completion_length": 222.53125,
"epoch": 0.7872,
"grad_norm": 3.1735548973083496,
"kl": 0.072509765625,
"learning_rate": 6.937499999999999e-07,
"loss": 0.0007,
"reward": 3.9446396827697754,
"reward_std": 0.023095417767763138,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9603613913059235,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9946948885917664,
"step": 246
},
{
"completion_length": 217.4375,
"epoch": 0.7904,
"grad_norm": 1.185796856880188,
"kl": 0.042236328125,
"learning_rate": 6.924999999999999e-07,
"loss": 0.0004,
"reward": 3.9417611360549927,
"reward_std": 0.013147154357284307,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9470057189464569,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9947555065155029,
"step": 247
},
{
"completion_length": 240.59375,
"epoch": 0.7936,
"grad_norm": 2.088177442550659,
"kl": 0.0504150390625,
"learning_rate": 6.9125e-07,
"loss": 0.0005,
"reward": 3.9391993284225464,
"reward_std": 0.015122740995138884,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9413229823112488,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9978764653205872,
"step": 248
},
{
"completion_length": 251.8125,
"epoch": 0.7968,
"grad_norm": 1.0327165126800537,
"kl": 0.0439453125,
"learning_rate": 6.9e-07,
"loss": 0.0004,
"reward": 3.928339123725891,
"reward_std": 0.014733773190528154,
"rewards/answer_entity_reward": 0.9895104765892029,
"rewards/answer_wer_reward": 0.9401907324790955,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986380338668823,
"step": 249
},
{
"completion_length": 202.125,
"epoch": 0.8,
"grad_norm": 1.0536175966262817,
"kl": 0.0443115234375,
"learning_rate": 6.8875e-07,
"loss": 0.0004,
"reward": 3.9324183464050293,
"reward_std": 0.018241871614009142,
"rewards/answer_entity_reward": 0.9873737692832947,
"rewards/answer_wer_reward": 0.9567070603370667,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9883374869823456,
"step": 250
},
{
"completion_length": 231.59375,
"epoch": 0.8032,
"grad_norm": 1.8605543375015259,
"kl": 0.0467529296875,
"learning_rate": 6.875e-07,
"loss": 0.0005,
"reward": 3.9515386819839478,
"reward_std": 0.014535096473991871,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9524115920066833,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991269707679749,
"step": 251
},
{
"completion_length": 202.8125,
"epoch": 0.8064,
"grad_norm": 1.7101868391036987,
"kl": 0.0673828125,
"learning_rate": 6.8625e-07,
"loss": 0.0007,
"reward": 3.947361946105957,
"reward_std": 0.01079330500215292,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9485193192958832,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988425970077515,
"step": 252
},
{
"completion_length": 194.4375,
"epoch": 0.8096,
"grad_norm": 1.6060519218444824,
"kl": 0.0518798828125,
"learning_rate": 6.85e-07,
"loss": 0.0005,
"reward": 3.8238483667373657,
"reward_std": 0.09831315139308572,
"rewards/answer_entity_reward": 0.9366161823272705,
"rewards/answer_wer_reward": 0.888142466545105,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990898072719574,
"step": 253
},
{
"completion_length": 231.71875,
"epoch": 0.8128,
"grad_norm": 1.4323464632034302,
"kl": 0.04559326171875,
"learning_rate": 6.837499999999999e-07,
"loss": 0.0005,
"reward": 3.9585113525390625,
"reward_std": 0.009139138273894787,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9591011703014374,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994101524353027,
"step": 254
},
{
"completion_length": 242.15625,
"epoch": 0.816,
"grad_norm": 1.638405442237854,
"kl": 0.0592041015625,
"learning_rate": 6.824999999999999e-07,
"loss": 0.0006,
"reward": 3.938191056251526,
"reward_std": 0.015181098598986864,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9465242922306061,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 255
},
{
"completion_length": 178.96875,
"epoch": 0.8192,
"grad_norm": 2.906489133834839,
"kl": 0.07958984375,
"learning_rate": 6.8125e-07,
"loss": 0.0008,
"reward": 3.9418115615844727,
"reward_std": 0.024727396899834275,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9549268186092377,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9925665557384491,
"step": 256
},
{
"completion_length": 191.59375,
"epoch": 0.8224,
"grad_norm": 4.772871494293213,
"kl": 0.271484375,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0027,
"reward": 3.9085776805877686,
"reward_std": 0.01904244115576148,
"rewards/answer_entity_reward": 0.9866071343421936,
"rewards/answer_wer_reward": 0.9542762637138367,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9676942825317383,
"step": 257
},
{
"completion_length": 192.78125,
"epoch": 0.8256,
"grad_norm": 2.3399181365966797,
"kl": 0.081787109375,
"learning_rate": 6.7875e-07,
"loss": 0.0008,
"reward": 3.930221199989319,
"reward_std": 0.014671812066808343,
"rewards/answer_entity_reward": 0.9867201447486877,
"rewards/answer_wer_reward": 0.9438917338848114,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996093809604645,
"step": 258
},
{
"completion_length": 187.5,
"epoch": 0.8288,
"grad_norm": 9.805069923400879,
"kl": 0.072265625,
"learning_rate": 6.775e-07,
"loss": 0.0007,
"reward": 3.939017653465271,
"reward_std": 0.016680479515343904,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9445324242115021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 259
},
{
"completion_length": 234.5625,
"epoch": 0.832,
"grad_norm": 1.5217561721801758,
"kl": 0.0516357421875,
"learning_rate": 6.7625e-07,
"loss": 0.0005,
"reward": 3.922031283378601,
"reward_std": 0.01609009224921465,
"rewards/answer_entity_reward": 0.9681277275085449,
"rewards/answer_wer_reward": 0.9539035856723785,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 260
},
{
"completion_length": 159.0,
"epoch": 0.8352,
"grad_norm": 2.5927042961120605,
"kl": 0.0557861328125,
"learning_rate": 6.75e-07,
"loss": 0.0006,
"reward": 3.9503369331359863,
"reward_std": 0.004757039016112685,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9792385697364807,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9710983335971832,
"step": 261
},
{
"completion_length": 222.15625,
"epoch": 0.8384,
"grad_norm": 1.9485008716583252,
"kl": 0.0928955078125,
"learning_rate": 6.737499999999999e-07,
"loss": 0.0009,
"reward": 3.9718098640441895,
"reward_std": 0.01134553411975503,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9718098938465118,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 262
},
{
"completion_length": 248.875,
"epoch": 0.8416,
"grad_norm": 5.045698165893555,
"kl": 0.0552978515625,
"learning_rate": 6.724999999999999e-07,
"loss": 0.0006,
"reward": 3.799831986427307,
"reward_std": 0.03707320708781481,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9218086004257202,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.883705198764801,
"step": 263
},
{
"completion_length": 157.6875,
"epoch": 0.8448,
"grad_norm": 1.9603397846221924,
"kl": 0.14111328125,
"learning_rate": 6.7125e-07,
"loss": 0.0014,
"reward": 3.9334217309951782,
"reward_std": 0.00959050771780312,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9538573622703552,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.987897664308548,
"step": 264
},
{
"completion_length": 249.21875,
"epoch": 0.848,
"grad_norm": 1.720057725906372,
"kl": 0.102783203125,
"learning_rate": 6.7e-07,
"loss": 0.001,
"reward": 3.9404491186141968,
"reward_std": 0.023797483183443546,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9459458291530609,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997479915618896,
"step": 265
},
{
"completion_length": 200.65625,
"epoch": 0.8512,
"grad_norm": 1.7017474174499512,
"kl": 0.06640625,
"learning_rate": 6.6875e-07,
"loss": 0.0007,
"reward": 3.897473454475403,
"reward_std": 0.017802401445806026,
"rewards/answer_entity_reward": 0.9892628192901611,
"rewards/answer_wer_reward": 0.9560422301292419,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.952168345451355,
"step": 266
},
{
"completion_length": 206.9375,
"epoch": 0.8544,
"grad_norm": 1.7645119428634644,
"kl": 0.107177734375,
"learning_rate": 6.675e-07,
"loss": 0.0011,
"reward": 3.919585347175598,
"reward_std": 0.017358362209051847,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9206817746162415,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989035129547119,
"step": 267
},
{
"completion_length": 234.125,
"epoch": 0.8576,
"grad_norm": 2.324972629547119,
"kl": 0.07275390625,
"learning_rate": 6.6625e-07,
"loss": 0.0007,
"reward": 3.8366565704345703,
"reward_std": 0.03994511067867279,
"rewards/answer_entity_reward": 0.9375,
"rewards/answer_wer_reward": 0.9288243353366852,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9703322649002075,
"step": 268
},
{
"completion_length": 163.28125,
"epoch": 0.8608,
"grad_norm": 3.44211483001709,
"kl": 0.07080078125,
"learning_rate": 6.65e-07,
"loss": 0.0007,
"reward": 3.8973175287246704,
"reward_std": 0.051633019000291824,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9550660252571106,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9457237124443054,
"step": 269
},
{
"completion_length": 198.0625,
"epoch": 0.864,
"grad_norm": 5.092156887054443,
"kl": 0.072998046875,
"learning_rate": 6.637499999999999e-07,
"loss": 0.0007,
"reward": 3.940290689468384,
"reward_std": 0.009564612759277225,
"rewards/answer_entity_reward": 0.9821428656578064,
"rewards/answer_wer_reward": 0.958147794008255,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 270
},
{
"completion_length": 138.875,
"epoch": 0.8672,
"grad_norm": 3.998215913772583,
"kl": 0.05889892578125,
"learning_rate": 6.624999999999999e-07,
"loss": 0.0006,
"reward": 3.9329700469970703,
"reward_std": 0.05405183229595423,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9581792652606964,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9782631099224091,
"step": 271
},
{
"completion_length": 208.53125,
"epoch": 0.8704,
"grad_norm": 2.191901206970215,
"kl": 0.06884765625,
"learning_rate": 6.6125e-07,
"loss": 0.0007,
"reward": 3.956714630126953,
"reward_std": 0.01909107668325305,
"rewards/answer_entity_reward": 0.993686854839325,
"rewards/answer_wer_reward": 0.9632268249988556,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9998009502887726,
"step": 272
},
{
"completion_length": 196.71875,
"epoch": 0.8736,
"grad_norm": 3.2068357467651367,
"kl": 0.0513916015625,
"learning_rate": 6.6e-07,
"loss": 0.0005,
"reward": 3.9089767932891846,
"reward_std": 0.035889009945094585,
"rewards/answer_entity_reward": 0.9902777671813965,
"rewards/answer_wer_reward": 0.934887707233429,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9838112890720367,
"step": 273
},
{
"completion_length": 238.03125,
"epoch": 0.8768,
"grad_norm": 12.858990669250488,
"kl": 0.0513916015625,
"learning_rate": 6.587499999999999e-07,
"loss": 0.0005,
"reward": 3.9507744312286377,
"reward_std": 0.012679634615778923,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9518805146217346,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988937973976135,
"step": 274
},
{
"completion_length": 215.03125,
"epoch": 0.88,
"grad_norm": 6.914164066314697,
"kl": 0.053466796875,
"learning_rate": 6.575e-07,
"loss": 0.0005,
"reward": 3.920554757118225,
"reward_std": 0.01066223531961441,
"rewards/answer_entity_reward": 0.9821428656578064,
"rewards/answer_wer_reward": 0.9384119212627411,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 275
},
{
"completion_length": 170.3125,
"epoch": 0.8832,
"grad_norm": 1.4424182176589966,
"kl": 0.0533447265625,
"learning_rate": 6.5625e-07,
"loss": 0.0005,
"reward": 3.8676129579544067,
"reward_std": 0.015859364066272974,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9279236793518066,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9396892189979553,
"step": 276
},
{
"completion_length": 203.0,
"epoch": 0.8864,
"grad_norm": 1.4304486513137817,
"kl": 0.040771484375,
"learning_rate": 6.55e-07,
"loss": 0.0004,
"reward": 3.9131808280944824,
"reward_std": 0.020121398381888866,
"rewards/answer_entity_reward": 0.9930555820465088,
"rewards/answer_wer_reward": 0.9201253056526184,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 277
},
{
"completion_length": 199.9375,
"epoch": 0.8896,
"grad_norm": 4.607363700866699,
"kl": 0.0810546875,
"learning_rate": 6.5375e-07,
"loss": 0.0008,
"reward": 3.9438611268997192,
"reward_std": 0.014630983117967844,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9560317695140839,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.989912748336792,
"step": 278
},
{
"completion_length": 215.75,
"epoch": 0.8928,
"grad_norm": 0.9500401020050049,
"kl": 0.0498046875,
"learning_rate": 6.524999999999999e-07,
"loss": 0.0005,
"reward": 3.9393136501312256,
"reward_std": 0.010870016179978848,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9396113157272339,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997023940086365,
"step": 279
},
{
"completion_length": 211.4375,
"epoch": 0.896,
"grad_norm": 2.4634454250335693,
"kl": 0.08154296875,
"learning_rate": 6.5125e-07,
"loss": 0.0008,
"reward": 3.8559117317199707,
"reward_std": 0.020915272179991007,
"rewards/answer_entity_reward": 0.9944444298744202,
"rewards/answer_wer_reward": 0.9251176416873932,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9363496899604797,
"step": 280
},
{
"completion_length": 172.8125,
"epoch": 0.8992,
"grad_norm": 5.569718360900879,
"kl": 0.1357421875,
"learning_rate": 6.5e-07,
"loss": 0.0014,
"reward": 3.87375545501709,
"reward_std": 0.04026831593364477,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9294662475585938,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9442892372608185,
"step": 281
},
{
"completion_length": 114.875,
"epoch": 0.9024,
"grad_norm": 4.26852560043335,
"kl": 0.053955078125,
"learning_rate": 6.4875e-07,
"loss": 0.0005,
"reward": 3.909887909889221,
"reward_std": 0.015241059940308332,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9791332483291626,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9335956573486328,
"step": 282
},
{
"completion_length": 245.0,
"epoch": 0.9056,
"grad_norm": 1.3898316621780396,
"kl": 0.0450439453125,
"learning_rate": 6.474999999999999e-07,
"loss": 0.0005,
"reward": 3.9195964336395264,
"reward_std": 0.018749097362160683,
"rewards/answer_entity_reward": 0.9911437332630157,
"rewards/answer_wer_reward": 0.9284527003765106,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 283
},
{
"completion_length": 218.75,
"epoch": 0.9088,
"grad_norm": 4.705906391143799,
"kl": 0.0338134765625,
"learning_rate": 6.4625e-07,
"loss": 0.0003,
"reward": 3.9526829719543457,
"reward_std": 0.012810520827770233,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9526830613613129,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 284
},
{
"completion_length": 175.15625,
"epoch": 0.912,
"grad_norm": 1.7440683841705322,
"kl": 0.0616455078125,
"learning_rate": 6.45e-07,
"loss": 0.0006,
"reward": 3.9307706356048584,
"reward_std": 0.014890296617522836,
"rewards/answer_entity_reward": 0.9845238327980042,
"rewards/answer_wer_reward": 0.9668512642383575,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9793955981731415,
"step": 285
},
{
"completion_length": 154.3125,
"epoch": 0.9152,
"grad_norm": 2.3717188835144043,
"kl": 0.0599365234375,
"learning_rate": 6.4375e-07,
"loss": 0.0006,
"reward": 3.9156084060668945,
"reward_std": 0.013419507071375847,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.951806515455246,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9638019800186157,
"step": 286
},
{
"completion_length": 226.40625,
"epoch": 0.9184,
"grad_norm": 2.069488525390625,
"kl": 0.058349609375,
"learning_rate": 6.424999999999999e-07,
"loss": 0.0006,
"reward": 3.8257880210876465,
"reward_std": 0.023342549800872803,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9156993925571442,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9186112284660339,
"step": 287
},
{
"completion_length": 203.21875,
"epoch": 0.9216,
"grad_norm": 1.8522766828536987,
"kl": 0.0611572265625,
"learning_rate": 6.4125e-07,
"loss": 0.0006,
"reward": 3.9413124322891235,
"reward_std": 0.014133658958598971,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9447846114635468,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 288
},
{
"completion_length": 182.90625,
"epoch": 0.9248,
"grad_norm": 3.1601576805114746,
"kl": 0.0626220703125,
"learning_rate": 6.4e-07,
"loss": 0.0006,
"reward": 3.934013605117798,
"reward_std": 0.020497526740655303,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9598910510540009,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.988011360168457,
"step": 289
},
{
"completion_length": 235.71875,
"epoch": 0.928,
"grad_norm": 1.5299009084701538,
"kl": 0.062744140625,
"learning_rate": 6.3875e-07,
"loss": 0.0006,
"reward": 3.900187373161316,
"reward_std": 0.027182841673493385,
"rewards/answer_entity_reward": 0.9859217405319214,
"rewards/answer_wer_reward": 0.9156533181667328,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986123144626617,
"step": 290
},
{
"completion_length": 181.59375,
"epoch": 0.9312,
"grad_norm": 2.8708431720733643,
"kl": 0.09375,
"learning_rate": 6.374999999999999e-07,
"loss": 0.0009,
"reward": 3.878863215446472,
"reward_std": 0.016461022198200226,
"rewards/answer_entity_reward": 0.9607954621315002,
"rewards/answer_wer_reward": 0.9469051957130432,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9711625277996063,
"step": 291
},
{
"completion_length": 252.71875,
"epoch": 0.9344,
"grad_norm": 1.3821316957473755,
"kl": 0.143798828125,
"learning_rate": 6.362499999999999e-07,
"loss": 0.0014,
"reward": 3.9444687366485596,
"reward_std": 0.015690275467932224,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9486355781555176,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 292
},
{
"completion_length": 191.5,
"epoch": 0.9376,
"grad_norm": 3.0700418949127197,
"kl": 0.08984375,
"learning_rate": 6.35e-07,
"loss": 0.0009,
"reward": 3.9288469552993774,
"reward_std": 0.025998966302722692,
"rewards/answer_entity_reward": 0.9910714626312256,
"rewards/answer_wer_reward": 0.9580896496772766,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.97968590259552,
"step": 293
},
{
"completion_length": 236.40625,
"epoch": 0.9408,
"grad_norm": 0.9392086863517761,
"kl": 0.0728759765625,
"learning_rate": 6.3375e-07,
"loss": 0.0007,
"reward": 3.9576098918914795,
"reward_std": 0.004891619086265564,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9576099216938019,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 294
},
{
"completion_length": 204.125,
"epoch": 0.944,
"grad_norm": 1.4554882049560547,
"kl": 0.044677734375,
"learning_rate": 6.324999999999999e-07,
"loss": 0.0004,
"reward": 3.9175373315811157,
"reward_std": 0.008688606787472963,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9530804753303528,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9644568264484406,
"step": 295
},
{
"completion_length": 230.8125,
"epoch": 0.9472,
"grad_norm": 0.7801051139831543,
"kl": 0.0537109375,
"learning_rate": 6.3125e-07,
"loss": 0.0005,
"reward": 3.941986918449402,
"reward_std": 0.011714181862771511,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9448631405830383,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987684786319733,
"step": 296
},
{
"completion_length": 201.6875,
"epoch": 0.9504,
"grad_norm": 3.2697925567626953,
"kl": 0.0723876953125,
"learning_rate": 6.3e-07,
"loss": 0.0007,
"reward": 3.9148101806640625,
"reward_std": 0.02096148394048214,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9371316432952881,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9821428656578064,
"step": 297
},
{
"completion_length": 174.71875,
"epoch": 0.9536,
"grad_norm": 1.3895010948181152,
"kl": 0.072509765625,
"learning_rate": 6.2875e-07,
"loss": 0.0007,
"reward": 3.9413623809814453,
"reward_std": 0.012068473850376904,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.96162348985672,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9821428656578064,
"step": 298
},
{
"completion_length": 226.53125,
"epoch": 0.9568,
"grad_norm": 0.9915501475334167,
"kl": 0.0574951171875,
"learning_rate": 6.274999999999999e-07,
"loss": 0.0006,
"reward": 3.9342339038848877,
"reward_std": 0.017138528637588024,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9342339336872101,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 299
},
{
"completion_length": 185.96875,
"epoch": 0.96,
"grad_norm": 2.181473970413208,
"kl": 0.0693359375,
"learning_rate": 6.262499999999999e-07,
"loss": 0.0007,
"reward": 3.8075177669525146,
"reward_std": 0.008563205134123564,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.974321037530899,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8331968486309052,
"step": 300
},
{
"completion_length": 259.4375,
"epoch": 0.9632,
"grad_norm": 0.8825593590736389,
"kl": 0.053955078125,
"learning_rate": 6.249999999999999e-07,
"loss": 0.0005,
"reward": 3.9282361268997192,
"reward_std": 0.01493215560913086,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9290694296360016,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991666674613953,
"step": 301
},
{
"completion_length": 233.4375,
"epoch": 0.9664,
"grad_norm": 2.377093553543091,
"kl": 0.08251953125,
"learning_rate": 6.2375e-07,
"loss": 0.0008,
"reward": 3.8652896881103516,
"reward_std": 0.04854640178382397,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.931235283613205,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9392991065979004,
"step": 302
},
{
"completion_length": 214.9375,
"epoch": 0.9696,
"grad_norm": 2.7887818813323975,
"kl": 0.0765380859375,
"learning_rate": 6.225000000000001e-07,
"loss": 0.0008,
"reward": 3.916442394256592,
"reward_std": 0.014312040992081165,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9577742516994476,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9586680829524994,
"step": 303
},
{
"completion_length": 195.53125,
"epoch": 0.9728,
"grad_norm": 1.3930556774139404,
"kl": 0.0662841796875,
"learning_rate": 6.2125e-07,
"loss": 0.0007,
"reward": 3.8324824571609497,
"reward_std": 0.013787610223516822,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9709192514419556,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8672450482845306,
"step": 304
},
{
"completion_length": 221.65625,
"epoch": 0.976,
"grad_norm": 1.6060283184051514,
"kl": 0.046875,
"learning_rate": 6.2e-07,
"loss": 0.0005,
"reward": 3.9341059923171997,
"reward_std": 0.016552825924009085,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9438435733318329,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9902624487876892,
"step": 305
},
{
"completion_length": 274.4375,
"epoch": 0.9792,
"grad_norm": 2.2774875164031982,
"kl": 0.0582275390625,
"learning_rate": 6.1875e-07,
"loss": 0.0006,
"reward": 3.8809224367141724,
"reward_std": 0.03468186687678099,
"rewards/answer_entity_reward": 0.9755851626396179,
"rewards/answer_wer_reward": 0.9063642621040344,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989729523658752,
"step": 306
},
{
"completion_length": 243.875,
"epoch": 0.9824,
"grad_norm": 1.4776897430419922,
"kl": 0.0865478515625,
"learning_rate": 6.175e-07,
"loss": 0.0009,
"reward": 3.921198606491089,
"reward_std": 0.029711266048252583,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9262239336967468,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984469413757324,
"step": 307
},
{
"completion_length": 230.6875,
"epoch": 0.9856,
"grad_norm": 0.8870422840118408,
"kl": 0.0528564453125,
"learning_rate": 6.162499999999999e-07,
"loss": 0.0005,
"reward": 3.9468624591827393,
"reward_std": 0.010126703884452581,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9468623399734497,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 308
},
{
"completion_length": 193.53125,
"epoch": 0.9888,
"grad_norm": 1.2648320198059082,
"kl": 0.0474853515625,
"learning_rate": 6.149999999999999e-07,
"loss": 0.0005,
"reward": 3.9692437648773193,
"reward_std": 0.010907594813033938,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9716475903987885,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 309
},
{
"completion_length": 226.84375,
"epoch": 0.992,
"grad_norm": 2.5334410667419434,
"kl": 0.099609375,
"learning_rate": 6.1375e-07,
"loss": 0.001,
"reward": 3.932776689529419,
"reward_std": 0.025886752177029848,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9474222362041473,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9916044771671295,
"step": 310
},
{
"completion_length": 202.40625,
"epoch": 0.9952,
"grad_norm": 1.6191986799240112,
"kl": 0.059326171875,
"learning_rate": 6.125000000000001e-07,
"loss": 0.0006,
"reward": 3.923641085624695,
"reward_std": 0.016786989755928516,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9264820218086243,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 311
},
{
"completion_length": 226.125,
"epoch": 0.9984,
"grad_norm": 2.3516252040863037,
"kl": 0.0587158203125,
"learning_rate": 6.1125e-07,
"loss": 0.0006,
"reward": 3.822533130645752,
"reward_std": 0.19381592608988285,
"rewards/answer_entity_reward": 0.9630681872367859,
"rewards/answer_wer_reward": 0.8977905511856079,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9929245114326477,
"step": 312
},
{
"completion_length": 164.4375,
"epoch": 1.0,
"grad_norm": 9.48376178741455,
"kl": 0.04345703125,
"learning_rate": 6.1e-07,
"loss": 0.0002,
"reward": 3.9722466468811035,
"reward_std": 0.021218769252300262,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9880585074424744,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980769157409668,
"step": 313
},
{
"completion_length": 194.0625,
"epoch": 1.0032,
"grad_norm": 1.5969237089157104,
"kl": 0.0419921875,
"learning_rate": 6.0875e-07,
"loss": 0.0004,
"reward": 3.9741499423980713,
"reward_std": 0.00955872773192823,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9776757061481476,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985576868057251,
"step": 314
},
{
"completion_length": 174.25,
"epoch": 1.0064,
"grad_norm": 5.0026326179504395,
"kl": 0.07470703125,
"learning_rate": 6.075e-07,
"loss": 0.0007,
"reward": 3.9532389640808105,
"reward_std": 0.01782281370833516,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9582388997077942,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9950000047683716,
"step": 315
},
{
"completion_length": 218.3125,
"epoch": 1.0096,
"grad_norm": 1.521260142326355,
"kl": 0.072509765625,
"learning_rate": 6.062499999999999e-07,
"loss": 0.0007,
"reward": 3.891371011734009,
"reward_std": 0.037183830980211496,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9465020596981049,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9496767222881317,
"step": 316
},
{
"completion_length": 181.21875,
"epoch": 1.0128,
"grad_norm": 2.444070339202881,
"kl": 0.1011962890625,
"learning_rate": 6.049999999999999e-07,
"loss": 0.001,
"reward": 3.957024097442627,
"reward_std": 0.015732225496321917,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9627059102058411,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 317
},
{
"completion_length": 214.8125,
"epoch": 1.016,
"grad_norm": 5.038032054901123,
"kl": 0.081298828125,
"learning_rate": 6.037499999999999e-07,
"loss": 0.0008,
"reward": 3.905093193054199,
"reward_std": 0.02073481073603034,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9350383579730988,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.97005495429039,
"step": 318
},
{
"completion_length": 209.8125,
"epoch": 1.0192,
"grad_norm": 3.9700140953063965,
"kl": 0.07373046875,
"learning_rate": 6.025000000000001e-07,
"loss": 0.0007,
"reward": 3.8465429544448853,
"reward_std": 0.044920976273715496,
"rewards/answer_entity_reward": 0.953125,
"rewards/answer_wer_reward": 0.935539960861206,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9578781127929688,
"step": 319
},
{
"completion_length": 242.8125,
"epoch": 1.0224,
"grad_norm": 1.1018257141113281,
"kl": 0.0404052734375,
"learning_rate": 6.0125e-07,
"loss": 0.0004,
"reward": 3.9351298809051514,
"reward_std": 0.00889231264591217,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9503234028816223,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986952543258667,
"step": 320
},
{
"completion_length": 178.65625,
"epoch": 1.0256,
"grad_norm": 1.2945948839187622,
"kl": 0.059326171875,
"learning_rate": 6e-07,
"loss": 0.0006,
"reward": 3.9444717168807983,
"reward_std": 0.010739851742982864,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9468754827976227,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 321
},
{
"completion_length": 158.75,
"epoch": 1.0288,
"grad_norm": 1.9997080564498901,
"kl": 0.10498046875,
"learning_rate": 5.9875e-07,
"loss": 0.001,
"reward": 3.8997615575790405,
"reward_std": 0.0878201499581337,
"rewards/answer_entity_reward": 0.9768981039524078,
"rewards/answer_wer_reward": 0.9317395091056824,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9911239445209503,
"step": 322
},
{
"completion_length": 202.78125,
"epoch": 1.032,
"grad_norm": 2.5343425273895264,
"kl": 0.047119140625,
"learning_rate": 5.975e-07,
"loss": 0.0005,
"reward": 3.9625836610794067,
"reward_std": 0.0073791013564914465,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9652430713176727,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973404407501221,
"step": 323
},
{
"completion_length": 181.9375,
"epoch": 1.0352,
"grad_norm": 7.240401744842529,
"kl": 0.067138671875,
"learning_rate": 5.962499999999999e-07,
"loss": 0.0007,
"reward": 3.828685760498047,
"reward_std": 0.04627671558409929,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.951274037361145,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.882219523191452,
"step": 324
},
{
"completion_length": 209.75,
"epoch": 1.0384,
"grad_norm": 2.1784214973449707,
"kl": 0.0810546875,
"learning_rate": 5.949999999999999e-07,
"loss": 0.0008,
"reward": 3.9578659534454346,
"reward_std": 0.015447806101292372,
"rewards/answer_entity_reward": 0.9947552382946014,
"rewards/answer_wer_reward": 0.9634187519550323,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996921122074127,
"step": 325
},
{
"completion_length": 200.78125,
"epoch": 1.0416,
"grad_norm": 1.8993250131607056,
"kl": 0.086669921875,
"learning_rate": 5.937499999999999e-07,
"loss": 0.0009,
"reward": 3.9622350931167603,
"reward_std": 0.011172362137585878,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9622350335121155,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 326
},
{
"completion_length": 188.0625,
"epoch": 1.0448,
"grad_norm": 2.999244213104248,
"kl": 0.04931640625,
"learning_rate": 5.925e-07,
"loss": 0.0005,
"reward": 3.8658429384231567,
"reward_std": 0.027352653443813324,
"rewards/answer_entity_reward": 0.9859203398227692,
"rewards/answer_wer_reward": 0.9490468800067902,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9308757185935974,
"step": 327
},
{
"completion_length": 211.6875,
"epoch": 1.048,
"grad_norm": 1.4307529926300049,
"kl": 0.06982421875,
"learning_rate": 5.912500000000001e-07,
"loss": 0.0007,
"reward": 3.8813902139663696,
"reward_std": 0.015089725144207478,
"rewards/answer_entity_reward": 0.9800595343112946,
"rewards/answer_wer_reward": 0.9558005034923553,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9455301761627197,
"step": 328
},
{
"completion_length": 184.1875,
"epoch": 1.0512,
"grad_norm": 1.9804878234863281,
"kl": 0.03851318359375,
"learning_rate": 5.9e-07,
"loss": 0.0004,
"reward": 3.9403220415115356,
"reward_std": 0.025673750409623608,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.94679394364357,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994041919708252,
"step": 329
},
{
"completion_length": 200.71875,
"epoch": 1.0544,
"grad_norm": 1.5184144973754883,
"kl": 0.06689453125,
"learning_rate": 5.8875e-07,
"loss": 0.0007,
"reward": 3.945325493812561,
"reward_std": 0.021944692358374596,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.951007217168808,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 330
},
{
"completion_length": 211.875,
"epoch": 1.0576,
"grad_norm": 1.228079915046692,
"kl": 0.052978515625,
"learning_rate": 5.875e-07,
"loss": 0.0005,
"reward": 3.9120590686798096,
"reward_std": 0.015080507844686508,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.912059098482132,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 331
},
{
"completion_length": 240.5625,
"epoch": 1.0608,
"grad_norm": 1.7073534727096558,
"kl": 0.1005859375,
"learning_rate": 5.8625e-07,
"loss": 0.001,
"reward": 3.943448066711426,
"reward_std": 0.010788221377879381,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9437373280525208,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997106492519379,
"step": 332
},
{
"completion_length": 217.78125,
"epoch": 1.064,
"grad_norm": 1.9268385171890259,
"kl": 0.0440673828125,
"learning_rate": 5.849999999999999e-07,
"loss": 0.0004,
"reward": 3.9603058099746704,
"reward_std": 0.009590512840077281,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9625644087791443,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977414906024933,
"step": 333
},
{
"completion_length": 188.125,
"epoch": 1.0672,
"grad_norm": 0.780636727809906,
"kl": 0.04638671875,
"learning_rate": 5.837499999999999e-07,
"loss": 0.0005,
"reward": 3.949649691581726,
"reward_std": 0.0076717507326975465,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9496497213840485,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 334
},
{
"completion_length": 240.71875,
"epoch": 1.0704,
"grad_norm": 21.118270874023438,
"kl": 0.04296875,
"learning_rate": 5.825e-07,
"loss": 0.0004,
"reward": 3.968227982521057,
"reward_std": 0.01375247398391366,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9715853631496429,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996642529964447,
"step": 335
},
{
"completion_length": 251.21875,
"epoch": 1.0735999999999999,
"grad_norm": 1.0980618000030518,
"kl": 0.0467529296875,
"learning_rate": 5.8125e-07,
"loss": 0.0005,
"reward": 3.9321502447128296,
"reward_std": 0.02487938292324543,
"rewards/answer_entity_reward": 0.987500011920929,
"rewards/answer_wer_reward": 0.945962131023407,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986882209777832,
"step": 336
},
{
"completion_length": 191.0,
"epoch": 1.0768,
"grad_norm": 1.9901342391967773,
"kl": 0.1015625,
"learning_rate": 5.8e-07,
"loss": 0.001,
"reward": 3.860186219215393,
"reward_std": 0.008080802159383893,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9668596386909485,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8933265209197998,
"step": 337
},
{
"completion_length": 222.40625,
"epoch": 1.08,
"grad_norm": 1.9760770797729492,
"kl": 0.0791015625,
"learning_rate": 5.7875e-07,
"loss": 0.0008,
"reward": 3.943527340888977,
"reward_std": 0.013376505114138126,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.950738936662674,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 338
},
{
"completion_length": 242.75,
"epoch": 1.0832,
"grad_norm": 1.4690314531326294,
"kl": 0.0699462890625,
"learning_rate": 5.775e-07,
"loss": 0.0007,
"reward": 3.946296215057373,
"reward_std": 0.010936432983726263,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.946296215057373,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 339
},
{
"completion_length": 213.75,
"epoch": 1.0864,
"grad_norm": 1.3006911277770996,
"kl": 0.068603515625,
"learning_rate": 5.7625e-07,
"loss": 0.0007,
"reward": 3.929935932159424,
"reward_std": 0.012226814404129982,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9303079545497894,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996279776096344,
"step": 340
},
{
"completion_length": 203.875,
"epoch": 1.0896,
"grad_norm": 20.699094772338867,
"kl": 0.0606689453125,
"learning_rate": 5.749999999999999e-07,
"loss": 0.0006,
"reward": 3.839663863182068,
"reward_std": 0.2153539047576487,
"rewards/answer_entity_reward": 0.9632352888584137,
"rewards/answer_wer_reward": 0.9303349256515503,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.977343738079071,
"step": 341
},
{
"completion_length": 229.9375,
"epoch": 1.0928,
"grad_norm": 10.713321685791016,
"kl": 0.062255859375,
"learning_rate": 5.737499999999999e-07,
"loss": 0.0006,
"reward": 3.952810525894165,
"reward_std": 0.013096342328935862,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9535458087921143,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992647171020508,
"step": 342
},
{
"completion_length": 226.0625,
"epoch": 1.096,
"grad_norm": 5.412719249725342,
"kl": 0.068115234375,
"learning_rate": 5.725e-07,
"loss": 0.0007,
"reward": 3.9290108680725098,
"reward_std": 0.014630899764597416,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9352608323097229,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9937500059604645,
"step": 343
},
{
"completion_length": 180.875,
"epoch": 1.0992,
"grad_norm": 1.5433329343795776,
"kl": 0.046875,
"learning_rate": 5.7125e-07,
"loss": 0.0005,
"reward": 3.9217172861099243,
"reward_std": 0.007004068233072758,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9350151419639587,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9867021441459656,
"step": 344
},
{
"completion_length": 228.5625,
"epoch": 1.1024,
"grad_norm": 1.6970151662826538,
"kl": 0.058837890625,
"learning_rate": 5.699999999999999e-07,
"loss": 0.0006,
"reward": 3.9185184240341187,
"reward_std": 0.013168168719857931,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9197319746017456,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987863898277283,
"step": 345
},
{
"completion_length": 155.34375,
"epoch": 1.1056,
"grad_norm": 1.7489057779312134,
"kl": 0.0869140625,
"learning_rate": 5.6875e-07,
"loss": 0.0009,
"reward": 3.9059561491012573,
"reward_std": 0.00622332957573235,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9627758860588074,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9431802928447723,
"step": 346
},
{
"completion_length": 173.40625,
"epoch": 1.1088,
"grad_norm": 1.3873649835586548,
"kl": 0.09033203125,
"learning_rate": 5.675e-07,
"loss": 0.0009,
"reward": 3.9297943115234375,
"reward_std": 0.039116960018873215,
"rewards/answer_entity_reward": 0.9826389253139496,
"rewards/answer_wer_reward": 0.9575237333774567,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9896316528320312,
"step": 347
},
{
"completion_length": 210.3125,
"epoch": 1.112,
"grad_norm": 3.549527645111084,
"kl": 0.0986328125,
"learning_rate": 5.6625e-07,
"loss": 0.001,
"reward": 3.9249199628829956,
"reward_std": 0.019829558208584785,
"rewards/answer_entity_reward": 0.9842728972434998,
"rewards/answer_wer_reward": 0.9483617842197418,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9922854006290436,
"step": 348
},
{
"completion_length": 210.21875,
"epoch": 1.1152,
"grad_norm": 1.7917331457138062,
"kl": 0.0712890625,
"learning_rate": 5.649999999999999e-07,
"loss": 0.0007,
"reward": 3.9333280324935913,
"reward_std": 0.011767172254621983,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9333280622959137,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 349
},
{
"completion_length": 220.1875,
"epoch": 1.1184,
"grad_norm": 0.8690351247787476,
"kl": 0.069580078125,
"learning_rate": 5.637499999999999e-07,
"loss": 0.0007,
"reward": 3.9331865310668945,
"reward_std": 0.008595036342740059,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9419363439083099,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9912500977516174,
"step": 350
},
{
"completion_length": 192.65625,
"epoch": 1.1216,
"grad_norm": 1.7662582397460938,
"kl": 0.076171875,
"learning_rate": 5.625e-07,
"loss": 0.0008,
"reward": 3.950869083404541,
"reward_std": 0.020245986990630627,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.951172411441803,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996966123580933,
"step": 351
},
{
"completion_length": 264.25,
"epoch": 1.1248,
"grad_norm": 6.877583026885986,
"kl": 0.0867919921875,
"learning_rate": 5.6125e-07,
"loss": 0.0009,
"reward": 3.9451229572296143,
"reward_std": 0.017284557223320007,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.946128636598587,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989943504333496,
"step": 352
},
{
"completion_length": 218.4375,
"epoch": 1.1280000000000001,
"grad_norm": 1.853745460510254,
"kl": 0.058837890625,
"learning_rate": 5.6e-07,
"loss": 0.0006,
"reward": 3.9474722146987915,
"reward_std": 0.01703261397778988,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9519364535808563,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 353
},
{
"completion_length": 229.9375,
"epoch": 1.1312,
"grad_norm": 7.013837814331055,
"kl": 0.079345703125,
"learning_rate": 5.587499999999999e-07,
"loss": 0.0008,
"reward": 3.928715705871582,
"reward_std": 0.024107711389660835,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9372670352458954,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9938524663448334,
"step": 354
},
{
"completion_length": 238.09375,
"epoch": 1.1344,
"grad_norm": 1.8181698322296143,
"kl": 0.0587158203125,
"learning_rate": 5.575e-07,
"loss": 0.0006,
"reward": 3.9445427656173706,
"reward_std": 0.028678019531071186,
"rewards/answer_entity_reward": 0.9851190447807312,
"rewards/answer_wer_reward": 0.9630020260810852,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996421754360199,
"step": 355
},
{
"completion_length": 199.46875,
"epoch": 1.1376,
"grad_norm": 17.45456314086914,
"kl": 0.44140625,
"learning_rate": 5.5625e-07,
"loss": 0.0044,
"reward": 3.793405294418335,
"reward_std": 0.09584336914122105,
"rewards/answer_entity_reward": 0.9953208565711975,
"rewards/answer_wer_reward": 0.9546021223068237,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.843482255935669,
"step": 356
},
{
"completion_length": 234.9375,
"epoch": 1.1408,
"grad_norm": 1.5193853378295898,
"kl": 0.056396484375,
"learning_rate": 5.55e-07,
"loss": 0.0006,
"reward": 3.9331583976745605,
"reward_std": 0.01793505996465683,
"rewards/answer_entity_reward": 0.9901185929775238,
"rewards/answer_wer_reward": 0.9450170993804932,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980226159095764,
"step": 357
},
{
"completion_length": 225.21875,
"epoch": 1.144,
"grad_norm": 0.7461761236190796,
"kl": 0.050048828125,
"learning_rate": 5.5375e-07,
"loss": 0.0005,
"reward": 3.9532158374786377,
"reward_std": 0.013632898684591055,
"rewards/answer_entity_reward": 0.9930555522441864,
"rewards/answer_wer_reward": 0.9601602554321289,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 358
},
{
"completion_length": 196.21875,
"epoch": 1.1472,
"grad_norm": 1.688063621520996,
"kl": 0.0589599609375,
"learning_rate": 5.525e-07,
"loss": 0.0006,
"reward": 3.957648277282715,
"reward_std": 0.009953869972378016,
"rewards/answer_entity_reward": 0.9892857074737549,
"rewards/answer_wer_reward": 0.9689917266368866,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993708431720734,
"step": 359
},
{
"completion_length": 230.875,
"epoch": 1.1504,
"grad_norm": 1.0592241287231445,
"kl": 0.057861328125,
"learning_rate": 5.5125e-07,
"loss": 0.0006,
"reward": 3.9605822563171387,
"reward_std": 0.00902467966079712,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.961335301399231,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992469847202301,
"step": 360
},
{
"completion_length": 177.25,
"epoch": 1.1536,
"grad_norm": 0.887911856174469,
"kl": 0.0631103515625,
"learning_rate": 5.5e-07,
"loss": 0.0006,
"reward": 3.9682934284210205,
"reward_std": 0.004935940261930227,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9682934284210205,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 361
},
{
"completion_length": 204.09375,
"epoch": 1.1568,
"grad_norm": 1.4796991348266602,
"kl": 0.0721435546875,
"learning_rate": 5.487499999999999e-07,
"loss": 0.0007,
"reward": 3.967429041862488,
"reward_std": 0.004718436859548092,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.967721164226532,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997079372406006,
"step": 362
},
{
"completion_length": 201.90625,
"epoch": 1.16,
"grad_norm": 1.349228858947754,
"kl": 0.0635986328125,
"learning_rate": 5.474999999999999e-07,
"loss": 0.0006,
"reward": 3.968218684196472,
"reward_std": 0.004579245578497648,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9686298072338104,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999588817358017,
"step": 363
},
{
"completion_length": 222.25,
"epoch": 1.1632,
"grad_norm": 8.183592796325684,
"kl": 0.7177734375,
"learning_rate": 5.4625e-07,
"loss": 0.0072,
"reward": 3.8565011024475098,
"reward_std": 0.14647854026407003,
"rewards/answer_entity_reward": 0.9628739356994629,
"rewards/answer_wer_reward": 0.897028774023056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9965982735157013,
"step": 364
},
{
"completion_length": 203.875,
"epoch": 1.1663999999999999,
"grad_norm": 2.1804592609405518,
"kl": 0.07666015625,
"learning_rate": 5.45e-07,
"loss": 0.0008,
"reward": 3.9330880641937256,
"reward_std": 0.023633791133761406,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9594465494155884,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9808530211448669,
"step": 365
},
{
"completion_length": 187.53125,
"epoch": 1.1696,
"grad_norm": 0.952870786190033,
"kl": 0.068603515625,
"learning_rate": 5.4375e-07,
"loss": 0.0007,
"reward": 3.906123399734497,
"reward_std": 0.02216299483552575,
"rewards/answer_entity_reward": 0.9882478415966034,
"rewards/answer_wer_reward": 0.9373133480548859,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9805622696876526,
"step": 366
},
{
"completion_length": 180.28125,
"epoch": 1.1728,
"grad_norm": 1.6601589918136597,
"kl": 0.069091796875,
"learning_rate": 5.425e-07,
"loss": 0.0007,
"reward": 3.9451587200164795,
"reward_std": 0.01368240499868989,
"rewards/answer_entity_reward": 0.9923513829708099,
"rewards/answer_wer_reward": 0.9530614018440247,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997459352016449,
"step": 367
},
{
"completion_length": 207.5625,
"epoch": 1.176,
"grad_norm": 2.0661466121673584,
"kl": 0.142578125,
"learning_rate": 5.4125e-07,
"loss": 0.0014,
"reward": 3.9405598640441895,
"reward_std": 0.009340570773929358,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9443033933639526,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962564706802368,
"step": 368
},
{
"completion_length": 193.4375,
"epoch": 1.1792,
"grad_norm": 2.3376078605651855,
"kl": 0.0548095703125,
"learning_rate": 5.4e-07,
"loss": 0.0005,
"reward": 3.9724533557891846,
"reward_std": 0.007678399793803692,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9739435911178589,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985099136829376,
"step": 369
},
{
"completion_length": 244.9375,
"epoch": 1.1824,
"grad_norm": 8.994063377380371,
"kl": 0.067138671875,
"learning_rate": 5.387499999999999e-07,
"loss": 0.0007,
"reward": 3.8642784357070923,
"reward_std": 0.015206838492304087,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9453278481960297,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9217914342880249,
"step": 370
},
{
"completion_length": 223.5,
"epoch": 1.1856,
"grad_norm": 0.7140876054763794,
"kl": 0.0628662109375,
"learning_rate": 5.374999999999999e-07,
"loss": 0.0006,
"reward": 3.9566755294799805,
"reward_std": 0.008438330609351397,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9571858644485474,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994895756244659,
"step": 371
},
{
"completion_length": 236.09375,
"epoch": 1.1888,
"grad_norm": 5.422008514404297,
"kl": 0.072021484375,
"learning_rate": 5.3625e-07,
"loss": 0.0007,
"reward": 3.9092832803726196,
"reward_std": 0.02735153865069151,
"rewards/answer_entity_reward": 0.9869465231895447,
"rewards/answer_wer_reward": 0.9258767068386078,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9964599907398224,
"step": 372
},
{
"completion_length": 215.90625,
"epoch": 1.192,
"grad_norm": 2.5449435710906982,
"kl": 0.0655517578125,
"learning_rate": 5.35e-07,
"loss": 0.0007,
"reward": 3.8726375102996826,
"reward_std": 0.15768051333725452,
"rewards/answer_entity_reward": 0.991346150636673,
"rewards/answer_wer_reward": 0.9473030865192413,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9652382135391235,
"step": 373
},
{
"completion_length": 221.09375,
"epoch": 1.1952,
"grad_norm": 1.3450181484222412,
"kl": 0.0499267578125,
"learning_rate": 5.3375e-07,
"loss": 0.0005,
"reward": 3.945889711380005,
"reward_std": 0.021359253441914916,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9733871817588806,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9725023210048676,
"step": 374
},
{
"completion_length": 208.03125,
"epoch": 1.1984,
"grad_norm": 1.1699227094650269,
"kl": 0.067626953125,
"learning_rate": 5.325e-07,
"loss": 0.0007,
"reward": 3.951171040534973,
"reward_std": 0.008666176348924637,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9543131291866302,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992617964744568,
"step": 375
},
{
"completion_length": 253.28125,
"epoch": 1.2016,
"grad_norm": 2.287163496017456,
"kl": 0.0572509765625,
"learning_rate": 5.3125e-07,
"loss": 0.0006,
"reward": 3.9154282808303833,
"reward_std": 0.04354940680786967,
"rewards/answer_entity_reward": 0.9888257682323456,
"rewards/answer_wer_reward": 0.9271413683891296,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994612038135529,
"step": 376
},
{
"completion_length": 187.21875,
"epoch": 1.2048,
"grad_norm": 1.3305357694625854,
"kl": 0.046142578125,
"learning_rate": 5.3e-07,
"loss": 0.0005,
"reward": 3.9359636306762695,
"reward_std": 0.00542741478420794,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9541498124599457,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.981813907623291,
"step": 377
},
{
"completion_length": 224.125,
"epoch": 1.208,
"grad_norm": 10.12941837310791,
"kl": 0.06201171875,
"learning_rate": 5.2875e-07,
"loss": 0.0006,
"reward": 3.9541337490081787,
"reward_std": 0.013694523833692074,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9624313712120056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9917024075984955,
"step": 378
},
{
"completion_length": 158.96875,
"epoch": 1.2112,
"grad_norm": 1.3805967569351196,
"kl": 0.05859375,
"learning_rate": 5.274999999999999e-07,
"loss": 0.0006,
"reward": 3.947017788887024,
"reward_std": 0.02097574481740594,
"rewards/answer_entity_reward": 0.9902146458625793,
"rewards/answer_wer_reward": 0.961486428976059,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9953167736530304,
"step": 379
},
{
"completion_length": 250.40625,
"epoch": 1.2144,
"grad_norm": 1.2120996713638306,
"kl": 0.044921875,
"learning_rate": 5.262499999999999e-07,
"loss": 0.0004,
"reward": 3.918868899345398,
"reward_std": 0.021801823284476995,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9251189529895782,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 380
},
{
"completion_length": 211.34375,
"epoch": 1.2176,
"grad_norm": 2.19063138961792,
"kl": 0.078369140625,
"learning_rate": 5.25e-07,
"loss": 0.0008,
"reward": 3.8982889652252197,
"reward_std": 0.02524574287235737,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9512019455432892,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.947086900472641,
"step": 381
},
{
"completion_length": 241.28125,
"epoch": 1.2208,
"grad_norm": 1.619989275932312,
"kl": 0.05615234375,
"learning_rate": 5.237500000000001e-07,
"loss": 0.0006,
"reward": 3.9471057653427124,
"reward_std": 0.013869246933609247,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9526203572750092,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 382
},
{
"completion_length": 244.875,
"epoch": 1.224,
"grad_norm": 0.8697032928466797,
"kl": 0.061279296875,
"learning_rate": 5.225e-07,
"loss": 0.0006,
"reward": 3.9235615730285645,
"reward_std": 0.015196615364402533,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9275480508804321,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998417317867279,
"step": 383
},
{
"completion_length": 191.875,
"epoch": 1.2272,
"grad_norm": 5.2052154541015625,
"kl": 0.06884765625,
"learning_rate": 5.2125e-07,
"loss": 0.0007,
"reward": 3.934178948402405,
"reward_std": 0.024661258328706026,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9814408719539642,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9527381658554077,
"step": 384
},
{
"completion_length": 218.15625,
"epoch": 1.2304,
"grad_norm": 1.1718415021896362,
"kl": 0.105224609375,
"learning_rate": 5.2e-07,
"loss": 0.0011,
"reward": 3.8538546562194824,
"reward_std": 0.013242242857813835,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9431050419807434,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9135904610157013,
"step": 385
},
{
"completion_length": 167.59375,
"epoch": 1.2336,
"grad_norm": 1.8933672904968262,
"kl": 0.0555419921875,
"learning_rate": 5.1875e-07,
"loss": 0.0006,
"reward": 3.942023754119873,
"reward_std": 0.04039308475330472,
"rewards/answer_entity_reward": 0.9895833432674408,
"rewards/answer_wer_reward": 0.9561411142349243,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9962993562221527,
"step": 386
},
{
"completion_length": 181.1875,
"epoch": 1.2368000000000001,
"grad_norm": 1.132387399673462,
"kl": 0.134033203125,
"learning_rate": 5.174999999999999e-07,
"loss": 0.0013,
"reward": 3.883729100227356,
"reward_std": 0.006107622524723411,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9661928117275238,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9175363183021545,
"step": 387
},
{
"completion_length": 245.78125,
"epoch": 1.24,
"grad_norm": 1.5286246538162231,
"kl": 0.0439453125,
"learning_rate": 5.162499999999999e-07,
"loss": 0.0004,
"reward": 3.9444308280944824,
"reward_std": 0.017588268965482712,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.951177716255188,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.993253082036972,
"step": 388
},
{
"completion_length": 214.5,
"epoch": 1.2432,
"grad_norm": 4.535660266876221,
"kl": 0.4443359375,
"learning_rate": 5.149999999999999e-07,
"loss": 0.0045,
"reward": 3.9712672233581543,
"reward_std": 0.017703328281641006,
"rewards/answer_entity_reward": 0.9923513829708099,
"rewards/answer_wer_reward": 0.9789157509803772,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 389
},
{
"completion_length": 237.71875,
"epoch": 1.2464,
"grad_norm": 1.100642204284668,
"kl": 0.0443115234375,
"learning_rate": 5.137500000000001e-07,
"loss": 0.0004,
"reward": 3.9504618644714355,
"reward_std": 0.01717091863974929,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9553267061710358,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995993673801422,
"step": 390
},
{
"completion_length": 220.8125,
"epoch": 1.2496,
"grad_norm": 1.8153222799301147,
"kl": 0.050537109375,
"learning_rate": 5.125e-07,
"loss": 0.0005,
"reward": 3.954966902732849,
"reward_std": 0.023467861115932465,
"rewards/answer_entity_reward": 0.9909090995788574,
"rewards/answer_wer_reward": 0.9640579223632812,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 391
},
{
"completion_length": 215.75,
"epoch": 1.2528000000000001,
"grad_norm": 1.3607189655303955,
"kl": 0.0562744140625,
"learning_rate": 5.1125e-07,
"loss": 0.0006,
"reward": 3.947434425354004,
"reward_std": 0.01746128685772419,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9514667093753815,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9959677457809448,
"step": 392
},
{
"completion_length": 140.75,
"epoch": 1.256,
"grad_norm": 3.343885898590088,
"kl": 0.064208984375,
"learning_rate": 5.1e-07,
"loss": 0.0006,
"reward": 3.9535528421401978,
"reward_std": 0.016743881278671324,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9615642726421356,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9948294758796692,
"step": 393
},
{
"completion_length": 225.09375,
"epoch": 1.2591999999999999,
"grad_norm": 7.593709468841553,
"kl": 0.0628662109375,
"learning_rate": 5.0875e-07,
"loss": 0.0006,
"reward": 3.9337310791015625,
"reward_std": 0.01689326297491789,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9342745840549469,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999456524848938,
"step": 394
},
{
"completion_length": 195.15625,
"epoch": 1.2624,
"grad_norm": 1.6891230344772339,
"kl": 0.085693359375,
"learning_rate": 5.074999999999999e-07,
"loss": 0.0009,
"reward": 3.836549401283264,
"reward_std": 0.005918985931202769,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8378467857837677,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987025856971741,
"step": 395
},
{
"completion_length": 218.71875,
"epoch": 1.2656,
"grad_norm": 2.0911483764648438,
"kl": 0.057373046875,
"learning_rate": 5.062499999999999e-07,
"loss": 0.0006,
"reward": 3.930617570877075,
"reward_std": 0.014833949506282806,
"rewards/answer_entity_reward": 0.9881944358348846,
"rewards/answer_wer_reward": 0.9436545968055725,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987684786319733,
"step": 396
},
{
"completion_length": 244.4375,
"epoch": 1.2688,
"grad_norm": 0.6879564523696899,
"kl": 0.05810546875,
"learning_rate": 5.049999999999999e-07,
"loss": 0.0006,
"reward": 3.9541516304016113,
"reward_std": 0.014136601239442825,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9578942954540253,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983407258987427,
"step": 397
},
{
"completion_length": 171.875,
"epoch": 1.272,
"grad_norm": 1.0838266611099243,
"kl": 0.063232421875,
"learning_rate": 5.0375e-07,
"loss": 0.0006,
"reward": 3.961939811706543,
"reward_std": 0.007458951906301081,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9619399607181549,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 398
},
{
"completion_length": 224.53125,
"epoch": 1.2752,
"grad_norm": 2.0163495540618896,
"kl": 0.072265625,
"learning_rate": 5.025e-07,
"loss": 0.0007,
"reward": 3.964465856552124,
"reward_std": 0.014243231620639563,
"rewards/answer_entity_reward": 0.9957579076290131,
"rewards/answer_wer_reward": 0.9695450067520142,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991629421710968,
"step": 399
},
{
"completion_length": 181.15625,
"epoch": 1.2784,
"grad_norm": 0.38955262303352356,
"kl": 0.0517578125,
"learning_rate": 5.0125e-07,
"loss": 0.0005,
"reward": 3.9557042121887207,
"reward_std": 0.005372793646529317,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9557042419910431,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 400
},
{
"completion_length": 208.3125,
"epoch": 1.2816,
"grad_norm": 3.9781861305236816,
"kl": 0.0716552734375,
"learning_rate": 5e-07,
"loss": 0.0007,
"reward": 3.8667571544647217,
"reward_std": 0.015388892497867346,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9693593382835388,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8998015820980072,
"step": 401
},
{
"completion_length": 204.375,
"epoch": 1.2848,
"grad_norm": 1.1456544399261475,
"kl": 0.103515625,
"learning_rate": 4.9875e-07,
"loss": 0.001,
"reward": 3.956982374191284,
"reward_std": 0.007417811662890017,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9575175940990448,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994648098945618,
"step": 402
},
{
"completion_length": 216.1875,
"epoch": 1.288,
"grad_norm": 1.1664754152297974,
"kl": 0.06396484375,
"learning_rate": 4.975e-07,
"loss": 0.0006,
"reward": 3.8699432611465454,
"reward_std": 0.02020346373319626,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9359997510910034,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.936026930809021,
"step": 403
},
{
"completion_length": 253.09375,
"epoch": 1.2912,
"grad_norm": 0.8103052377700806,
"kl": 0.0635986328125,
"learning_rate": 4.9625e-07,
"loss": 0.0006,
"reward": 3.937591075897217,
"reward_std": 0.018769525457173586,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9415221214294434,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989098608493805,
"step": 404
},
{
"completion_length": 215.0625,
"epoch": 1.2944,
"grad_norm": 1.4777588844299316,
"kl": 0.068603515625,
"learning_rate": 4.95e-07,
"loss": 0.0007,
"reward": 3.949555516242981,
"reward_std": 0.009917980059981346,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.949555516242981,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 405
},
{
"completion_length": 202.65625,
"epoch": 1.2976,
"grad_norm": 0.7443984150886536,
"kl": 0.106689453125,
"learning_rate": 4.9375e-07,
"loss": 0.0011,
"reward": 3.7686209678649902,
"reward_std": 0.011178261134773493,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9451543390750885,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8234666287899017,
"step": 406
},
{
"completion_length": 189.21875,
"epoch": 1.3008,
"grad_norm": 0.9547207951545715,
"kl": 0.077392578125,
"learning_rate": 4.924999999999999e-07,
"loss": 0.0008,
"reward": 3.9593130350112915,
"reward_std": 0.006907296134158969,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9597530961036682,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995598495006561,
"step": 407
},
{
"completion_length": 208.09375,
"epoch": 1.304,
"grad_norm": 0.8897162079811096,
"kl": 0.0604248046875,
"learning_rate": 4.9125e-07,
"loss": 0.0006,
"reward": 3.9529693126678467,
"reward_std": 0.0038969104643911123,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9714880287647247,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9814814925193787,
"step": 408
},
{
"completion_length": 199.78125,
"epoch": 1.3072,
"grad_norm": 1.1945850849151611,
"kl": 0.056640625,
"learning_rate": 4.9e-07,
"loss": 0.0006,
"reward": 3.951330065727234,
"reward_std": 0.0060545760206878185,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9513299763202667,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 409
},
{
"completion_length": 176.5,
"epoch": 1.3104,
"grad_norm": 1.5717577934265137,
"kl": 0.085205078125,
"learning_rate": 4.8875e-07,
"loss": 0.0009,
"reward": 3.9731186628341675,
"reward_std": 0.009643410099670291,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9749214053153992,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9981971085071564,
"step": 410
},
{
"completion_length": 209.25,
"epoch": 1.3136,
"grad_norm": 1.7357205152511597,
"kl": 0.05517578125,
"learning_rate": 4.875e-07,
"loss": 0.0006,
"reward": 3.9563956260681152,
"reward_std": 0.013218061067163944,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9563955068588257,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 411
},
{
"completion_length": 233.28125,
"epoch": 1.3168,
"grad_norm": 3.6717629432678223,
"kl": 0.070068359375,
"learning_rate": 4.8625e-07,
"loss": 0.0007,
"reward": 3.955284357070923,
"reward_std": 0.02536593284457922,
"rewards/answer_entity_reward": 0.9871794581413269,
"rewards/answer_wer_reward": 0.968104898929596,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 412
},
{
"completion_length": 205.125,
"epoch": 1.32,
"grad_norm": 1.0453362464904785,
"kl": 0.04473876953125,
"learning_rate": 4.85e-07,
"loss": 0.0005,
"reward": 3.9507482051849365,
"reward_std": 0.005348393111489713,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9646830558776855,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9860649704933167,
"step": 413
},
{
"completion_length": 197.71875,
"epoch": 1.3232,
"grad_norm": 10.967116355895996,
"kl": 0.4443359375,
"learning_rate": 4.8375e-07,
"loss": 0.0044,
"reward": 3.958775758743286,
"reward_std": 0.01469768793322146,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9608590006828308,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 414
},
{
"completion_length": 240.75,
"epoch": 1.3264,
"grad_norm": 1.771857738494873,
"kl": 0.056884765625,
"learning_rate": 4.824999999999999e-07,
"loss": 0.0006,
"reward": 3.9307100772857666,
"reward_std": 0.01262786379083991,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9445989429950714,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 415
},
{
"completion_length": 184.9375,
"epoch": 1.3296000000000001,
"grad_norm": 0.5742409825325012,
"kl": 0.081787109375,
"learning_rate": 4.812499999999999e-07,
"loss": 0.0008,
"reward": 3.965754270553589,
"reward_std": 0.003614649409428239,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9657542705535889,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 416
},
{
"completion_length": 173.90625,
"epoch": 1.3328,
"grad_norm": 1.4033151865005493,
"kl": 0.074462890625,
"learning_rate": 4.8e-07,
"loss": 0.0007,
"reward": 3.9543731212615967,
"reward_std": 0.006403392762877047,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9728915691375732,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9814814925193787,
"step": 417
},
{
"completion_length": 224.0625,
"epoch": 1.336,
"grad_norm": 1.0427494049072266,
"kl": 0.0576171875,
"learning_rate": 4.7875e-07,
"loss": 0.0006,
"reward": 3.965309262275696,
"reward_std": 0.011804148089140654,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9667502641677856,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985590577125549,
"step": 418
},
{
"completion_length": 228.53125,
"epoch": 1.3392,
"grad_norm": 1.1613246202468872,
"kl": 0.06591796875,
"learning_rate": 4.775e-07,
"loss": 0.0007,
"reward": 3.948023200035095,
"reward_std": 0.012544674333184958,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9482711553573608,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997519850730896,
"step": 419
},
{
"completion_length": 197.34375,
"epoch": 1.3424,
"grad_norm": 0.8760451674461365,
"kl": 0.072265625,
"learning_rate": 4.7625e-07,
"loss": 0.0007,
"reward": 3.938261866569519,
"reward_std": 0.004269103752449155,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9496253132820129,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 420
},
{
"completion_length": 225.5,
"epoch": 1.3456000000000001,
"grad_norm": 2.4799275398254395,
"kl": 0.1290283203125,
"learning_rate": 4.7499999999999995e-07,
"loss": 0.0013,
"reward": 3.9379055500030518,
"reward_std": 0.008256121072918177,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9677021205425262,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9702034592628479,
"step": 421
},
{
"completion_length": 209.3125,
"epoch": 1.3488,
"grad_norm": 0.6864319443702698,
"kl": 0.0604248046875,
"learning_rate": 4.7374999999999996e-07,
"loss": 0.0006,
"reward": 3.9712308645248413,
"reward_std": 0.0032088530133478343,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9722216725349426,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990091919898987,
"step": 422
},
{
"completion_length": 187.5625,
"epoch": 1.3519999999999999,
"grad_norm": 1.9412598609924316,
"kl": 0.06787109375,
"learning_rate": 4.725e-07,
"loss": 0.0007,
"reward": 3.947052240371704,
"reward_std": 0.014190569054335356,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9569187760353088,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9925373196601868,
"step": 423
},
{
"completion_length": 225.59375,
"epoch": 1.3552,
"grad_norm": 1.4452259540557861,
"kl": 0.09619140625,
"learning_rate": 4.7125e-07,
"loss": 0.001,
"reward": 3.939266562461853,
"reward_std": 0.012853712774813175,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9556125402450562,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9860578179359436,
"step": 424
},
{
"completion_length": 261.0,
"epoch": 1.3584,
"grad_norm": 0.9420474171638489,
"kl": 0.054931640625,
"learning_rate": 4.6999999999999995e-07,
"loss": 0.0006,
"reward": 3.939144253730774,
"reward_std": 0.00785708031617105,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9474774897098541,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 425
},
{
"completion_length": 243.1875,
"epoch": 1.3616,
"grad_norm": 1.1776657104492188,
"kl": 0.078369140625,
"learning_rate": 4.6874999999999996e-07,
"loss": 0.0008,
"reward": 3.928247570991516,
"reward_std": 0.02044426929205656,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9401307106018066,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9929245114326477,
"step": 426
},
{
"completion_length": 204.4375,
"epoch": 1.3648,
"grad_norm": 1.6268881559371948,
"kl": 0.073974609375,
"learning_rate": 4.675e-07,
"loss": 0.0007,
"reward": 3.9266600608825684,
"reward_std": 0.006853222264908254,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9440751671791077,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9825847446918488,
"step": 427
},
{
"completion_length": 232.0625,
"epoch": 1.3679999999999999,
"grad_norm": 34.5067138671875,
"kl": 0.755859375,
"learning_rate": 4.6625e-07,
"loss": 0.0076,
"reward": 3.844196319580078,
"reward_std": 0.04641831433400512,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9399954378604889,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9042008221149445,
"step": 428
},
{
"completion_length": 253.21875,
"epoch": 1.3712,
"grad_norm": 1.4444057941436768,
"kl": 0.0673828125,
"learning_rate": 4.65e-07,
"loss": 0.0007,
"reward": 3.963658928871155,
"reward_std": 0.009957378264516592,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9636587798595428,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 429
},
{
"completion_length": 241.875,
"epoch": 1.3744,
"grad_norm": 0.9258720278739929,
"kl": 0.0687255859375,
"learning_rate": 4.6374999999999995e-07,
"loss": 0.0007,
"reward": 3.9617748260498047,
"reward_std": 0.013449362479150295,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9652469456195831,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 430
},
{
"completion_length": 204.96875,
"epoch": 1.3776,
"grad_norm": 1.6328847408294678,
"kl": 0.0863037109375,
"learning_rate": 4.625e-07,
"loss": 0.0009,
"reward": 3.8922348022460938,
"reward_std": 0.007920752046629786,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9477903544902802,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9444444477558136,
"step": 431
},
{
"completion_length": 222.375,
"epoch": 1.3808,
"grad_norm": 2.479295492172241,
"kl": 0.0732421875,
"learning_rate": 4.6125e-07,
"loss": 0.0007,
"reward": 3.9312403202056885,
"reward_std": 0.02260798867791891,
"rewards/answer_entity_reward": 0.9941239356994629,
"rewards/answer_wer_reward": 0.937116414308548,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 432
},
{
"completion_length": 203.28125,
"epoch": 1.384,
"grad_norm": 2.6669020652770996,
"kl": 0.0631103515625,
"learning_rate": 4.6e-07,
"loss": 0.0006,
"reward": 3.938199043273926,
"reward_std": 0.014480275101959705,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9408722817897797,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997305870056152,
"step": 433
},
{
"completion_length": 255.1875,
"epoch": 1.3872,
"grad_norm": 1.4742846488952637,
"kl": 0.057373046875,
"learning_rate": 4.5874999999999995e-07,
"loss": 0.0006,
"reward": 3.9382212162017822,
"reward_std": 0.01696724910289049,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.94236820936203,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9982567131519318,
"step": 434
},
{
"completion_length": 211.15625,
"epoch": 1.3904,
"grad_norm": 1.795336365699768,
"kl": 0.0667724609375,
"learning_rate": 4.575e-07,
"loss": 0.0007,
"reward": 3.919999361038208,
"reward_std": 0.028288409113883972,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9725300371646881,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9474693238735199,
"step": 435
},
{
"completion_length": 208.65625,
"epoch": 1.3936,
"grad_norm": 2.1704065799713135,
"kl": 0.095947265625,
"learning_rate": 4.5624999999999997e-07,
"loss": 0.001,
"reward": 3.857280731201172,
"reward_std": 0.2144411588087678,
"rewards/answer_entity_reward": 0.9618055820465088,
"rewards/answer_wer_reward": 0.949828714132309,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9768964946269989,
"step": 436
},
{
"completion_length": 194.9375,
"epoch": 1.3968,
"grad_norm": 3.8814220428466797,
"kl": 0.082275390625,
"learning_rate": 4.55e-07,
"loss": 0.0008,
"reward": 3.941987633705139,
"reward_std": 0.015088737476617098,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.94545978307724,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 437
},
{
"completion_length": 217.5,
"epoch": 1.4,
"grad_norm": 1.3024876117706299,
"kl": 0.0389404296875,
"learning_rate": 4.5374999999999994e-07,
"loss": 0.0004,
"reward": 3.950901508331299,
"reward_std": 0.008365771966055036,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9589883685112,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9919130802154541,
"step": 438
},
{
"completion_length": 159.03125,
"epoch": 1.4032,
"grad_norm": 0.272270530462265,
"kl": 0.0396728515625,
"learning_rate": 4.525e-07,
"loss": 0.0004,
"reward": 3.9221452474594116,
"reward_std": 0.0014547138416673988,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9875754117965698,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9429032206535339,
"step": 439
},
{
"completion_length": 200.28125,
"epoch": 1.4064,
"grad_norm": 5.4578399658203125,
"kl": 0.0828857421875,
"learning_rate": 4.5124999999999997e-07,
"loss": 0.0008,
"reward": 3.9259976148605347,
"reward_std": 0.014895747415721416,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9536634683609009,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9758064448833466,
"step": 440
},
{
"completion_length": 229.1875,
"epoch": 1.4096,
"grad_norm": 0.6568198800086975,
"kl": 0.067138671875,
"learning_rate": 4.5e-07,
"loss": 0.0007,
"reward": 3.9455034732818604,
"reward_std": 0.011267438880167902,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9479073286056519,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 441
},
{
"completion_length": 199.3125,
"epoch": 1.4128,
"grad_norm": 1.0056089162826538,
"kl": 0.0567626953125,
"learning_rate": 4.4874999999999994e-07,
"loss": 0.0006,
"reward": 3.9622955322265625,
"reward_std": 0.008431105175986886,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9622955024242401,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 442
},
{
"completion_length": 212.375,
"epoch": 1.416,
"grad_norm": 0.7950085997581482,
"kl": 0.051025390625,
"learning_rate": 4.475e-07,
"loss": 0.0005,
"reward": 3.9517738819122314,
"reward_std": 0.03710572328418493,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9708344638347626,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9809393286705017,
"step": 443
},
{
"completion_length": 227.71875,
"epoch": 1.4192,
"grad_norm": 0.8971355557441711,
"kl": 0.0460205078125,
"learning_rate": 4.4624999999999996e-07,
"loss": 0.0005,
"reward": 3.980188012123108,
"reward_std": 0.00624943315051496,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9801879525184631,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 444
},
{
"completion_length": 226.78125,
"epoch": 1.4224,
"grad_norm": 2.114032745361328,
"kl": 0.0791015625,
"learning_rate": 4.45e-07,
"loss": 0.0008,
"reward": 3.879195213317871,
"reward_std": 0.03936337144114077,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9502902626991272,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9307432472705841,
"step": 445
},
{
"completion_length": 227.875,
"epoch": 1.4256,
"grad_norm": 1.0065126419067383,
"kl": 0.083984375,
"learning_rate": 4.4374999999999993e-07,
"loss": 0.0009,
"reward": 3.939829707145691,
"reward_std": 0.013783617876470089,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9398296475410461,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 446
},
{
"completion_length": 202.6875,
"epoch": 1.4288,
"grad_norm": 1.7568168640136719,
"kl": 0.0418701171875,
"learning_rate": 4.425e-07,
"loss": 0.0004,
"reward": 3.943518042564392,
"reward_std": 0.016201740596443415,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9520406126976013,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 447
},
{
"completion_length": 172.8125,
"epoch": 1.432,
"grad_norm": 1.0688170194625854,
"kl": 0.0494384765625,
"learning_rate": 4.4124999999999996e-07,
"loss": 0.0005,
"reward": 3.7196162939071655,
"reward_std": 0.006592530757188797,
"rewards/answer_entity_reward": 0.8677884340286255,
"rewards/answer_wer_reward": 0.8742637634277344,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9775640964508057,
"step": 448
},
{
"completion_length": 168.59375,
"epoch": 1.4352,
"grad_norm": 1.7712996006011963,
"kl": 0.0435791015625,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0004,
"reward": 3.8386131525039673,
"reward_std": 0.011066187638789415,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8386130630970001,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 449
},
{
"completion_length": 197.40625,
"epoch": 1.4384000000000001,
"grad_norm": 0.8872710466384888,
"kl": 0.058349609375,
"learning_rate": 4.3874999999999993e-07,
"loss": 0.0006,
"reward": 3.7988067865371704,
"reward_std": 0.03104257071390748,
"rewards/answer_entity_reward": 0.9734432399272919,
"rewards/answer_wer_reward": 0.8270545899868011,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983089566230774,
"step": 450
},
{
"completion_length": 178.40625,
"epoch": 1.4416,
"grad_norm": 6.044506072998047,
"kl": 0.0657958984375,
"learning_rate": 4.375e-07,
"loss": 0.0007,
"reward": 3.9419833421707153,
"reward_std": 0.021156481467187405,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9676234424114227,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9778319895267487,
"step": 451
},
{
"completion_length": 201.8125,
"epoch": 1.4447999999999999,
"grad_norm": 0.7943681478500366,
"kl": 0.0511474609375,
"learning_rate": 4.3625e-07,
"loss": 0.0005,
"reward": 3.956661581993103,
"reward_std": 0.007463611662387848,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9675310552120209,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.989130437374115,
"step": 452
},
{
"completion_length": 219.03125,
"epoch": 1.448,
"grad_norm": 1.069403052330017,
"kl": 0.0570068359375,
"learning_rate": 4.3499999999999996e-07,
"loss": 0.0006,
"reward": 3.9562065601348877,
"reward_std": 0.011006501503288746,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9564736187458038,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997329115867615,
"step": 453
},
{
"completion_length": 206.8125,
"epoch": 1.4512,
"grad_norm": 1.0987451076507568,
"kl": 0.0611572265625,
"learning_rate": 4.3375000000000003e-07,
"loss": 0.0006,
"reward": 3.9423000812530518,
"reward_std": 0.01284673297777772,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9693345129489899,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9758064448833466,
"step": 454
},
{
"completion_length": 211.375,
"epoch": 1.4544000000000001,
"grad_norm": 3.5896220207214355,
"kl": 0.065673828125,
"learning_rate": 4.325e-07,
"loss": 0.0007,
"reward": 3.961179494857788,
"reward_std": 0.012267218437045813,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9640858769416809,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99709352850914,
"step": 455
},
{
"completion_length": 238.8125,
"epoch": 1.4576,
"grad_norm": 0.625076174736023,
"kl": 0.0399169921875,
"learning_rate": 4.3125e-07,
"loss": 0.0004,
"reward": 3.9661307334899902,
"reward_std": 0.013454007916152477,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9702973961830139,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 456
},
{
"completion_length": 206.9375,
"epoch": 1.4607999999999999,
"grad_norm": 0.6369054317474365,
"kl": 0.059814453125,
"learning_rate": 4.2999999999999996e-07,
"loss": 0.0006,
"reward": 3.9704521894454956,
"reward_std": 0.006653362594079226,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9733729660511017,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970792829990387,
"step": 457
},
{
"completion_length": 199.625,
"epoch": 1.464,
"grad_norm": 1.2201271057128906,
"kl": 0.083251953125,
"learning_rate": 4.2875e-07,
"loss": 0.0008,
"reward": 3.967539429664612,
"reward_std": 0.012669337913393974,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9747509360313416,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 458
},
{
"completion_length": 220.0,
"epoch": 1.4672,
"grad_norm": 11.574130058288574,
"kl": 0.2125244140625,
"learning_rate": 4.275e-07,
"loss": 0.0021,
"reward": 3.9735381603240967,
"reward_std": 0.0033322512172162533,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9737901091575623,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997479915618896,
"step": 459
},
{
"completion_length": 181.0625,
"epoch": 1.4704,
"grad_norm": 1.050900936126709,
"kl": 0.0736083984375,
"learning_rate": 4.2625e-07,
"loss": 0.0007,
"reward": 3.9467893838882446,
"reward_std": 0.00827464903704822,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9717220067977905,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9750673770904541,
"step": 460
},
{
"completion_length": 207.4375,
"epoch": 1.4736,
"grad_norm": 1.25560462474823,
"kl": 0.07861328125,
"learning_rate": 4.2499999999999995e-07,
"loss": 0.0008,
"reward": 3.885838508605957,
"reward_std": 0.012714273296296597,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9541967213153839,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9316417276859283,
"step": 461
},
{
"completion_length": 205.125,
"epoch": 1.4768,
"grad_norm": 2.1235697269439697,
"kl": 0.064208984375,
"learning_rate": 4.2375e-07,
"loss": 0.0006,
"reward": 3.952380895614624,
"reward_std": 0.013835938647389412,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9538231492042542,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985576868057251,
"step": 462
},
{
"completion_length": 229.25,
"epoch": 1.48,
"grad_norm": 3.838672399520874,
"kl": 0.09619140625,
"learning_rate": 4.225e-07,
"loss": 0.001,
"reward": 3.9537363052368164,
"reward_std": 0.014287983998656273,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9542993903160095,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994369447231293,
"step": 463
},
{
"completion_length": 224.71875,
"epoch": 1.4832,
"grad_norm": 0.7103460431098938,
"kl": 0.058837890625,
"learning_rate": 4.2125e-07,
"loss": 0.0006,
"reward": 3.9675354957580566,
"reward_std": 0.013558031525462866,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9719286262989044,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997735619544983,
"step": 464
},
{
"completion_length": 147.625,
"epoch": 1.4864,
"grad_norm": 2.865051031112671,
"kl": 0.099853515625,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.001,
"reward": 3.958040475845337,
"reward_std": 0.00422883324790746,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9780724942684174,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9799679517745972,
"step": 465
},
{
"completion_length": 250.625,
"epoch": 1.4896,
"grad_norm": 1.115330696105957,
"kl": 0.062744140625,
"learning_rate": 4.1875e-07,
"loss": 0.0006,
"reward": 3.925747871398926,
"reward_std": 0.01510471198707819,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9361644089221954,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 466
},
{
"completion_length": 181.28125,
"epoch": 1.4928,
"grad_norm": 0.8615334033966064,
"kl": 0.095703125,
"learning_rate": 4.1749999999999997e-07,
"loss": 0.001,
"reward": 3.9389272928237915,
"reward_std": 0.009215079713612795,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.947648286819458,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9912790656089783,
"step": 467
},
{
"completion_length": 201.1875,
"epoch": 1.496,
"grad_norm": 0.8399393558502197,
"kl": 0.067138671875,
"learning_rate": 4.1625e-07,
"loss": 0.0007,
"reward": 3.9645369052886963,
"reward_std": 0.005296911578625441,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9660760462284088,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984607994556427,
"step": 468
},
{
"completion_length": 181.53125,
"epoch": 1.4992,
"grad_norm": 1.692581057548523,
"kl": 0.116455078125,
"learning_rate": 4.1499999999999994e-07,
"loss": 0.0012,
"reward": 3.91774320602417,
"reward_std": 0.007862454745918512,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9589084982872009,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.958834707736969,
"step": 469
},
{
"completion_length": 208.375,
"epoch": 1.5024,
"grad_norm": 1.0280638933181763,
"kl": 0.0733642578125,
"learning_rate": 4.1375e-07,
"loss": 0.0007,
"reward": 3.963421940803528,
"reward_std": 0.010574808926321566,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9634219110012054,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 470
},
{
"completion_length": 194.375,
"epoch": 1.5056,
"grad_norm": 0.9556618332862854,
"kl": 0.04541015625,
"learning_rate": 4.1249999999999997e-07,
"loss": 0.0005,
"reward": 3.9483964443206787,
"reward_std": 0.0071337176486849785,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9483965635299683,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 471
},
{
"completion_length": 219.90625,
"epoch": 1.5088,
"grad_norm": 8.583925247192383,
"kl": 0.057373046875,
"learning_rate": 4.1125e-07,
"loss": 0.0006,
"reward": 3.9298593997955322,
"reward_std": 0.010127428220584989,
"rewards/answer_entity_reward": 0.9764957129955292,
"rewards/answer_wer_reward": 0.9549680352210999,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983957409858704,
"step": 472
},
{
"completion_length": 169.71875,
"epoch": 1.512,
"grad_norm": 1.0506740808486938,
"kl": 0.0703125,
"learning_rate": 4.0999999999999994e-07,
"loss": 0.0007,
"reward": 3.9712518453598022,
"reward_std": 0.004299861378967762,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9712517857551575,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 473
},
{
"completion_length": 254.0,
"epoch": 1.5152,
"grad_norm": 1.2391588687896729,
"kl": 0.055419921875,
"learning_rate": 4.0875e-07,
"loss": 0.0006,
"reward": 3.9443717002868652,
"reward_std": 0.007719833869487047,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9459867179393768,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983848929405212,
"step": 474
},
{
"completion_length": 173.15625,
"epoch": 1.5184,
"grad_norm": 21.967166900634766,
"kl": 0.0810546875,
"learning_rate": 4.0749999999999996e-07,
"loss": 0.0008,
"reward": 3.892626404762268,
"reward_std": 0.03193977475166321,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9627694487571716,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9372097849845886,
"step": 475
},
{
"completion_length": 177.96875,
"epoch": 1.5215999999999998,
"grad_norm": 2.125126838684082,
"kl": 0.0814208984375,
"learning_rate": 4.0625e-07,
"loss": 0.0008,
"reward": 3.957445502281189,
"reward_std": 0.016827338375151157,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9618943929672241,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990234375,
"step": 476
},
{
"completion_length": 259.34375,
"epoch": 1.5248,
"grad_norm": 1.144234538078308,
"kl": 0.0545654296875,
"learning_rate": 4.05e-07,
"loss": 0.0005,
"reward": 3.9333302974700928,
"reward_std": 0.015490441583096981,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9336776435375214,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999652773141861,
"step": 477
},
{
"completion_length": 223.84375,
"epoch": 1.528,
"grad_norm": 0.8379483222961426,
"kl": 0.0653076171875,
"learning_rate": 4.0375e-07,
"loss": 0.0007,
"reward": 3.9397594928741455,
"reward_std": 0.006189712788909674,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9652985334396362,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.981813907623291,
"step": 478
},
{
"completion_length": 195.15625,
"epoch": 1.5312000000000001,
"grad_norm": 1.9627622365951538,
"kl": 0.0709228515625,
"learning_rate": 4.025e-07,
"loss": 0.0007,
"reward": 3.90268337726593,
"reward_std": 0.022933244705200195,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9422430694103241,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9632812440395355,
"step": 479
},
{
"completion_length": 212.03125,
"epoch": 1.5344,
"grad_norm": 1.4353668689727783,
"kl": 0.0572509765625,
"learning_rate": 4.0124999999999997e-07,
"loss": 0.0006,
"reward": 3.955712080001831,
"reward_std": 0.004905138397589326,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9653275012969971,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 480
},
{
"completion_length": 238.125,
"epoch": 1.5375999999999999,
"grad_norm": 0.9400500059127808,
"kl": 0.0516357421875,
"learning_rate": 4e-07,
"loss": 0.0005,
"reward": 3.9561740159988403,
"reward_std": 0.004761199816130102,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9657893478870392,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 481
},
{
"completion_length": 197.125,
"epoch": 1.5408,
"grad_norm": 1.7909142971038818,
"kl": 0.044677734375,
"learning_rate": 3.9875e-07,
"loss": 0.0004,
"reward": 3.9649877548217773,
"reward_std": 0.008824507240206003,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9712709188461304,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.993716835975647,
"step": 482
},
{
"completion_length": 247.28125,
"epoch": 1.544,
"grad_norm": 1.305432915687561,
"kl": 0.0885009765625,
"learning_rate": 3.975e-07,
"loss": 0.0009,
"reward": 3.9271016120910645,
"reward_std": 0.010741112288087606,
"rewards/answer_entity_reward": 0.9867424368858337,
"rewards/answer_wer_reward": 0.9422920942306519,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980670213699341,
"step": 483
},
{
"completion_length": 183.71875,
"epoch": 1.5472000000000001,
"grad_norm": 1.2143511772155762,
"kl": 0.083251953125,
"learning_rate": 3.9624999999999996e-07,
"loss": 0.0008,
"reward": 3.961517810821533,
"reward_std": 0.015109732514247298,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9659819006919861,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 484
},
{
"completion_length": 190.96875,
"epoch": 1.5504,
"grad_norm": 1.3901034593582153,
"kl": 0.0478515625,
"learning_rate": 3.95e-07,
"loss": 0.0005,
"reward": 3.9620405435562134,
"reward_std": 0.007438812637701631,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.962040513753891,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 485
},
{
"completion_length": 236.71875,
"epoch": 1.5535999999999999,
"grad_norm": 1.005139946937561,
"kl": 0.064697265625,
"learning_rate": 3.9375e-07,
"loss": 0.0007,
"reward": 3.9681735038757324,
"reward_std": 0.007598390802741051,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9703975021839142,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996141791343689,
"step": 486
},
{
"completion_length": 167.71875,
"epoch": 1.5568,
"grad_norm": 14.769695281982422,
"kl": 0.088623046875,
"learning_rate": 3.925e-07,
"loss": 0.0009,
"reward": 3.9402579069137573,
"reward_std": 0.01711948262527585,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9504852592945099,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9897727370262146,
"step": 487
},
{
"completion_length": 245.59375,
"epoch": 1.56,
"grad_norm": 2.1311302185058594,
"kl": 0.0643310546875,
"learning_rate": 3.9124999999999996e-07,
"loss": 0.0006,
"reward": 3.965644121170044,
"reward_std": 0.006802293471992016,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9664610624313354,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991829991340637,
"step": 488
},
{
"completion_length": 228.90625,
"epoch": 1.5632000000000001,
"grad_norm": 2.194638967514038,
"kl": 0.07861328125,
"learning_rate": 3.8999999999999997e-07,
"loss": 0.0008,
"reward": 3.940732479095459,
"reward_std": 0.00845141801983118,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9496362805366516,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994294047355652,
"step": 489
},
{
"completion_length": 229.09375,
"epoch": 1.5664,
"grad_norm": 1.4338947534561157,
"kl": 0.067138671875,
"learning_rate": 3.8875e-07,
"loss": 0.0007,
"reward": 3.974826216697693,
"reward_std": 0.008368036011233926,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9759277105331421,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988985061645508,
"step": 490
},
{
"completion_length": 147.1875,
"epoch": 1.5695999999999999,
"grad_norm": 0.9500789046287537,
"kl": 0.055908203125,
"learning_rate": 3.875e-07,
"loss": 0.0006,
"reward": 3.900749683380127,
"reward_std": 0.004976645112037659,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.981389045715332,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9307242631912231,
"step": 491
},
{
"completion_length": 207.1875,
"epoch": 1.5728,
"grad_norm": 18.29888916015625,
"kl": 0.0787353515625,
"learning_rate": 3.8624999999999995e-07,
"loss": 0.0008,
"reward": 3.9231996536254883,
"reward_std": 0.01712162047624588,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9278469979763031,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990289807319641,
"step": 492
},
{
"completion_length": 215.3125,
"epoch": 1.576,
"grad_norm": 2.524644613265991,
"kl": 0.0682373046875,
"learning_rate": 3.8499999999999997e-07,
"loss": 0.0007,
"reward": 3.9182220697402954,
"reward_std": 0.028343133628368378,
"rewards/answer_entity_reward": 0.9899839758872986,
"rewards/answer_wer_reward": 0.9533904790878296,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9748475551605225,
"step": 493
},
{
"completion_length": 205.21875,
"epoch": 1.5792000000000002,
"grad_norm": 0.8041574954986572,
"kl": 0.0572509765625,
"learning_rate": 3.8375e-07,
"loss": 0.0006,
"reward": 3.9712276458740234,
"reward_std": 0.006993145681917667,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9721719622612,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990555644035339,
"step": 494
},
{
"completion_length": 245.84375,
"epoch": 1.5824,
"grad_norm": 1.4723294973373413,
"kl": 0.0518798828125,
"learning_rate": 3.825e-07,
"loss": 0.0005,
"reward": 3.9171528816223145,
"reward_std": 0.007540189428254962,
"rewards/answer_entity_reward": 0.9707792401313782,
"rewards/answer_wer_reward": 0.9463737607002258,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 495
},
{
"completion_length": 191.1875,
"epoch": 1.5856,
"grad_norm": 5.778710842132568,
"kl": 0.095703125,
"learning_rate": 3.8124999999999995e-07,
"loss": 0.001,
"reward": 3.7989085912704468,
"reward_std": 0.02309321239590645,
"rewards/answer_entity_reward": 0.9837072491645813,
"rewards/answer_wer_reward": 0.9482426345348358,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.866958737373352,
"step": 496
},
{
"completion_length": 164.375,
"epoch": 1.5888,
"grad_norm": 3.773331880569458,
"kl": 0.0452880859375,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0005,
"reward": 3.957179307937622,
"reward_std": 0.03012340608984232,
"rewards/answer_entity_reward": 0.995192289352417,
"rewards/answer_wer_reward": 0.9724558889865875,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9895310997962952,
"step": 497
},
{
"completion_length": 190.34375,
"epoch": 1.592,
"grad_norm": 1.7698373794555664,
"kl": 0.0579833984375,
"learning_rate": 3.7875e-07,
"loss": 0.0006,
"reward": 3.9473685026168823,
"reward_std": 0.009419793263077736,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9480363428592682,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993322789669037,
"step": 498
},
{
"completion_length": 223.03125,
"epoch": 1.5952,
"grad_norm": 1.197536587715149,
"kl": 0.074462890625,
"learning_rate": 3.775e-07,
"loss": 0.0007,
"reward": 3.9201695919036865,
"reward_std": 0.012398123741149902,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9409077167510986,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9931506812572479,
"step": 499
},
{
"completion_length": 204.46875,
"epoch": 1.5984,
"grad_norm": 1.5246530771255493,
"kl": 0.0849609375,
"learning_rate": 3.7624999999999994e-07,
"loss": 0.0008,
"reward": 3.9556870460510254,
"reward_std": 0.010473677422851324,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9580392241477966,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9976478517055511,
"step": 500
},
{
"completion_length": 230.0625,
"epoch": 1.6016,
"grad_norm": 1.1340093612670898,
"kl": 0.10595703125,
"learning_rate": 3.75e-07,
"loss": 0.0011,
"reward": 3.9659206867218018,
"reward_std": 0.008191006258130074,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9659207165241241,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 501
},
{
"completion_length": 185.15625,
"epoch": 1.6048,
"grad_norm": 1.2874914407730103,
"kl": 0.045654296875,
"learning_rate": 3.7375e-07,
"loss": 0.0005,
"reward": 3.9568817615509033,
"reward_std": 0.011238863109610975,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9603540003299713,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 502
},
{
"completion_length": 241.78125,
"epoch": 1.608,
"grad_norm": 0.9499295353889465,
"kl": 0.0531005859375,
"learning_rate": 3.725e-07,
"loss": 0.0005,
"reward": 3.9388747215270996,
"reward_std": 0.008348907809704542,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.9510295391082764,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9992088675498962,
"step": 503
},
{
"completion_length": 233.25,
"epoch": 1.6112,
"grad_norm": 1.0857101678848267,
"kl": 0.062744140625,
"learning_rate": 3.7125e-07,
"loss": 0.0006,
"reward": 3.958517551422119,
"reward_std": 0.0058578201569616795,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.958990752696991,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995267689228058,
"step": 504
},
{
"completion_length": 251.78125,
"epoch": 1.6143999999999998,
"grad_norm": 28.171039581298828,
"kl": 0.114013671875,
"learning_rate": 3.7e-07,
"loss": 0.0011,
"reward": 3.866329312324524,
"reward_std": 0.01942992489784956,
"rewards/answer_entity_reward": 0.9720904231071472,
"rewards/answer_wer_reward": 0.8955735862255096,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986652433872223,
"step": 505
},
{
"completion_length": 186.46875,
"epoch": 1.6176,
"grad_norm": 6.638906955718994,
"kl": 0.06884765625,
"learning_rate": 3.6875e-07,
"loss": 0.0007,
"reward": 3.7806142568588257,
"reward_std": 0.013823950197547674,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.945627748966217,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8378273248672485,
"step": 506
},
{
"completion_length": 225.375,
"epoch": 1.6208,
"grad_norm": 2.12021803855896,
"kl": 0.07177734375,
"learning_rate": 3.675e-07,
"loss": 0.0007,
"reward": 3.9451769590377808,
"reward_std": 0.013169697020202875,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9672558605670929,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.977921187877655,
"step": 507
},
{
"completion_length": 219.125,
"epoch": 1.624,
"grad_norm": 1.5153933763504028,
"kl": 0.053955078125,
"learning_rate": 3.6625e-07,
"loss": 0.0005,
"reward": 3.959490180015564,
"reward_std": 0.010949777672067285,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9636567533016205,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 508
},
{
"completion_length": 228.4375,
"epoch": 1.6272,
"grad_norm": 3.832310676574707,
"kl": 0.0521240234375,
"learning_rate": 3.65e-07,
"loss": 0.0005,
"reward": 3.953840732574463,
"reward_std": 0.017153040505945683,
"rewards/answer_entity_reward": 0.9936868846416473,
"rewards/answer_wer_reward": 0.9603707194328308,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997829794883728,
"step": 509
},
{
"completion_length": 243.46875,
"epoch": 1.6303999999999998,
"grad_norm": 1.285962462425232,
"kl": 0.0673828125,
"learning_rate": 3.6375e-07,
"loss": 0.0007,
"reward": 3.960462808609009,
"reward_std": 0.0062334975227713585,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9608500599861145,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996127486228943,
"step": 510
},
{
"completion_length": 262.65625,
"epoch": 1.6336,
"grad_norm": 1.124130368232727,
"kl": 0.0596923828125,
"learning_rate": 3.6249999999999997e-07,
"loss": 0.0006,
"reward": 3.941042900085449,
"reward_std": 0.01204587472602725,
"rewards/answer_entity_reward": 0.9970238208770752,
"rewards/answer_wer_reward": 0.9446144104003906,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994047582149506,
"step": 511
},
{
"completion_length": 182.28125,
"epoch": 1.6368,
"grad_norm": 1.9966425895690918,
"kl": 0.061279296875,
"learning_rate": 3.6125e-07,
"loss": 0.0006,
"reward": 3.9531023502349854,
"reward_std": 0.02773769712075591,
"rewards/answer_entity_reward": 0.9917200803756714,
"rewards/answer_wer_reward": 0.9697157144546509,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9916666746139526,
"step": 512
},
{
"completion_length": 218.125,
"epoch": 1.6400000000000001,
"grad_norm": 3.2862062454223633,
"kl": 0.04736328125,
"learning_rate": 3.6e-07,
"loss": 0.0005,
"reward": 3.858319878578186,
"reward_std": 0.07778534758836031,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9565341770648956,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.90625,
"step": 513
},
{
"completion_length": 235.03125,
"epoch": 1.6432,
"grad_norm": 1.14111328125,
"kl": 0.054443359375,
"learning_rate": 3.5875e-07,
"loss": 0.0005,
"reward": 3.967674970626831,
"reward_std": 0.0044005257077515125,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9691169261932373,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985580444335938,
"step": 514
},
{
"completion_length": 233.90625,
"epoch": 1.6463999999999999,
"grad_norm": 1.2006644010543823,
"kl": 0.06103515625,
"learning_rate": 3.5749999999999997e-07,
"loss": 0.0006,
"reward": 3.959411859512329,
"reward_std": 0.005820953520014882,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9596619009971619,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999750018119812,
"step": 515
},
{
"completion_length": 252.5625,
"epoch": 1.6496,
"grad_norm": 0.7272346615791321,
"kl": 0.0428466796875,
"learning_rate": 3.5625e-07,
"loss": 0.0004,
"reward": 3.963356375694275,
"reward_std": 0.0036240214249119163,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.964261919260025,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990943968296051,
"step": 516
},
{
"completion_length": 240.6875,
"epoch": 1.6528,
"grad_norm": 1.0241456031799316,
"kl": 0.0665283203125,
"learning_rate": 3.55e-07,
"loss": 0.0007,
"reward": 3.953768730163574,
"reward_std": 0.012724505737423897,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9555812776088715,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9981874525547028,
"step": 517
},
{
"completion_length": 221.5625,
"epoch": 1.6560000000000001,
"grad_norm": 0.9653159379959106,
"kl": 0.0732421875,
"learning_rate": 3.5375e-07,
"loss": 0.0007,
"reward": 3.928879141807556,
"reward_std": 0.03069964610040188,
"rewards/answer_entity_reward": 0.9769324958324432,
"rewards/answer_wer_reward": 0.9525844156742096,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993622303009033,
"step": 518
},
{
"completion_length": 186.53125,
"epoch": 1.6592,
"grad_norm": 1.616326928138733,
"kl": 0.0673828125,
"learning_rate": 3.5249999999999996e-07,
"loss": 0.0007,
"reward": 3.963484525680542,
"reward_std": 0.0024420777335762978,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9634844958782196,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 519
},
{
"completion_length": 197.53125,
"epoch": 1.6623999999999999,
"grad_norm": 1.1605949401855469,
"kl": 0.066162109375,
"learning_rate": 3.5124999999999997e-07,
"loss": 0.0007,
"reward": 3.871947407722473,
"reward_std": 0.008121895836666226,
"rewards/answer_entity_reward": 0.9832701981067657,
"rewards/answer_wer_reward": 0.9628296792507172,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9258474707603455,
"step": 520
},
{
"completion_length": 199.9375,
"epoch": 1.6656,
"grad_norm": 2.1799464225769043,
"kl": 0.098876953125,
"learning_rate": 3.5e-07,
"loss": 0.001,
"reward": 3.914597272872925,
"reward_std": 0.046278308145701885,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9440673291683197,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9705299139022827,
"step": 521
},
{
"completion_length": 213.78125,
"epoch": 1.6688,
"grad_norm": 1.8315109014511108,
"kl": 0.0609130859375,
"learning_rate": 3.4875e-07,
"loss": 0.0006,
"reward": 3.934143304824829,
"reward_std": 0.005300799617543817,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9627971351146698,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9713463485240936,
"step": 522
},
{
"completion_length": 233.21875,
"epoch": 1.6720000000000002,
"grad_norm": 2.7353854179382324,
"kl": 0.0634765625,
"learning_rate": 3.4749999999999996e-07,
"loss": 0.0006,
"reward": 3.940351963043213,
"reward_std": 0.012048345990478992,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9584531188011169,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9818988144397736,
"step": 523
},
{
"completion_length": 226.8125,
"epoch": 1.6752,
"grad_norm": 1.2798601388931274,
"kl": 0.0517578125,
"learning_rate": 3.4624999999999997e-07,
"loss": 0.0005,
"reward": 3.94057559967041,
"reward_std": 0.016422050073742867,
"rewards/answer_entity_reward": 0.9859203100204468,
"rewards/answer_wer_reward": 0.9546553492546082,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 524
},
{
"completion_length": 225.375,
"epoch": 1.6784,
"grad_norm": 2.434398651123047,
"kl": 0.0570068359375,
"learning_rate": 3.45e-07,
"loss": 0.0006,
"reward": 3.9358779191970825,
"reward_std": 0.02181497309356928,
"rewards/answer_entity_reward": 0.9961080551147461,
"rewards/answer_wer_reward": 0.9410910904407501,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986788034439087,
"step": 525
},
{
"completion_length": 181.21875,
"epoch": 1.6816,
"grad_norm": 1.322139859199524,
"kl": 0.116943359375,
"learning_rate": 3.4375e-07,
"loss": 0.0012,
"reward": 3.946447730064392,
"reward_std": 0.007033249130472541,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9464477598667145,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 526
},
{
"completion_length": 195.8125,
"epoch": 1.6848,
"grad_norm": 1.412061333656311,
"kl": 0.06640625,
"learning_rate": 3.425e-07,
"loss": 0.0007,
"reward": 3.936468005180359,
"reward_std": 0.00922114565037191,
"rewards/answer_entity_reward": 0.9841346144676208,
"rewards/answer_wer_reward": 0.952333390712738,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 527
},
{
"completion_length": 210.84375,
"epoch": 1.688,
"grad_norm": 3.695819139480591,
"kl": 0.056640625,
"learning_rate": 3.4124999999999996e-07,
"loss": 0.0006,
"reward": 3.894517421722412,
"reward_std": 0.015210594050586224,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9634661674499512,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9367331266403198,
"step": 528
},
{
"completion_length": 220.09375,
"epoch": 1.6912,
"grad_norm": 1.6299357414245605,
"kl": 0.0711669921875,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0007,
"reward": 3.9391125440597534,
"reward_std": 0.014290765568148345,
"rewards/answer_entity_reward": 0.9847222566604614,
"rewards/answer_wer_reward": 0.954390287399292,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 529
},
{
"completion_length": 195.65625,
"epoch": 1.6944,
"grad_norm": 4.491413116455078,
"kl": 0.064453125,
"learning_rate": 3.3875e-07,
"loss": 0.0007,
"reward": 3.971281409263611,
"reward_std": 0.017785906326025724,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9796920418739319,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9944303929805756,
"step": 530
},
{
"completion_length": 208.34375,
"epoch": 1.6976,
"grad_norm": 4.832588195800781,
"kl": 0.0972900390625,
"learning_rate": 3.375e-07,
"loss": 0.001,
"reward": 3.9011433124542236,
"reward_std": 0.010198547039180994,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9640267491340637,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9371165633201599,
"step": 531
},
{
"completion_length": 203.40625,
"epoch": 1.7008,
"grad_norm": 3.4038021564483643,
"kl": 0.071044921875,
"learning_rate": 3.3624999999999996e-07,
"loss": 0.0007,
"reward": 3.9605783224105835,
"reward_std": 0.0076046837493777275,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9607688188552856,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9998094439506531,
"step": 532
},
{
"completion_length": 241.8125,
"epoch": 1.704,
"grad_norm": 1.0362496376037598,
"kl": 0.063232421875,
"learning_rate": 3.35e-07,
"loss": 0.0006,
"reward": 3.9339258670806885,
"reward_std": 0.018858356634154916,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9387494027614594,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996408224105835,
"step": 533
},
{
"completion_length": 235.90625,
"epoch": 1.7072,
"grad_norm": 3.604599714279175,
"kl": 0.0853271484375,
"learning_rate": 3.3375e-07,
"loss": 0.0009,
"reward": 3.861118197441101,
"reward_std": 0.011326078558340669,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9576848149299622,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9034333229064941,
"step": 534
},
{
"completion_length": 229.0,
"epoch": 1.7104,
"grad_norm": 2.319185256958008,
"kl": 0.052001953125,
"learning_rate": 3.325e-07,
"loss": 0.0005,
"reward": 3.9228227138519287,
"reward_std": 0.03856424614787102,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9560109972953796,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9753345847129822,
"step": 535
},
{
"completion_length": 224.71875,
"epoch": 1.7136,
"grad_norm": 2.444124460220337,
"kl": 0.080810546875,
"learning_rate": 3.3124999999999995e-07,
"loss": 0.0008,
"reward": 3.9688942432403564,
"reward_std": 0.003912239335477352,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9688942730426788,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 536
},
{
"completion_length": 224.3125,
"epoch": 1.7168,
"grad_norm": 6.20790958404541,
"kl": 0.064697265625,
"learning_rate": 3.3e-07,
"loss": 0.0006,
"reward": 3.8677161931991577,
"reward_std": 0.02981195878237486,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9471929371356964,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9233641624450684,
"step": 537
},
{
"completion_length": 150.34375,
"epoch": 1.72,
"grad_norm": 1.6208490133285522,
"kl": 0.03924560546875,
"learning_rate": 3.2875e-07,
"loss": 0.0004,
"reward": 3.9733328819274902,
"reward_std": 0.002679725643247366,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9733329117298126,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 538
},
{
"completion_length": 183.0,
"epoch": 1.7231999999999998,
"grad_norm": 1.2286797761917114,
"kl": 0.057861328125,
"learning_rate": 3.275e-07,
"loss": 0.0006,
"reward": 3.935777187347412,
"reward_std": 0.003249647794291377,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9795266687870026,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9562505781650543,
"step": 539
},
{
"completion_length": 234.625,
"epoch": 1.7264,
"grad_norm": 1.304764747619629,
"kl": 0.054931640625,
"learning_rate": 3.2624999999999995e-07,
"loss": 0.0005,
"reward": 3.950987696647644,
"reward_std": 0.00898568145930767,
"rewards/answer_entity_reward": 0.9958333373069763,
"rewards/answer_wer_reward": 0.9557509124279022,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994034171104431,
"step": 540
},
{
"completion_length": 183.96875,
"epoch": 1.7296,
"grad_norm": 1.3975461721420288,
"kl": 0.07421875,
"learning_rate": 3.25e-07,
"loss": 0.0007,
"reward": 3.918307065963745,
"reward_std": 0.01607332704588771,
"rewards/answer_entity_reward": 0.9720314145088196,
"rewards/answer_wer_reward": 0.9547825455665588,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9914930462837219,
"step": 541
},
{
"completion_length": 204.0625,
"epoch": 1.7328000000000001,
"grad_norm": 2.0030770301818848,
"kl": 0.070068359375,
"learning_rate": 3.2374999999999997e-07,
"loss": 0.0007,
"reward": 3.9624624252319336,
"reward_std": 0.011391833890229464,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9645456969738007,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 542
},
{
"completion_length": 236.8125,
"epoch": 1.736,
"grad_norm": 1.0529872179031372,
"kl": 0.06396484375,
"learning_rate": 3.225e-07,
"loss": 0.0006,
"reward": 3.9355998039245605,
"reward_std": 0.011712775565683842,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.946576714515686,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9890230894088745,
"step": 543
},
{
"completion_length": 171.03125,
"epoch": 1.7391999999999999,
"grad_norm": 1.4777579307556152,
"kl": 0.07861328125,
"learning_rate": 3.2124999999999994e-07,
"loss": 0.0008,
"reward": 3.959132194519043,
"reward_std": 0.007866068510338664,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9591321349143982,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 544
},
{
"completion_length": 199.03125,
"epoch": 1.7424,
"grad_norm": 1.5819900035858154,
"kl": 0.07666015625,
"learning_rate": 3.2e-07,
"loss": 0.0008,
"reward": 3.9456801414489746,
"reward_std": 0.01446144049987197,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9492515921592712,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9985119104385376,
"step": 545
},
{
"completion_length": 243.53125,
"epoch": 1.7456,
"grad_norm": 6.461181640625,
"kl": 0.1029052734375,
"learning_rate": 3.1874999999999997e-07,
"loss": 0.001,
"reward": 3.9253257513046265,
"reward_std": 0.013943355064839125,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9411455988883972,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9980691075325012,
"step": 546
},
{
"completion_length": 190.21875,
"epoch": 1.7488000000000001,
"grad_norm": 1.5046278238296509,
"kl": 0.0430908203125,
"learning_rate": 3.175e-07,
"loss": 0.0004,
"reward": 3.946847081184387,
"reward_std": 0.006090850802138448,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9579125344753265,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9889345765113831,
"step": 547
},
{
"completion_length": 199.5625,
"epoch": 1.752,
"grad_norm": 2.7514781951904297,
"kl": 0.054931640625,
"learning_rate": 3.1624999999999994e-07,
"loss": 0.0006,
"reward": 3.9198288917541504,
"reward_std": 0.008053636411204934,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9198288321495056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 548
},
{
"completion_length": 244.625,
"epoch": 1.7551999999999999,
"grad_norm": 1.0448155403137207,
"kl": 0.0426025390625,
"learning_rate": 3.15e-07,
"loss": 0.0004,
"reward": 3.958520531654358,
"reward_std": 0.008235724177211523,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9585205316543579,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 549
},
{
"completion_length": 249.0,
"epoch": 1.7584,
"grad_norm": 128.38499450683594,
"kl": 17.28076171875,
"learning_rate": 3.1374999999999996e-07,
"loss": 0.172,
"reward": 3.932722330093384,
"reward_std": 0.012139817699790001,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9340447783470154,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998677521944046,
"step": 550
},
{
"completion_length": 202.25,
"epoch": 1.7616,
"grad_norm": 1.6289058923721313,
"kl": 0.0709228515625,
"learning_rate": 3.1249999999999997e-07,
"loss": 0.0007,
"reward": 3.931633234024048,
"reward_std": 0.015017563942819834,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9620243012905121,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9752906560897827,
"step": 551
},
{
"completion_length": 223.65625,
"epoch": 1.7648000000000001,
"grad_norm": 0.650069534778595,
"kl": 0.0467529296875,
"learning_rate": 3.1125000000000004e-07,
"loss": 0.0005,
"reward": 3.9622879028320312,
"reward_std": 0.004962240578606725,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9622879028320312,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 552
},
{
"completion_length": 238.65625,
"epoch": 1.768,
"grad_norm": 9.516084671020508,
"kl": 0.0474853515625,
"learning_rate": 3.1e-07,
"loss": 0.0005,
"reward": 3.9525749683380127,
"reward_std": 0.012759724631905556,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9610438644886017,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977810680866241,
"step": 553
},
{
"completion_length": 224.65625,
"epoch": 1.7711999999999999,
"grad_norm": 1.8886899948120117,
"kl": 0.044189453125,
"learning_rate": 3.0875e-07,
"loss": 0.0004,
"reward": 3.9586617946624756,
"reward_std": 0.01200480293482542,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9675752222537994,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996093809604645,
"step": 554
},
{
"completion_length": 220.9375,
"epoch": 1.7744,
"grad_norm": 5.122376918792725,
"kl": 0.048828125,
"learning_rate": 3.0749999999999997e-07,
"loss": 0.0005,
"reward": 3.9466060400009155,
"reward_std": 0.016119306907057762,
"rewards/answer_entity_reward": 0.9965170323848724,
"rewards/answer_wer_reward": 0.9567474722862244,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.993341475725174,
"step": 555
},
{
"completion_length": 198.8125,
"epoch": 1.7776,
"grad_norm": 4.916889667510986,
"kl": 0.068115234375,
"learning_rate": 3.0625000000000003e-07,
"loss": 0.0007,
"reward": 3.949711561203003,
"reward_std": 0.0163404387421906,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9576182961463928,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9920931458473206,
"step": 556
},
{
"completion_length": 180.875,
"epoch": 1.7808000000000002,
"grad_norm": 10.021855354309082,
"kl": 0.072021484375,
"learning_rate": 3.05e-07,
"loss": 0.0007,
"reward": 3.867478370666504,
"reward_std": 0.047242360189557076,
"rewards/answer_entity_reward": 0.9821428656578064,
"rewards/answer_wer_reward": 0.9576010704040527,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.927734375,
"step": 557
},
{
"completion_length": 227.8125,
"epoch": 1.784,
"grad_norm": 1.7502044439315796,
"kl": 0.04443359375,
"learning_rate": 3.0375e-07,
"loss": 0.0004,
"reward": 3.9525381326675415,
"reward_std": 0.013325697276741266,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9532942175865173,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999243974685669,
"step": 558
},
{
"completion_length": 204.15625,
"epoch": 1.7872,
"grad_norm": 5.304961681365967,
"kl": 0.0496826171875,
"learning_rate": 3.0249999999999996e-07,
"loss": 0.0005,
"reward": 3.957284450531006,
"reward_std": 0.005683758878149092,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9572845101356506,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 559
},
{
"completion_length": 228.34375,
"epoch": 1.7904,
"grad_norm": 1.2513984441757202,
"kl": 0.0577392578125,
"learning_rate": 3.0125000000000003e-07,
"loss": 0.0006,
"reward": 3.94599187374115,
"reward_std": 0.00800859834998846,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.957431435585022,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9885604083538055,
"step": 560
},
{
"completion_length": 211.03125,
"epoch": 1.7936,
"grad_norm": 5.97805118560791,
"kl": 0.1036376953125,
"learning_rate": 3e-07,
"loss": 0.001,
"reward": 3.9404828548431396,
"reward_std": 0.01265423372387886,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9433237612247467,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 561
},
{
"completion_length": 205.4375,
"epoch": 1.7968,
"grad_norm": 3.833575487136841,
"kl": 0.22998046875,
"learning_rate": 2.9875e-07,
"loss": 0.0023,
"reward": 3.909332752227783,
"reward_std": 0.007294894196093082,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9648370146751404,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9444957971572876,
"step": 562
},
{
"completion_length": 207.4375,
"epoch": 1.8,
"grad_norm": 0.8627040982246399,
"kl": 0.0611572265625,
"learning_rate": 2.9749999999999996e-07,
"loss": 0.0006,
"reward": 3.9548414945602417,
"reward_std": 0.006908831186592579,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9550975561141968,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997438490390778,
"step": 563
},
{
"completion_length": 198.15625,
"epoch": 1.8032,
"grad_norm": 0.9193502068519592,
"kl": 0.0518798828125,
"learning_rate": 2.9625e-07,
"loss": 0.0005,
"reward": 3.9462149143218994,
"reward_std": 0.007913234177976847,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9465437531471252,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996710419654846,
"step": 564
},
{
"completion_length": 198.15625,
"epoch": 1.8064,
"grad_norm": 1.9635776281356812,
"kl": 0.059814453125,
"learning_rate": 2.95e-07,
"loss": 0.0006,
"reward": 3.896806240081787,
"reward_std": 0.012922112364321947,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9503778219223022,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9464285671710968,
"step": 565
},
{
"completion_length": 164.90625,
"epoch": 1.8096,
"grad_norm": 1.2068322896957397,
"kl": 0.09375,
"learning_rate": 2.9375e-07,
"loss": 0.0009,
"reward": 3.8490008115768433,
"reward_std": 0.1467541428282857,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9502907395362854,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9328009486198425,
"step": 566
},
{
"completion_length": 206.34375,
"epoch": 1.8128,
"grad_norm": 2.1644375324249268,
"kl": 0.08251953125,
"learning_rate": 2.9249999999999995e-07,
"loss": 0.0008,
"reward": 3.970282793045044,
"reward_std": 0.0077400594018399715,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9728601574897766,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974226951599121,
"step": 567
},
{
"completion_length": 233.09375,
"epoch": 1.8159999999999998,
"grad_norm": 1.106130599975586,
"kl": 0.0552978515625,
"learning_rate": 2.9125e-07,
"loss": 0.0005,
"reward": 3.9414994716644287,
"reward_std": 0.011295767035335302,
"rewards/answer_entity_reward": 0.9848698973655701,
"rewards/answer_wer_reward": 0.9577165246009827,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998913049697876,
"step": 568
},
{
"completion_length": 206.46875,
"epoch": 1.8192,
"grad_norm": 1.2371478080749512,
"kl": 0.0599365234375,
"learning_rate": 2.9e-07,
"loss": 0.0006,
"reward": 3.9829952716827393,
"reward_std": 0.007155058206990361,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9829952716827393,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 569
},
{
"completion_length": 227.875,
"epoch": 1.8224,
"grad_norm": 0.9648468494415283,
"kl": 0.0587158203125,
"learning_rate": 2.8875e-07,
"loss": 0.0006,
"reward": 3.875002384185791,
"reward_std": 0.007613388821482658,
"rewards/answer_entity_reward": 0.9604166746139526,
"rewards/answer_wer_reward": 0.9299702048301697,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9846153855323792,
"step": 570
},
{
"completion_length": 242.1875,
"epoch": 1.8256000000000001,
"grad_norm": 3.7682442665100098,
"kl": 0.0732421875,
"learning_rate": 2.8749999999999995e-07,
"loss": 0.0007,
"reward": 3.790624737739563,
"reward_std": 0.14343099505640566,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9464230239391327,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.8754517436027527,
"step": 571
},
{
"completion_length": 248.28125,
"epoch": 1.8288,
"grad_norm": 0.7550325393676758,
"kl": 0.039794921875,
"learning_rate": 2.8625e-07,
"loss": 0.0004,
"reward": 3.9295032024383545,
"reward_std": 0.004920503590255976,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9295033514499664,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 572
},
{
"completion_length": 222.4375,
"epoch": 1.8319999999999999,
"grad_norm": 1.055333137512207,
"kl": 0.0567626953125,
"learning_rate": 2.8499999999999997e-07,
"loss": 0.0006,
"reward": 3.929059386253357,
"reward_std": 0.014613255392760038,
"rewards/answer_entity_reward": 0.9819711446762085,
"rewards/answer_wer_reward": 0.9496394395828247,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997448742389679,
"step": 573
},
{
"completion_length": 217.40625,
"epoch": 1.8352,
"grad_norm": 1.640468716621399,
"kl": 0.0443115234375,
"learning_rate": 2.8375e-07,
"loss": 0.0004,
"reward": 3.9705777168273926,
"reward_std": 0.013166352873668075,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9736025929450989,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9998161792755127,
"step": 574
},
{
"completion_length": 229.1875,
"epoch": 1.8384,
"grad_norm": 3.271684169769287,
"kl": 0.0567626953125,
"learning_rate": 2.8249999999999994e-07,
"loss": 0.0006,
"reward": 3.9389246702194214,
"reward_std": 0.007664299104362726,
"rewards/answer_entity_reward": 0.9833333492279053,
"rewards/answer_wer_reward": 0.9555914402008057,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 575
},
{
"completion_length": 203.28125,
"epoch": 1.8416000000000001,
"grad_norm": 1.6847234964370728,
"kl": 0.063232421875,
"learning_rate": 2.8125e-07,
"loss": 0.0006,
"reward": 3.9692747592926025,
"reward_std": 0.006263851770199835,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9701676964759827,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999107152223587,
"step": 576
},
{
"completion_length": 251.0625,
"epoch": 1.8448,
"grad_norm": 4.737148761749268,
"kl": 0.128173828125,
"learning_rate": 2.8e-07,
"loss": 0.0013,
"reward": 3.935584545135498,
"reward_std": 0.016471964307129383,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9418345093727112,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 577
},
{
"completion_length": 199.5,
"epoch": 1.8479999999999999,
"grad_norm": 1.7424699068069458,
"kl": 0.0618896484375,
"learning_rate": 2.7875e-07,
"loss": 0.0006,
"reward": 3.966155171394348,
"reward_std": 0.012047166470438242,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9755966663360596,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9905584752559662,
"step": 578
},
{
"completion_length": 192.96875,
"epoch": 1.8512,
"grad_norm": 0.8571773171424866,
"kl": 0.0526123046875,
"learning_rate": 2.775e-07,
"loss": 0.0005,
"reward": 3.977761387825012,
"reward_std": 0.0047087406273931265,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9777614176273346,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 579
},
{
"completion_length": 223.6875,
"epoch": 1.8544,
"grad_norm": 1.3312608003616333,
"kl": 0.050537109375,
"learning_rate": 2.7625e-07,
"loss": 0.0005,
"reward": 3.9508321285247803,
"reward_std": 0.00891483761370182,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9508320689201355,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 580
},
{
"completion_length": 241.96875,
"epoch": 1.8576000000000001,
"grad_norm": 4.553063869476318,
"kl": 0.19140625,
"learning_rate": 2.75e-07,
"loss": 0.0019,
"reward": 3.925418257713318,
"reward_std": 0.016543671488761902,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9290946125984192,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 581
},
{
"completion_length": 241.90625,
"epoch": 1.8608,
"grad_norm": 0.8970361948013306,
"kl": 0.065185546875,
"learning_rate": 2.7374999999999997e-07,
"loss": 0.0007,
"reward": 3.9467151165008545,
"reward_std": 0.007796656806021929,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9470826387405396,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996323585510254,
"step": 582
},
{
"completion_length": 246.96875,
"epoch": 1.8639999999999999,
"grad_norm": 1.9463343620300293,
"kl": 0.04547119140625,
"learning_rate": 2.725e-07,
"loss": 0.0005,
"reward": 3.940864324569702,
"reward_std": 0.011073273373767734,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9416800141334534,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991843402385712,
"step": 583
},
{
"completion_length": 206.625,
"epoch": 1.8672,
"grad_norm": 4.5208892822265625,
"kl": 0.092529296875,
"learning_rate": 2.7125e-07,
"loss": 0.0009,
"reward": 3.8930487632751465,
"reward_std": 0.032747200690209866,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9660382270812988,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9326923191547394,
"step": 584
},
{
"completion_length": 255.25,
"epoch": 1.8704,
"grad_norm": 2.1606805324554443,
"kl": 0.04736328125,
"learning_rate": 2.7e-07,
"loss": 0.0005,
"reward": 3.936957836151123,
"reward_std": 0.013339729979634285,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9393823444843292,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.997575432062149,
"step": 585
},
{
"completion_length": 226.3125,
"epoch": 1.8736000000000002,
"grad_norm": 0.7422674298286438,
"kl": 0.048095703125,
"learning_rate": 2.6874999999999997e-07,
"loss": 0.0005,
"reward": 3.9866139888763428,
"reward_std": 0.0038484669639728963,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.987176924943924,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994369745254517,
"step": 586
},
{
"completion_length": 214.59375,
"epoch": 1.8768,
"grad_norm": 1.313864827156067,
"kl": 0.0684814453125,
"learning_rate": 2.675e-07,
"loss": 0.0007,
"reward": 3.9567151069641113,
"reward_std": 0.012406408437527716,
"rewards/answer_entity_reward": 0.9832702279090881,
"rewards/answer_wer_reward": 0.9734448790550232,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 587
},
{
"completion_length": 256.46875,
"epoch": 1.88,
"grad_norm": 1.4952497482299805,
"kl": 0.1278076171875,
"learning_rate": 2.6625e-07,
"loss": 0.0013,
"reward": 3.8717525005340576,
"reward_std": 0.13869436737149954,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9397719204425812,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9660714268684387,
"step": 588
},
{
"completion_length": 217.09375,
"epoch": 1.8832,
"grad_norm": 1.3716284036636353,
"kl": 0.054931640625,
"learning_rate": 2.65e-07,
"loss": 0.0006,
"reward": 3.962627410888672,
"reward_std": 0.006240109680220485,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9626273214817047,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 589
},
{
"completion_length": 253.0,
"epoch": 1.8864,
"grad_norm": 1.4284135103225708,
"kl": 0.07080078125,
"learning_rate": 2.6374999999999996e-07,
"loss": 0.0007,
"reward": 3.9501919746398926,
"reward_std": 0.012296234723180532,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9531300067901611,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970619678497314,
"step": 590
},
{
"completion_length": 204.5,
"epoch": 1.8896,
"grad_norm": 3.8569161891937256,
"kl": 0.07421875,
"learning_rate": 2.625e-07,
"loss": 0.0007,
"reward": 3.9426995515823364,
"reward_std": 0.027584614232182503,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9779268503189087,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9647727012634277,
"step": 591
},
{
"completion_length": 229.0625,
"epoch": 1.8928,
"grad_norm": 2.589956760406494,
"kl": 0.08203125,
"learning_rate": 2.6125e-07,
"loss": 0.0008,
"reward": 3.9178069829940796,
"reward_std": 0.007971604820340872,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.95549076795578,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9623160660266876,
"step": 592
},
{
"completion_length": 170.65625,
"epoch": 1.896,
"grad_norm": 3.586792469024658,
"kl": 0.0423583984375,
"learning_rate": 2.6e-07,
"loss": 0.0004,
"reward": 3.9206513166427612,
"reward_std": 0.023992381058633327,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9824000000953674,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9382513165473938,
"step": 593
},
{
"completion_length": 229.34375,
"epoch": 1.8992,
"grad_norm": 4.520889759063721,
"kl": 0.07421875,
"learning_rate": 2.5874999999999996e-07,
"loss": 0.0007,
"reward": 3.942514419555664,
"reward_std": 0.038696477888152,
"rewards/answer_entity_reward": 0.984275609254837,
"rewards/answer_wer_reward": 0.9582389295101166,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 594
},
{
"completion_length": 223.4375,
"epoch": 1.9024,
"grad_norm": 1.3104579448699951,
"kl": 0.0565185546875,
"learning_rate": 2.5749999999999997e-07,
"loss": 0.0006,
"reward": 3.976773500442505,
"reward_std": 0.0044562743860296905,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9767734706401825,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 595
},
{
"completion_length": 254.09375,
"epoch": 1.9056,
"grad_norm": 1.03975510597229,
"kl": 0.05322265625,
"learning_rate": 2.5625e-07,
"loss": 0.0005,
"reward": 3.943529725074768,
"reward_std": 0.009816794656217098,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9451378583908081,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9983919560909271,
"step": 596
},
{
"completion_length": 243.03125,
"epoch": 1.9088,
"grad_norm": 1.0213077068328857,
"kl": 0.0506591796875,
"learning_rate": 2.55e-07,
"loss": 0.0005,
"reward": 3.9278059005737305,
"reward_std": 0.00602961634285748,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9420903027057648,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996044337749481,
"step": 597
},
{
"completion_length": 182.46875,
"epoch": 1.912,
"grad_norm": 1.8683794736862183,
"kl": 0.065185546875,
"learning_rate": 2.5374999999999995e-07,
"loss": 0.0007,
"reward": 3.9624691009521484,
"reward_std": 0.012565109878778458,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9729967415332794,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9939365684986115,
"step": 598
},
{
"completion_length": 166.25,
"epoch": 1.9152,
"grad_norm": 1.716305136680603,
"kl": 0.0968017578125,
"learning_rate": 2.5249999999999996e-07,
"loss": 0.001,
"reward": 3.896498918533325,
"reward_std": 0.11676233587786555,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9749563038349152,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9527925550937653,
"step": 599
},
{
"completion_length": 199.59375,
"epoch": 1.9184,
"grad_norm": 1.2319942712783813,
"kl": 0.0775146484375,
"learning_rate": 2.5125e-07,
"loss": 0.0008,
"reward": 3.9489831924438477,
"reward_std": 0.010235858615487814,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9580873548984528,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9908958971500397,
"step": 600
},
{
"completion_length": 210.53125,
"epoch": 1.9216,
"grad_norm": 1.0385370254516602,
"kl": 0.0650634765625,
"learning_rate": 2.5e-07,
"loss": 0.0007,
"reward": 3.966851830482483,
"reward_std": 0.005628936691209674,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9668518006801605,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 601
},
{
"completion_length": 186.9375,
"epoch": 1.9247999999999998,
"grad_norm": 2.1772327423095703,
"kl": 0.11279296875,
"learning_rate": 2.4875e-07,
"loss": 0.0011,
"reward": 3.9322038888931274,
"reward_std": 0.01743672974407673,
"rewards/answer_entity_reward": 0.9880681931972504,
"rewards/answer_wer_reward": 0.9574334919452667,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9867021441459656,
"step": 602
},
{
"completion_length": 209.65625,
"epoch": 1.928,
"grad_norm": 0.9661850929260254,
"kl": 0.072998046875,
"learning_rate": 2.475e-07,
"loss": 0.0007,
"reward": 3.9598844051361084,
"reward_std": 0.009228286100551486,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.966718465089798,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994158744812012,
"step": 603
},
{
"completion_length": 193.65625,
"epoch": 1.9312,
"grad_norm": 2.6254851818084717,
"kl": 0.102294921875,
"learning_rate": 2.4624999999999997e-07,
"loss": 0.001,
"reward": 3.957027792930603,
"reward_std": 0.008546661585569382,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9570277333259583,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 604
},
{
"completion_length": 219.34375,
"epoch": 1.9344000000000001,
"grad_norm": 1.0413298606872559,
"kl": 0.104736328125,
"learning_rate": 2.45e-07,
"loss": 0.0011,
"reward": 3.9702824354171753,
"reward_std": 0.007483657216653228,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9702823162078857,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 605
},
{
"completion_length": 157.46875,
"epoch": 1.9376,
"grad_norm": 2.432849645614624,
"kl": 0.14453125,
"learning_rate": 2.4375e-07,
"loss": 0.0014,
"reward": 3.957343101501465,
"reward_std": 0.005332180997356772,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.957624614238739,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997184872627258,
"step": 606
},
{
"completion_length": 248.40625,
"epoch": 1.9407999999999999,
"grad_norm": 0.8216654062271118,
"kl": 0.071044921875,
"learning_rate": 2.425e-07,
"loss": 0.0007,
"reward": 3.9644582271575928,
"reward_std": 0.01216787239536643,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9688305556774139,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998031497001648,
"step": 607
},
{
"completion_length": 218.625,
"epoch": 1.944,
"grad_norm": 0.9195014834403992,
"kl": 0.0545654296875,
"learning_rate": 2.4124999999999997e-07,
"loss": 0.0005,
"reward": 3.972040057182312,
"reward_std": 0.004315207479521632,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9726911783218384,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993489682674408,
"step": 608
},
{
"completion_length": 231.84375,
"epoch": 1.9472,
"grad_norm": 1.3564932346343994,
"kl": 0.06103515625,
"learning_rate": 2.4e-07,
"loss": 0.0006,
"reward": 3.951057553291321,
"reward_std": 0.013061597011983395,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.9553851187229156,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993489682674408,
"step": 609
},
{
"completion_length": 241.4375,
"epoch": 1.9504000000000001,
"grad_norm": 0.9419238567352295,
"kl": 0.051513671875,
"learning_rate": 2.3875e-07,
"loss": 0.0005,
"reward": 3.971252202987671,
"reward_std": 0.006067809648811817,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9715149104595184,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997373819351196,
"step": 610
},
{
"completion_length": 222.09375,
"epoch": 1.9536,
"grad_norm": 1.4854899644851685,
"kl": 0.166748046875,
"learning_rate": 2.3749999999999998e-07,
"loss": 0.0017,
"reward": 3.9489357471466064,
"reward_std": 0.012118924409151077,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.948935866355896,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 611
},
{
"completion_length": 259.8125,
"epoch": 1.9567999999999999,
"grad_norm": 2.2286458015441895,
"kl": 0.0426025390625,
"learning_rate": 2.3625e-07,
"loss": 0.0004,
"reward": 3.96254563331604,
"reward_std": 0.005056597990915179,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9625457525253296,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 612
},
{
"completion_length": 209.4375,
"epoch": 1.96,
"grad_norm": 4.077661514282227,
"kl": 0.05615234375,
"learning_rate": 2.3499999999999997e-07,
"loss": 0.0006,
"reward": 3.941632628440857,
"reward_std": 0.01233140891417861,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9416325688362122,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 613
},
{
"completion_length": 221.71875,
"epoch": 1.9632,
"grad_norm": 0.7665371298789978,
"kl": 0.0555419921875,
"learning_rate": 2.3375e-07,
"loss": 0.0005,
"reward": 3.9698644876480103,
"reward_std": 0.009979546128306538,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.973064661026001,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996408224105835,
"step": 614
},
{
"completion_length": 219.875,
"epoch": 1.9664000000000001,
"grad_norm": 2.4666738510131836,
"kl": 0.0546875,
"learning_rate": 2.325e-07,
"loss": 0.0005,
"reward": 3.9548712968826294,
"reward_std": 0.011192699894309044,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9553521871566772,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995192289352417,
"step": 615
},
{
"completion_length": 235.0,
"epoch": 1.9696,
"grad_norm": 1.5382620096206665,
"kl": 0.044921875,
"learning_rate": 2.3125e-07,
"loss": 0.0005,
"reward": 3.9565550088882446,
"reward_std": 0.008881408954039216,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9740456640720367,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9825093150138855,
"step": 616
},
{
"completion_length": 141.09375,
"epoch": 1.9727999999999999,
"grad_norm": 2.0756258964538574,
"kl": 0.0631103515625,
"learning_rate": 2.3e-07,
"loss": 0.0006,
"reward": 3.9571491479873657,
"reward_std": 0.005044124089181423,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.980070561170578,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9770786762237549,
"step": 617
},
{
"completion_length": 222.46875,
"epoch": 1.976,
"grad_norm": 5.071360111236572,
"kl": 0.075927734375,
"learning_rate": 2.2875e-07,
"loss": 0.0008,
"reward": 3.8557703495025635,
"reward_std": 0.06493359804153442,
"rewards/answer_entity_reward": 0.9847027957439423,
"rewards/answer_wer_reward": 0.9706770181655884,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.900390625,
"step": 618
},
{
"completion_length": 231.125,
"epoch": 1.9792,
"grad_norm": 1.0749843120574951,
"kl": 0.050537109375,
"learning_rate": 2.275e-07,
"loss": 0.0005,
"reward": 3.9660208225250244,
"reward_std": 0.0037171735893934965,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9660208523273468,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 619
},
{
"completion_length": 252.625,
"epoch": 1.9824000000000002,
"grad_norm": 1.5367364883422852,
"kl": 0.070068359375,
"learning_rate": 2.2625e-07,
"loss": 0.0007,
"reward": 3.946213126182556,
"reward_std": 0.01816728012636304,
"rewards/answer_entity_reward": 0.9867424070835114,
"rewards/answer_wer_reward": 0.9616928696632385,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977777898311615,
"step": 620
},
{
"completion_length": 239.34375,
"epoch": 1.9856,
"grad_norm": 2.541694164276123,
"kl": 0.142578125,
"learning_rate": 2.25e-07,
"loss": 0.0014,
"reward": 3.947938561439514,
"reward_std": 0.009988004341721535,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9479385614395142,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 621
},
{
"completion_length": 224.65625,
"epoch": 1.9888,
"grad_norm": 1.3821133375167847,
"kl": 0.075927734375,
"learning_rate": 2.2375e-07,
"loss": 0.0007,
"reward": 3.953581690788269,
"reward_std": 0.006479294504970312,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.953581839799881,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 622
},
{
"completion_length": 206.3125,
"epoch": 1.992,
"grad_norm": 1.0023412704467773,
"kl": 0.13232421875,
"learning_rate": 2.225e-07,
"loss": 0.0013,
"reward": 3.8949310779571533,
"reward_std": 0.006026371265761554,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9634793996810913,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9314516186714172,
"step": 623
},
{
"completion_length": 179.96875,
"epoch": 1.9952,
"grad_norm": 1.534476637840271,
"kl": 0.078125,
"learning_rate": 2.2125e-07,
"loss": 0.0008,
"reward": 3.966533660888672,
"reward_std": 0.008991609327495098,
"rewards/answer_entity_reward": 0.9950658082962036,
"rewards/answer_wer_reward": 0.9756669104099274,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9958009123802185,
"step": 624
},
{
"completion_length": 232.75,
"epoch": 1.9984,
"grad_norm": 0.7324752807617188,
"kl": 0.0499267578125,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0005,
"reward": 3.946596384048462,
"reward_std": 0.011123172473162413,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9492979049682617,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997023940086365,
"step": 625
},
{
"completion_length": 176.0625,
"epoch": 2.0,
"grad_norm": 0.33141908049583435,
"kl": 0.06005859375,
"learning_rate": 2.1875e-07,
"loss": 0.0003,
"reward": 3.9717535972595215,
"reward_std": 0.012056672014296055,
"rewards/answer_entity_reward": 0.9963235259056091,
"rewards/answer_wer_reward": 0.975429892539978,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 626
},
{
"completion_length": 232.21875,
"epoch": 2.0032,
"grad_norm": 0.8334391117095947,
"kl": 0.0457763671875,
"learning_rate": 2.1749999999999998e-07,
"loss": 0.0004,
"reward": 3.970544457435608,
"reward_std": 0.003736199578270316,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9705445766448975,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 627
},
{
"completion_length": 173.375,
"epoch": 2.0064,
"grad_norm": 0.965114951133728,
"kl": 0.067626953125,
"learning_rate": 2.1625e-07,
"loss": 0.0007,
"reward": 3.974756956100464,
"reward_std": 0.004756669281050563,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9788074791431427,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9959495067596436,
"step": 628
},
{
"completion_length": 222.15625,
"epoch": 2.0096,
"grad_norm": 2.102520227432251,
"kl": 0.0474853515625,
"learning_rate": 2.1499999999999998e-07,
"loss": 0.0005,
"reward": 3.938779830932617,
"reward_std": 0.01813220279291272,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9791045486927032,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9596752524375916,
"step": 629
},
{
"completion_length": 206.40625,
"epoch": 2.0128,
"grad_norm": 1.3867822885513306,
"kl": 0.095458984375,
"learning_rate": 2.1375e-07,
"loss": 0.001,
"reward": 3.977003812789917,
"reward_std": 0.003467106493189931,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9772301912307739,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997735619544983,
"step": 630
},
{
"completion_length": 237.625,
"epoch": 2.016,
"grad_norm": 1.2721437215805054,
"kl": 0.0576171875,
"learning_rate": 2.1249999999999998e-07,
"loss": 0.0006,
"reward": 3.96044921875,
"reward_std": 0.007887857500463724,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9609974026679993,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999451756477356,
"step": 631
},
{
"completion_length": 190.65625,
"epoch": 2.0192,
"grad_norm": 1.6940927505493164,
"kl": 0.170166015625,
"learning_rate": 2.1125e-07,
"loss": 0.0017,
"reward": 3.92085862159729,
"reward_std": 0.012093114666640759,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9635953307151794,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9572633504867554,
"step": 632
},
{
"completion_length": 213.75,
"epoch": 2.0224,
"grad_norm": 1.3798060417175293,
"kl": 0.0552978515625,
"learning_rate": 2.0999999999999997e-07,
"loss": 0.0006,
"reward": 3.9467806816101074,
"reward_std": 0.00452708825469017,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9470699727535248,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997106492519379,
"step": 633
},
{
"completion_length": 193.5625,
"epoch": 2.0256,
"grad_norm": 1.5375889539718628,
"kl": 0.046875,
"learning_rate": 2.0874999999999999e-07,
"loss": 0.0005,
"reward": 3.9730241298675537,
"reward_std": 0.006102013634517789,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9743154048919678,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987086653709412,
"step": 634
},
{
"completion_length": 204.09375,
"epoch": 2.0288,
"grad_norm": 1.0933163166046143,
"kl": 0.09228515625,
"learning_rate": 2.0749999999999997e-07,
"loss": 0.0009,
"reward": 3.9593019485473633,
"reward_std": 0.008372287498787045,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9602685272693634,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999033510684967,
"step": 635
},
{
"completion_length": 186.875,
"epoch": 2.032,
"grad_norm": 3.5551085472106934,
"kl": 0.085205078125,
"learning_rate": 2.0624999999999998e-07,
"loss": 0.0008,
"reward": 3.937085270881653,
"reward_std": 0.028064538724720478,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9683353006839752,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9687500298023224,
"step": 636
},
{
"completion_length": 228.875,
"epoch": 2.0352,
"grad_norm": 0.9865986108779907,
"kl": 0.0728759765625,
"learning_rate": 2.0499999999999997e-07,
"loss": 0.0007,
"reward": 3.9492111206054688,
"reward_std": 0.007756081875413656,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9575444757938385,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 637
},
{
"completion_length": 212.28125,
"epoch": 2.0384,
"grad_norm": 3.542672872543335,
"kl": 0.110107421875,
"learning_rate": 2.0374999999999998e-07,
"loss": 0.0011,
"reward": 3.9374581575393677,
"reward_std": 0.009235690347850323,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9742424190044403,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9652990996837616,
"step": 638
},
{
"completion_length": 232.0,
"epoch": 2.0416,
"grad_norm": 1.4940472841262817,
"kl": 0.0565185546875,
"learning_rate": 2.025e-07,
"loss": 0.0006,
"reward": 3.947740077972412,
"reward_std": 0.006069941911846399,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9616289734840393,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 639
},
{
"completion_length": 214.46875,
"epoch": 2.0448,
"grad_norm": 1.0322229862213135,
"kl": 0.0865478515625,
"learning_rate": 2.0125e-07,
"loss": 0.0009,
"reward": 3.973870038986206,
"reward_std": 0.005974382860586047,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9738699197769165,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 640
},
{
"completion_length": 175.71875,
"epoch": 2.048,
"grad_norm": 2.1991164684295654,
"kl": 0.0986328125,
"learning_rate": 2e-07,
"loss": 0.001,
"reward": 3.9478849172592163,
"reward_std": 0.012253349646925926,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9485794901847839,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993055462837219,
"step": 641
},
{
"completion_length": 202.3125,
"epoch": 2.0512,
"grad_norm": 2.254936456680298,
"kl": 0.0758056640625,
"learning_rate": 1.9875e-07,
"loss": 0.0008,
"reward": 3.9462071657180786,
"reward_std": 0.007457165978848934,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9462071061134338,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 642
},
{
"completion_length": 205.03125,
"epoch": 2.0544,
"grad_norm": 2.473928928375244,
"kl": 0.079345703125,
"learning_rate": 1.975e-07,
"loss": 0.0008,
"reward": 3.92992103099823,
"reward_std": 0.014722079504281282,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9436539113521576,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9919487535953522,
"step": 643
},
{
"completion_length": 202.3125,
"epoch": 2.0576,
"grad_norm": 1.5329126119613647,
"kl": 0.03643798828125,
"learning_rate": 1.9625e-07,
"loss": 0.0004,
"reward": 3.944863796234131,
"reward_std": 0.006489667110145092,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9667904078960419,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9780733287334442,
"step": 644
},
{
"completion_length": 202.53125,
"epoch": 2.0608,
"grad_norm": 0.6484522223472595,
"kl": 0.04443359375,
"learning_rate": 1.9499999999999999e-07,
"loss": 0.0004,
"reward": 3.975989580154419,
"reward_std": 0.0032934267073869705,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9759896695613861,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 645
},
{
"completion_length": 248.65625,
"epoch": 2.064,
"grad_norm": 3.43375301361084,
"kl": 0.0609130859375,
"learning_rate": 1.9375e-07,
"loss": 0.0006,
"reward": 3.952019691467285,
"reward_std": 0.010596145410090685,
"rewards/answer_entity_reward": 0.9983552694320679,
"rewards/answer_wer_reward": 0.9558849632740021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9977796375751495,
"step": 646
},
{
"completion_length": 209.40625,
"epoch": 2.0672,
"grad_norm": 1.1015528440475464,
"kl": 0.057373046875,
"learning_rate": 1.9249999999999998e-07,
"loss": 0.0006,
"reward": 3.9535114765167236,
"reward_std": 0.0073295624461025,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9535112977027893,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 647
},
{
"completion_length": 247.15625,
"epoch": 2.0704,
"grad_norm": 5.493063449859619,
"kl": 0.052490234375,
"learning_rate": 1.9125e-07,
"loss": 0.0005,
"reward": 3.959768056869507,
"reward_std": 0.009880491998046637,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9597680270671844,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 648
},
{
"completion_length": 190.3125,
"epoch": 2.0736,
"grad_norm": 3.042928457260132,
"kl": 0.070556640625,
"learning_rate": 1.8999999999999998e-07,
"loss": 0.0007,
"reward": 3.935302972793579,
"reward_std": 0.008418679004535079,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9707636535167694,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9784283638000488,
"step": 649
},
{
"completion_length": 240.1875,
"epoch": 2.0768,
"grad_norm": 1.1801666021347046,
"kl": 0.068359375,
"learning_rate": 1.8875e-07,
"loss": 0.0007,
"reward": 3.944392442703247,
"reward_std": 0.008859490510076284,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9443924725055695,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 650
},
{
"completion_length": 212.0625,
"epoch": 2.08,
"grad_norm": 1.1967086791992188,
"kl": 0.072021484375,
"learning_rate": 1.875e-07,
"loss": 0.0007,
"reward": 3.96494197845459,
"reward_std": 0.011900570709258318,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9674758613109589,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9974662065505981,
"step": 651
},
{
"completion_length": 179.90625,
"epoch": 2.0832,
"grad_norm": 2.0556278228759766,
"kl": 0.056640625,
"learning_rate": 1.8625e-07,
"loss": 0.0006,
"reward": 3.925339102745056,
"reward_std": 0.005963671952486038,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9453259110450745,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9800131618976593,
"step": 652
},
{
"completion_length": 232.1875,
"epoch": 2.0864,
"grad_norm": 1.1875349283218384,
"kl": 0.076171875,
"learning_rate": 1.85e-07,
"loss": 0.0008,
"reward": 3.9718481302261353,
"reward_std": 0.01158686971757561,
"rewards/answer_entity_reward": 0.9955128133296967,
"rewards/answer_wer_reward": 0.9763352572917938,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 653
},
{
"completion_length": 222.65625,
"epoch": 2.0896,
"grad_norm": 2.1682872772216797,
"kl": 0.09423828125,
"learning_rate": 1.8375e-07,
"loss": 0.0009,
"reward": 3.94124174118042,
"reward_std": 0.008590340381488204,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9508572518825531,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 654
},
{
"completion_length": 173.03125,
"epoch": 2.0928,
"grad_norm": 2.1240601539611816,
"kl": 0.066162109375,
"learning_rate": 1.825e-07,
"loss": 0.0007,
"reward": 3.9930202960968018,
"reward_std": 0.0026576630771160126,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9934512376785278,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9995689690113068,
"step": 655
},
{
"completion_length": 177.09375,
"epoch": 2.096,
"grad_norm": 4.589439868927002,
"kl": 0.083984375,
"learning_rate": 1.8124999999999999e-07,
"loss": 0.0008,
"reward": 3.7905973196029663,
"reward_std": 0.05029802396893501,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9605589509010315,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8300382792949677,
"step": 656
},
{
"completion_length": 182.5,
"epoch": 2.0992,
"grad_norm": 2.9955060482025146,
"kl": 0.0601806640625,
"learning_rate": 1.8e-07,
"loss": 0.0006,
"reward": 3.959343194961548,
"reward_std": 0.010165283223614097,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9634606242179871,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.995882511138916,
"step": 657
},
{
"completion_length": 247.53125,
"epoch": 2.1024,
"grad_norm": 6.366602897644043,
"kl": 0.2166748046875,
"learning_rate": 1.7874999999999998e-07,
"loss": 0.0022,
"reward": 3.95376193523407,
"reward_std": 0.007726241368800402,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9633772671222687,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 658
},
{
"completion_length": 212.8125,
"epoch": 2.1056,
"grad_norm": 1.1973211765289307,
"kl": 0.0445556640625,
"learning_rate": 1.775e-07,
"loss": 0.0004,
"reward": 3.979708194732666,
"reward_std": 0.007615833543241024,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9800336956977844,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996744692325592,
"step": 659
},
{
"completion_length": 244.65625,
"epoch": 2.1088,
"grad_norm": 1.237342357635498,
"kl": 0.063232421875,
"learning_rate": 1.7624999999999998e-07,
"loss": 0.0006,
"reward": 3.9267531633377075,
"reward_std": 0.01262162160128355,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.937911719083786,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984567761421204,
"step": 660
},
{
"completion_length": 211.46875,
"epoch": 2.112,
"grad_norm": 1.6842882633209229,
"kl": 0.0623779296875,
"learning_rate": 1.75e-07,
"loss": 0.0006,
"reward": 3.9610049724578857,
"reward_std": 0.008832846768200397,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9619665145874023,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990384578704834,
"step": 661
},
{
"completion_length": 208.6875,
"epoch": 2.1152,
"grad_norm": 1.8498320579528809,
"kl": 0.0687255859375,
"learning_rate": 1.7374999999999998e-07,
"loss": 0.0007,
"reward": 3.908181667327881,
"reward_std": 0.05270358338020742,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9462520182132721,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9654017686843872,
"step": 662
},
{
"completion_length": 220.15625,
"epoch": 2.1184,
"grad_norm": 1.3248109817504883,
"kl": 0.0576171875,
"learning_rate": 1.725e-07,
"loss": 0.0006,
"reward": 3.977890729904175,
"reward_std": 0.0048680840991437435,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9778908789157867,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 663
},
{
"completion_length": 203.125,
"epoch": 2.1216,
"grad_norm": 1.2837951183319092,
"kl": 0.0660400390625,
"learning_rate": 1.7125e-07,
"loss": 0.0007,
"reward": 3.951757311820984,
"reward_std": 0.01306973909959197,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9517573118209839,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 664
},
{
"completion_length": 234.71875,
"epoch": 2.1248,
"grad_norm": 1.2517513036727905,
"kl": 0.072265625,
"learning_rate": 1.7000000000000001e-07,
"loss": 0.0007,
"reward": 3.932037830352783,
"reward_std": 0.018653371836990118,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9320378601551056,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 665
},
{
"completion_length": 154.4375,
"epoch": 2.128,
"grad_norm": 1.6812143325805664,
"kl": 0.057373046875,
"learning_rate": 1.6875e-07,
"loss": 0.0006,
"reward": 3.933722972869873,
"reward_std": 0.004374760144855827,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9603091180324554,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9817472994327545,
"step": 666
},
{
"completion_length": 194.375,
"epoch": 2.1312,
"grad_norm": 1.1369833946228027,
"kl": 0.10205078125,
"learning_rate": 1.675e-07,
"loss": 0.001,
"reward": 3.948467254638672,
"reward_std": 0.013669541105628014,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9588838517665863,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 667
},
{
"completion_length": 222.40625,
"epoch": 2.1344,
"grad_norm": 1.289441466331482,
"kl": 0.09716796875,
"learning_rate": 1.6625e-07,
"loss": 0.001,
"reward": 3.938557267189026,
"reward_std": 0.005478785838931799,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9577881693840027,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9807692170143127,
"step": 668
},
{
"completion_length": 185.71875,
"epoch": 2.1376,
"grad_norm": 1.9890272617340088,
"kl": 0.084716796875,
"learning_rate": 1.65e-07,
"loss": 0.0008,
"reward": 3.967849016189575,
"reward_std": 0.008760316297411919,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.967848926782608,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 669
},
{
"completion_length": 248.46875,
"epoch": 2.1408,
"grad_norm": 1.1813039779663086,
"kl": 0.074462890625,
"learning_rate": 1.6375e-07,
"loss": 0.0007,
"reward": 3.8907772302627563,
"reward_std": 0.07307082694023848,
"rewards/answer_entity_reward": 0.9749999940395355,
"rewards/answer_wer_reward": 0.915777176618576,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 670
},
{
"completion_length": 204.09375,
"epoch": 2.144,
"grad_norm": 1.4091624021530151,
"kl": 0.079833984375,
"learning_rate": 1.625e-07,
"loss": 0.0008,
"reward": 3.9357553720474243,
"reward_std": 0.018585966899991035,
"rewards/answer_entity_reward": 0.9924799501895905,
"rewards/answer_wer_reward": 0.9553823173046112,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9878930449485779,
"step": 671
},
{
"completion_length": 204.15625,
"epoch": 2.1471999999999998,
"grad_norm": 1.9349714517593384,
"kl": 0.0614013671875,
"learning_rate": 1.6125e-07,
"loss": 0.0006,
"reward": 3.963050127029419,
"reward_std": 0.011341096367686987,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9657188355922699,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997351765632629,
"step": 672
},
{
"completion_length": 183.1875,
"epoch": 2.1504,
"grad_norm": 3.866070508956909,
"kl": 0.1171875,
"learning_rate": 1.6e-07,
"loss": 0.0012,
"reward": 3.778456449508667,
"reward_std": 0.1051805429160595,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9563734233379364,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8244869709014893,
"step": 673
},
{
"completion_length": 237.875,
"epoch": 2.1536,
"grad_norm": 1.3984158039093018,
"kl": 0.0478515625,
"learning_rate": 1.5875e-07,
"loss": 0.0005,
"reward": 3.9681609869003296,
"reward_std": 0.007229159120470285,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9706325232982635,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993667006492615,
"step": 674
},
{
"completion_length": 201.9375,
"epoch": 2.1568,
"grad_norm": 4.475615501403809,
"kl": 0.06640625,
"learning_rate": 1.575e-07,
"loss": 0.0007,
"reward": 3.8558905124664307,
"reward_std": 0.0662167351692915,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9515935778617859,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.904296875,
"step": 675
},
{
"completion_length": 199.59375,
"epoch": 2.16,
"grad_norm": 1.3850592374801636,
"kl": 0.042236328125,
"learning_rate": 1.5624999999999999e-07,
"loss": 0.0004,
"reward": 3.9729303121566772,
"reward_std": 0.01144796540029347,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9750137031078339,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 676
},
{
"completion_length": 198.8125,
"epoch": 2.1632,
"grad_norm": 0.8988875150680542,
"kl": 0.0848388671875,
"learning_rate": 1.55e-07,
"loss": 0.0008,
"reward": 3.9634130001068115,
"reward_std": 0.016308533609844744,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.969995379447937,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996675550937653,
"step": 677
},
{
"completion_length": 242.0,
"epoch": 2.1664,
"grad_norm": 0.886544406414032,
"kl": 0.057861328125,
"learning_rate": 1.5374999999999998e-07,
"loss": 0.0006,
"reward": 3.9666435718536377,
"reward_std": 0.009206962306052446,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9666436016559601,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 678
},
{
"completion_length": 208.09375,
"epoch": 2.1696,
"grad_norm": 1.2104874849319458,
"kl": 0.0665283203125,
"learning_rate": 1.525e-07,
"loss": 0.0007,
"reward": 3.956413745880127,
"reward_std": 0.008385751629248261,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9564136564731598,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 679
},
{
"completion_length": 205.65625,
"epoch": 2.1728,
"grad_norm": 1.4340012073516846,
"kl": 0.0653076171875,
"learning_rate": 1.5124999999999998e-07,
"loss": 0.0007,
"reward": 3.9660589694976807,
"reward_std": 0.007518206490203738,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9666839838027954,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993749856948853,
"step": 680
},
{
"completion_length": 243.875,
"epoch": 2.176,
"grad_norm": 2.6693804264068604,
"kl": 0.0611572265625,
"learning_rate": 1.5e-07,
"loss": 0.0006,
"reward": 3.9342352151870728,
"reward_std": 0.0278960638679564,
"rewards/answer_entity_reward": 0.9851190745830536,
"rewards/answer_wer_reward": 0.9509375989437103,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9981784820556641,
"step": 681
},
{
"completion_length": 247.875,
"epoch": 2.1792,
"grad_norm": 0.978139340877533,
"kl": 0.050537109375,
"learning_rate": 1.4874999999999998e-07,
"loss": 0.0005,
"reward": 3.9769967794418335,
"reward_std": 0.006702936254441738,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9769968390464783,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 682
},
{
"completion_length": 222.5625,
"epoch": 2.1824,
"grad_norm": 1.382318139076233,
"kl": 0.065185546875,
"learning_rate": 1.475e-07,
"loss": 0.0007,
"reward": 3.9492597579956055,
"reward_std": 0.008544785436242819,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9507622122764587,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984976053237915,
"step": 683
},
{
"completion_length": 219.71875,
"epoch": 2.1856,
"grad_norm": 2.196531057357788,
"kl": 0.0595703125,
"learning_rate": 1.4624999999999998e-07,
"loss": 0.0006,
"reward": 3.9446985721588135,
"reward_std": 0.014558171853423119,
"rewards/answer_entity_reward": 0.9813033938407898,
"rewards/answer_wer_reward": 0.9633950591087341,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 684
},
{
"completion_length": 219.5,
"epoch": 2.1888,
"grad_norm": 1.4868621826171875,
"kl": 0.07177734375,
"learning_rate": 1.45e-07,
"loss": 0.0007,
"reward": 3.9446860551834106,
"reward_std": 0.010166772175580263,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9451901018619537,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994959831237793,
"step": 685
},
{
"completion_length": 261.59375,
"epoch": 2.192,
"grad_norm": 0.8591821789741516,
"kl": 0.0595703125,
"learning_rate": 1.4374999999999997e-07,
"loss": 0.0006,
"reward": 3.9277877807617188,
"reward_std": 0.010211648885160685,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.930150032043457,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9976378083229065,
"step": 686
},
{
"completion_length": 205.0625,
"epoch": 2.1952,
"grad_norm": 0.924826443195343,
"kl": 0.0703125,
"learning_rate": 1.4249999999999999e-07,
"loss": 0.0007,
"reward": 3.9727468490600586,
"reward_std": 0.006501165917143226,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.972746878862381,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 687
},
{
"completion_length": 197.625,
"epoch": 2.1984,
"grad_norm": 1.508520483970642,
"kl": 0.092041015625,
"learning_rate": 1.4124999999999997e-07,
"loss": 0.0009,
"reward": 3.9627835750579834,
"reward_std": 0.010947544127702713,
"rewards/answer_entity_reward": 0.9930555522441864,
"rewards/answer_wer_reward": 0.9707047045230865,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9990234375,
"step": 688
},
{
"completion_length": 205.09375,
"epoch": 2.2016,
"grad_norm": 2.3478713035583496,
"kl": 0.0712890625,
"learning_rate": 1.4e-07,
"loss": 0.0007,
"reward": 3.933359384536743,
"reward_std": 0.008363787084817886,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9626152515411377,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9846329689025879,
"step": 689
},
{
"completion_length": 225.03125,
"epoch": 2.2048,
"grad_norm": 1.3916107416152954,
"kl": 0.058837890625,
"learning_rate": 1.3875e-07,
"loss": 0.0006,
"reward": 3.9732636213302612,
"reward_std": 0.009609260130673647,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9732636511325836,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 690
},
{
"completion_length": 152.59375,
"epoch": 2.208,
"grad_norm": 1.322786808013916,
"kl": 0.0557861328125,
"learning_rate": 1.375e-07,
"loss": 0.0006,
"reward": 3.8575568199157715,
"reward_std": 0.011282142717391253,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9596264958381653,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9003343284130096,
"step": 691
},
{
"completion_length": 162.71875,
"epoch": 2.2112,
"grad_norm": 0.7846171855926514,
"kl": 0.0657958984375,
"learning_rate": 1.3625e-07,
"loss": 0.0007,
"reward": 3.9684951305389404,
"reward_std": 0.013251218944787979,
"rewards/answer_entity_reward": 0.9910714328289032,
"rewards/answer_wer_reward": 0.9774238169193268,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 692
},
{
"completion_length": 207.65625,
"epoch": 2.2144,
"grad_norm": 1.7230638265609741,
"kl": 0.1243896484375,
"learning_rate": 1.35e-07,
"loss": 0.0012,
"reward": 3.9475139379501343,
"reward_std": 0.00949817756190896,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9486435055732727,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998870462179184,
"step": 693
},
{
"completion_length": 245.96875,
"epoch": 2.2176,
"grad_norm": 1.5247471332550049,
"kl": 0.061767578125,
"learning_rate": 1.3375e-07,
"loss": 0.0006,
"reward": 3.947926878929138,
"reward_std": 0.014066703617572784,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9513991177082062,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 694
},
{
"completion_length": 222.5625,
"epoch": 2.2208,
"grad_norm": 1.5721601247787476,
"kl": 0.0782470703125,
"learning_rate": 1.325e-07,
"loss": 0.0008,
"reward": 3.903387188911438,
"reward_std": 0.005873196758329868,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9640650153160095,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9393221139907837,
"step": 695
},
{
"completion_length": 187.03125,
"epoch": 2.224,
"grad_norm": 1.1470870971679688,
"kl": 0.0457763671875,
"learning_rate": 1.3125e-07,
"loss": 0.0005,
"reward": 3.9857735633850098,
"reward_std": 0.003898413386195898,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9857736229896545,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 696
},
{
"completion_length": 202.84375,
"epoch": 2.2272,
"grad_norm": 2.00569486618042,
"kl": 0.077392578125,
"learning_rate": 1.3e-07,
"loss": 0.0008,
"reward": 3.9439765214920044,
"reward_std": 0.00677294097840786,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9666953980922699,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9772810935974121,
"step": 697
},
{
"completion_length": 200.875,
"epoch": 2.2304,
"grad_norm": 0.5203324556350708,
"kl": 0.0533447265625,
"learning_rate": 1.2874999999999998e-07,
"loss": 0.0005,
"reward": 3.981989622116089,
"reward_std": 0.003249130444601178,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9819895327091217,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 698
},
{
"completion_length": 229.5,
"epoch": 2.2336,
"grad_norm": 1.028457760810852,
"kl": 0.0615234375,
"learning_rate": 1.275e-07,
"loss": 0.0006,
"reward": 3.9699747562408447,
"reward_std": 0.007223621942102909,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9699748456478119,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 699
},
{
"completion_length": 183.375,
"epoch": 2.2368,
"grad_norm": 1.1010169982910156,
"kl": 0.09619140625,
"learning_rate": 1.2624999999999998e-07,
"loss": 0.001,
"reward": 3.9709969758987427,
"reward_std": 0.013876417418941855,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9763848185539246,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.994612067937851,
"step": 700
},
{
"completion_length": 192.6875,
"epoch": 2.24,
"grad_norm": 1.9254510402679443,
"kl": 0.126708984375,
"learning_rate": 1.25e-07,
"loss": 0.0013,
"reward": 3.9508676528930664,
"reward_std": 0.007698251400142908,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9661648571491241,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9943181872367859,
"step": 701
},
{
"completion_length": 206.6875,
"epoch": 2.2432,
"grad_norm": 4.035684108734131,
"kl": 0.04833984375,
"learning_rate": 1.2375e-07,
"loss": 0.0005,
"reward": 3.9621732234954834,
"reward_std": 0.007325239945203066,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.977934330701828,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9842387735843658,
"step": 702
},
{
"completion_length": 240.78125,
"epoch": 2.2464,
"grad_norm": 1.4605140686035156,
"kl": 0.0582275390625,
"learning_rate": 1.225e-07,
"loss": 0.0006,
"reward": 3.951379179954529,
"reward_std": 0.005893495166674256,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9543100893497467,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970689713954926,
"step": 703
},
{
"completion_length": 190.84375,
"epoch": 2.2496,
"grad_norm": 0.8877372741699219,
"kl": 0.064453125,
"learning_rate": 1.2125e-07,
"loss": 0.0007,
"reward": 3.9827821254730225,
"reward_std": 0.003501511411741376,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.983114629983902,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996675550937653,
"step": 704
},
{
"completion_length": 169.875,
"epoch": 2.2528,
"grad_norm": 4.669096946716309,
"kl": 0.0634765625,
"learning_rate": 1.2e-07,
"loss": 0.0006,
"reward": 3.9501044750213623,
"reward_std": 0.00536915916018188,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9683522582054138,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9817522466182709,
"step": 705
},
{
"completion_length": 208.34375,
"epoch": 2.2560000000000002,
"grad_norm": 2.4436697959899902,
"kl": 0.072998046875,
"learning_rate": 1.1874999999999999e-07,
"loss": 0.0007,
"reward": 3.95159912109375,
"reward_std": 0.012246299302205443,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9697677791118622,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9818313717842102,
"step": 706
},
{
"completion_length": 253.34375,
"epoch": 2.2592,
"grad_norm": 0.6258556842803955,
"kl": 0.0625,
"learning_rate": 1.1749999999999999e-07,
"loss": 0.0006,
"reward": 3.943672776222229,
"reward_std": 0.004726027720607817,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9436727464199066,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 707
},
{
"completion_length": 187.96875,
"epoch": 2.2624,
"grad_norm": 2.1608188152313232,
"kl": 0.09521484375,
"learning_rate": 1.1625e-07,
"loss": 0.0009,
"reward": 3.9321788549423218,
"reward_std": 0.021823766641318798,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9405494034290314,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99609375,
"step": 708
},
{
"completion_length": 201.0625,
"epoch": 2.2656,
"grad_norm": 5.012310028076172,
"kl": 0.04071044921875,
"learning_rate": 1.15e-07,
"loss": 0.0004,
"reward": 3.9624879360198975,
"reward_std": 0.01549163879826665,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9760953187942505,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.989864856004715,
"step": 709
},
{
"completion_length": 238.71875,
"epoch": 2.2688,
"grad_norm": 1.1021510362625122,
"kl": 0.08154296875,
"learning_rate": 1.1375e-07,
"loss": 0.0008,
"reward": 3.9332664012908936,
"reward_std": 0.015113649424165487,
"rewards/answer_entity_reward": 0.9832701981067657,
"rewards/answer_wer_reward": 0.9499962031841278,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 710
},
{
"completion_length": 220.90625,
"epoch": 2.2720000000000002,
"grad_norm": 1.1716574430465698,
"kl": 0.053466796875,
"learning_rate": 1.125e-07,
"loss": 0.0005,
"reward": 3.9751139879226685,
"reward_std": 0.007001735270023346,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9751139879226685,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 711
},
{
"completion_length": 241.40625,
"epoch": 2.2752,
"grad_norm": 1.469359278678894,
"kl": 0.07275390625,
"learning_rate": 1.1125e-07,
"loss": 0.0007,
"reward": 3.898247718811035,
"reward_std": 0.039173625875264406,
"rewards/answer_entity_reward": 0.984375,
"rewards/answer_wer_reward": 0.9162905812263489,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9975819885730743,
"step": 712
},
{
"completion_length": 205.25,
"epoch": 2.2784,
"grad_norm": 0.7749589085578918,
"kl": 0.0621337890625,
"learning_rate": 1.0999999999999999e-07,
"loss": 0.0006,
"reward": 3.9739962816238403,
"reward_std": 0.0056007420644164085,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9743727445602417,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996234774589539,
"step": 713
},
{
"completion_length": 206.125,
"epoch": 2.2816,
"grad_norm": 0.5464848875999451,
"kl": 0.04901123046875,
"learning_rate": 1.0874999999999999e-07,
"loss": 0.0005,
"reward": 3.95177161693573,
"reward_std": 0.004434725036844611,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9615707993507385,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9998161792755127,
"step": 714
},
{
"completion_length": 169.09375,
"epoch": 2.2848,
"grad_norm": 3.133605480194092,
"kl": 0.06689453125,
"learning_rate": 1.0749999999999999e-07,
"loss": 0.0007,
"reward": 3.929832339286804,
"reward_std": 0.01732827629894018,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9689165651798248,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.960915744304657,
"step": 715
},
{
"completion_length": 204.59375,
"epoch": 2.288,
"grad_norm": 0.7156680822372437,
"kl": 0.06884765625,
"learning_rate": 1.0624999999999999e-07,
"loss": 0.0007,
"reward": 3.976062059402466,
"reward_std": 0.0025083101354539394,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9836839437484741,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9923780560493469,
"step": 716
},
{
"completion_length": 210.59375,
"epoch": 2.2912,
"grad_norm": 284.2210998535156,
"kl": 0.1416015625,
"learning_rate": 1.0499999999999999e-07,
"loss": 0.0014,
"reward": 3.9028064012527466,
"reward_std": 0.016830324195325375,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9614686369895935,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9413377344608307,
"step": 717
},
{
"completion_length": 232.53125,
"epoch": 2.2944,
"grad_norm": 1.077739953994751,
"kl": 0.08544921875,
"learning_rate": 1.0374999999999999e-07,
"loss": 0.0009,
"reward": 3.9475821256637573,
"reward_std": 0.011592368595302105,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9478915929794312,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999690592288971,
"step": 718
},
{
"completion_length": 217.875,
"epoch": 2.2976,
"grad_norm": 2.2114531993865967,
"kl": 0.195068359375,
"learning_rate": 1.0249999999999998e-07,
"loss": 0.002,
"reward": 3.941352367401123,
"reward_std": 0.00652403780259192,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9605833292007446,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9807692170143127,
"step": 719
},
{
"completion_length": 241.875,
"epoch": 2.3008,
"grad_norm": 2.330026865005493,
"kl": 0.10693359375,
"learning_rate": 1.0125e-07,
"loss": 0.0011,
"reward": 3.8385108709335327,
"reward_std": 0.0217201872728765,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9317739605903625,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9067369103431702,
"step": 720
},
{
"completion_length": 148.15625,
"epoch": 2.304,
"grad_norm": 6.020991802215576,
"kl": 0.0804443359375,
"learning_rate": 1e-07,
"loss": 0.0008,
"reward": 3.9653271436691284,
"reward_std": 0.010471278452314436,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9677309989929199,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 721
},
{
"completion_length": 242.34375,
"epoch": 2.3072,
"grad_norm": 1.3827441930770874,
"kl": 0.0606689453125,
"learning_rate": 9.875e-08,
"loss": 0.0006,
"reward": 3.9477760791778564,
"reward_std": 0.017027822323143482,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9524165093898773,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988317787647247,
"step": 722
},
{
"completion_length": 182.875,
"epoch": 2.3104,
"grad_norm": 0.6132823824882507,
"kl": 0.0732421875,
"learning_rate": 9.749999999999999e-08,
"loss": 0.0007,
"reward": 3.9824774265289307,
"reward_std": 0.0017756590968929231,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9837089478969574,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9987684786319733,
"step": 723
},
{
"completion_length": 259.21875,
"epoch": 2.3136,
"grad_norm": 1.0919182300567627,
"kl": 0.052001953125,
"learning_rate": 9.624999999999999e-08,
"loss": 0.0005,
"reward": 3.9247913360595703,
"reward_std": 0.0157609935849905,
"rewards/answer_entity_reward": 0.9692307412624359,
"rewards/answer_wer_reward": 0.9555604159832001,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 724
},
{
"completion_length": 243.21875,
"epoch": 2.3168,
"grad_norm": 1.7886172533035278,
"kl": 0.04718017578125,
"learning_rate": 9.499999999999999e-08,
"loss": 0.0005,
"reward": 3.9662917852401733,
"reward_std": 0.005910404259338975,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9665379524230957,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997539520263672,
"step": 725
},
{
"completion_length": 201.65625,
"epoch": 2.32,
"grad_norm": 1.3444185256958008,
"kl": 0.0606689453125,
"learning_rate": 9.375e-08,
"loss": 0.0006,
"reward": 3.9709818363189697,
"reward_std": 0.00892023229971528,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9738226532936096,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 726
},
{
"completion_length": 224.1875,
"epoch": 2.3232,
"grad_norm": 4.107091426849365,
"kl": 0.229736328125,
"learning_rate": 9.25e-08,
"loss": 0.0023,
"reward": 3.9483840465545654,
"reward_std": 0.013201091904193163,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.955822080373764,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997735619544983,
"step": 727
},
{
"completion_length": 189.75,
"epoch": 2.3264,
"grad_norm": 1.512626051902771,
"kl": 0.0589599609375,
"learning_rate": 9.125e-08,
"loss": 0.0006,
"reward": 3.9542768001556396,
"reward_std": 0.008582692593336105,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9702657759189606,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.984011173248291,
"step": 728
},
{
"completion_length": 172.8125,
"epoch": 2.3296,
"grad_norm": 4.1475830078125,
"kl": 0.110107421875,
"learning_rate": 9e-08,
"loss": 0.0011,
"reward": 3.9462348222732544,
"reward_std": 0.009323009755462408,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9772224724292755,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9690123498439789,
"step": 729
},
{
"completion_length": 198.84375,
"epoch": 2.3327999999999998,
"grad_norm": 1.3541475534439087,
"kl": 0.045166015625,
"learning_rate": 8.875e-08,
"loss": 0.0005,
"reward": 3.9697635173797607,
"reward_std": 0.00771446293219924,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9707715511322021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989919364452362,
"step": 730
},
{
"completion_length": 217.71875,
"epoch": 2.336,
"grad_norm": 1.2064177989959717,
"kl": 0.05908203125,
"learning_rate": 8.75e-08,
"loss": 0.0006,
"reward": 3.9431110620498657,
"reward_std": 0.01243708049878478,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9462102055549622,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9969007968902588,
"step": 731
},
{
"completion_length": 208.0,
"epoch": 2.3392,
"grad_norm": 1.1856428384780884,
"kl": 0.048095703125,
"learning_rate": 8.625e-08,
"loss": 0.0005,
"reward": 3.955425500869751,
"reward_std": 0.013023892883211374,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9704216420650482,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9850037693977356,
"step": 732
},
{
"completion_length": 230.46875,
"epoch": 2.3424,
"grad_norm": 7.96836519241333,
"kl": 0.0765380859375,
"learning_rate": 8.500000000000001e-08,
"loss": 0.0008,
"reward": 3.8350234031677246,
"reward_std": 0.0071187918074429035,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9640994668006897,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.8709239065647125,
"step": 733
},
{
"completion_length": 240.46875,
"epoch": 2.3456,
"grad_norm": 1.9817602634429932,
"kl": 0.067138671875,
"learning_rate": 8.375e-08,
"loss": 0.0007,
"reward": 3.8598886728286743,
"reward_std": 0.009870891459286213,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9313421249389648,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9285465180873871,
"step": 734
},
{
"completion_length": 232.625,
"epoch": 2.3487999999999998,
"grad_norm": 1.4039250612258911,
"kl": 0.05126953125,
"learning_rate": 8.25e-08,
"loss": 0.0005,
"reward": 3.9484113454818726,
"reward_std": 0.011133690131828189,
"rewards/answer_entity_reward": 0.9971590936183929,
"rewards/answer_wer_reward": 0.9577626585960388,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9934895932674408,
"step": 735
},
{
"completion_length": 168.15625,
"epoch": 2.352,
"grad_norm": 0.8416581153869629,
"kl": 0.068359375,
"learning_rate": 8.125e-08,
"loss": 0.0007,
"reward": 3.9322515726089478,
"reward_std": 0.002792949788272381,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9946084916591644,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9376430511474609,
"step": 736
},
{
"completion_length": 232.375,
"epoch": 2.3552,
"grad_norm": 1.3709439039230347,
"kl": 0.068359375,
"learning_rate": 8e-08,
"loss": 0.0007,
"reward": 3.9093856811523438,
"reward_std": 0.0034298759419471025,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.971885621547699,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9375,
"step": 737
},
{
"completion_length": 201.15625,
"epoch": 2.3584,
"grad_norm": 0.9587724804878235,
"kl": 0.0657958984375,
"learning_rate": 7.875e-08,
"loss": 0.0007,
"reward": 3.960189461708069,
"reward_std": 0.017379604279994965,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9656778275966644,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9979838728904724,
"step": 738
},
{
"completion_length": 204.3125,
"epoch": 2.3616,
"grad_norm": 1.5729237794876099,
"kl": 0.075439453125,
"learning_rate": 7.75e-08,
"loss": 0.0007,
"reward": 3.9626389741897583,
"reward_std": 0.01823890022933483,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9661112725734711,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 739
},
{
"completion_length": 239.875,
"epoch": 2.3648,
"grad_norm": 0.9296643733978271,
"kl": 0.064208984375,
"learning_rate": 7.625e-08,
"loss": 0.0006,
"reward": 3.968054413795471,
"reward_std": 0.0051011774921789765,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9680543541908264,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 740
},
{
"completion_length": 242.71875,
"epoch": 2.368,
"grad_norm": 0.9536841511726379,
"kl": 0.0606689453125,
"learning_rate": 7.5e-08,
"loss": 0.0006,
"reward": 3.9280422925949097,
"reward_std": 0.005676981760188937,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9430340826511383,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988970458507538,
"step": 741
},
{
"completion_length": 239.59375,
"epoch": 2.3712,
"grad_norm": 1.1191787719726562,
"kl": 0.0565185546875,
"learning_rate": 7.375e-08,
"loss": 0.0006,
"reward": 3.9627801179885864,
"reward_std": 0.004723543883301318,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9627801775932312,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 742
},
{
"completion_length": 198.125,
"epoch": 2.3744,
"grad_norm": 19.45572280883789,
"kl": 0.0677490234375,
"learning_rate": 7.25e-08,
"loss": 0.0007,
"reward": 3.8835959434509277,
"reward_std": 0.0259452061727643,
"rewards/answer_entity_reward": 0.9975961446762085,
"rewards/answer_wer_reward": 0.9579322040081024,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9280676245689392,
"step": 743
},
{
"completion_length": 176.875,
"epoch": 2.3776,
"grad_norm": 2.2377281188964844,
"kl": 0.090087890625,
"learning_rate": 7.124999999999999e-08,
"loss": 0.0009,
"reward": 3.9422539472579956,
"reward_std": 0.039653101935982704,
"rewards/answer_entity_reward": 0.9895833134651184,
"rewards/answer_wer_reward": 0.9664814472198486,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.986189216375351,
"step": 744
},
{
"completion_length": 229.1875,
"epoch": 2.3808,
"grad_norm": 1.561314344406128,
"kl": 0.0491943359375,
"learning_rate": 7e-08,
"loss": 0.0005,
"reward": 3.8669506311416626,
"reward_std": 0.19146580225788057,
"rewards/answer_entity_reward": 0.96875,
"rewards/answer_wer_reward": 0.9294506311416626,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 1.0,
"step": 745
},
{
"completion_length": 192.875,
"epoch": 2.384,
"grad_norm": 1.9305033683776855,
"kl": 0.078857421875,
"learning_rate": 6.875e-08,
"loss": 0.0008,
"reward": 3.944983959197998,
"reward_std": 0.012190061155706644,
"rewards/answer_entity_reward": 0.9979166686534882,
"rewards/answer_wer_reward": 0.9473004341125488,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997667968273163,
"step": 746
},
{
"completion_length": 214.75,
"epoch": 2.3872,
"grad_norm": 13.16278076171875,
"kl": 0.0552978515625,
"learning_rate": 6.75e-08,
"loss": 0.0006,
"reward": 3.981534004211426,
"reward_std": 0.016841471777297556,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.989596426486969,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.991937518119812,
"step": 747
},
{
"completion_length": 202.65625,
"epoch": 2.3904,
"grad_norm": 1.269473671913147,
"kl": 0.0595703125,
"learning_rate": 6.625e-08,
"loss": 0.0006,
"reward": 3.9539172649383545,
"reward_std": 0.006352424388751388,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9545792937278748,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993380010128021,
"step": 748
},
{
"completion_length": 241.9375,
"epoch": 2.3936,
"grad_norm": 0.799062192440033,
"kl": 0.08447265625,
"learning_rate": 6.5e-08,
"loss": 0.0008,
"reward": 3.968814492225647,
"reward_std": 0.0058513006661087275,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9696769118309021,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9991375803947449,
"step": 749
},
{
"completion_length": 175.90625,
"epoch": 2.3968,
"grad_norm": 1.7988041639328003,
"kl": 0.06201171875,
"learning_rate": 6.375e-08,
"loss": 0.0006,
"reward": 3.9838857650756836,
"reward_std": 0.0046576057793572545,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9841121137142181,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997735619544983,
"step": 750
},
{
"completion_length": 215.59375,
"epoch": 2.4,
"grad_norm": 2.852858781814575,
"kl": 0.0533447265625,
"learning_rate": 6.25e-08,
"loss": 0.0005,
"reward": 3.943244457244873,
"reward_std": 0.03492546791676432,
"rewards/answer_entity_reward": 0.9943181872367859,
"rewards/answer_wer_reward": 0.9835853576660156,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9653409123420715,
"step": 751
},
{
"completion_length": 238.5,
"epoch": 2.4032,
"grad_norm": 12.164900779724121,
"kl": 0.0615234375,
"learning_rate": 6.125e-08,
"loss": 0.0006,
"reward": 3.9755419492721558,
"reward_std": 0.010625506052747369,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9782145917415619,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9973272979259491,
"step": 752
},
{
"completion_length": 179.15625,
"epoch": 2.4064,
"grad_norm": 0.9550566077232361,
"kl": 0.0693359375,
"learning_rate": 6e-08,
"loss": 0.0007,
"reward": 3.954240560531616,
"reward_std": 0.011055386741645634,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9720976054668427,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9821428656578064,
"step": 753
},
{
"completion_length": 216.0625,
"epoch": 2.4096,
"grad_norm": 1.3647923469543457,
"kl": 0.0582275390625,
"learning_rate": 5.8749999999999993e-08,
"loss": 0.0006,
"reward": 3.962032198905945,
"reward_std": 0.008129856083542109,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9623997509479523,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996323585510254,
"step": 754
},
{
"completion_length": 221.8125,
"epoch": 2.4128,
"grad_norm": 1.9497917890548706,
"kl": 0.0604248046875,
"learning_rate": 5.75e-08,
"loss": 0.0006,
"reward": 3.9653851985931396,
"reward_std": 0.02012356440536678,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9722139835357666,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9994212985038757,
"step": 755
},
{
"completion_length": 198.4375,
"epoch": 2.416,
"grad_norm": 0.6684221029281616,
"kl": 0.07568359375,
"learning_rate": 5.625e-08,
"loss": 0.0008,
"reward": 3.942944049835205,
"reward_std": 0.008921493077650666,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9674927294254303,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9754513502120972,
"step": 756
},
{
"completion_length": 234.875,
"epoch": 2.4192,
"grad_norm": 1.097367525100708,
"kl": 0.1142578125,
"learning_rate": 5.4999999999999996e-08,
"loss": 0.0011,
"reward": 3.9485758543014526,
"reward_std": 0.01669642748311162,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9544399976730347,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9986002445220947,
"step": 757
},
{
"completion_length": 150.46875,
"epoch": 2.4224,
"grad_norm": 0.21660760045051575,
"kl": 0.0321044921875,
"learning_rate": 5.3749999999999995e-08,
"loss": 0.0003,
"reward": 3.978167176246643,
"reward_std": 0.0010678768157958984,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9781671762466431,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 758
},
{
"completion_length": 231.25,
"epoch": 2.4256,
"grad_norm": 3.330300807952881,
"kl": 0.078857421875,
"learning_rate": 5.2499999999999994e-08,
"loss": 0.0008,
"reward": 3.9418994188308716,
"reward_std": 0.007436740444973111,
"rewards/answer_entity_reward": 0.9861111044883728,
"rewards/answer_wer_reward": 0.9557883143424988,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 759
},
{
"completion_length": 212.5,
"epoch": 2.4288,
"grad_norm": 3.427900791168213,
"kl": 0.13525390625,
"learning_rate": 5.124999999999999e-08,
"loss": 0.0014,
"reward": 3.9013478755950928,
"reward_std": 0.030906156171113253,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9625242948532104,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9388234913349152,
"step": 760
},
{
"completion_length": 218.90625,
"epoch": 2.432,
"grad_norm": 1.3307231664657593,
"kl": 0.0567626953125,
"learning_rate": 5e-08,
"loss": 0.0006,
"reward": 3.9774084091186523,
"reward_std": 0.0034683155827224255,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9774083495140076,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 761
},
{
"completion_length": 207.875,
"epoch": 2.4352,
"grad_norm": 0.7475162148475647,
"kl": 0.057373046875,
"learning_rate": 4.8749999999999996e-08,
"loss": 0.0006,
"reward": 3.9419760704040527,
"reward_std": 0.004616708727553487,
"rewards/answer_entity_reward": 0.9788995385169983,
"rewards/answer_wer_reward": 0.9634398818016052,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996366202831268,
"step": 762
},
{
"completion_length": 195.28125,
"epoch": 2.4384,
"grad_norm": 2.0728979110717773,
"kl": 0.0966796875,
"learning_rate": 4.7499999999999995e-08,
"loss": 0.001,
"reward": 3.944322109222412,
"reward_std": 0.017246471252292395,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9587452709674835,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9855769276618958,
"step": 763
},
{
"completion_length": 224.9375,
"epoch": 2.4416,
"grad_norm": 1.2122951745986938,
"kl": 0.1226806640625,
"learning_rate": 4.625e-08,
"loss": 0.0012,
"reward": 3.9620940685272217,
"reward_std": 0.007986569311469793,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9652903079986572,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.996803879737854,
"step": 764
},
{
"completion_length": 249.03125,
"epoch": 2.4448,
"grad_norm": 1.21713125705719,
"kl": 0.065673828125,
"learning_rate": 4.5e-08,
"loss": 0.0007,
"reward": 3.9346585273742676,
"reward_std": 0.006481441203504801,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9346585869789124,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 765
},
{
"completion_length": 239.875,
"epoch": 2.448,
"grad_norm": 5.105895519256592,
"kl": 0.0665283203125,
"learning_rate": 4.375e-08,
"loss": 0.0007,
"reward": 3.916127324104309,
"reward_std": 0.02047336893156171,
"rewards/answer_entity_reward": 0.9910714626312256,
"rewards/answer_wer_reward": 0.9319192171096802,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9931367635726929,
"step": 766
},
{
"completion_length": 216.84375,
"epoch": 2.4512,
"grad_norm": 3.230001449584961,
"kl": 0.0445556640625,
"learning_rate": 4.2500000000000003e-08,
"loss": 0.0004,
"reward": 3.9800050258636475,
"reward_std": 0.004955247277393937,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9800049960613251,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 767
},
{
"completion_length": 229.8125,
"epoch": 2.4544,
"grad_norm": 1.2354313135147095,
"kl": 0.0478515625,
"learning_rate": 4.125e-08,
"loss": 0.0005,
"reward": 3.9553003311157227,
"reward_std": 0.013880819431506097,
"rewards/answer_entity_reward": 0.9826388955116272,
"rewards/answer_wer_reward": 0.9739912152290344,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.998670220375061,
"step": 768
},
{
"completion_length": 248.34375,
"epoch": 2.4576000000000002,
"grad_norm": 0.8089145421981812,
"kl": 0.06005859375,
"learning_rate": 4e-08,
"loss": 0.0006,
"reward": 3.9643748998641968,
"reward_std": 0.007618119474500418,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9654783606529236,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9988966286182404,
"step": 769
},
{
"completion_length": 233.53125,
"epoch": 2.4608,
"grad_norm": 1.2253531217575073,
"kl": 0.0540771484375,
"learning_rate": 3.875e-08,
"loss": 0.0005,
"reward": 3.955801010131836,
"reward_std": 0.007193901808932424,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9558009505271912,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 770
},
{
"completion_length": 246.25,
"epoch": 2.464,
"grad_norm": 0.8907082080841064,
"kl": 0.0740966796875,
"learning_rate": 3.75e-08,
"loss": 0.0007,
"reward": 3.9567649364471436,
"reward_std": 0.007558103417977691,
"rewards/answer_entity_reward": 0.9926470518112183,
"rewards/answer_wer_reward": 0.9644212424755096,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996966123580933,
"step": 771
},
{
"completion_length": 157.84375,
"epoch": 2.4672,
"grad_norm": 0.6787045001983643,
"kl": 0.080078125,
"learning_rate": 3.625e-08,
"loss": 0.0008,
"reward": 3.989119529724121,
"reward_std": 0.0026377947069704533,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9893985092639923,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997209906578064,
"step": 772
},
{
"completion_length": 228.75,
"epoch": 2.4704,
"grad_norm": 0.6448482275009155,
"kl": 0.0562744140625,
"learning_rate": 3.5e-08,
"loss": 0.0006,
"reward": 3.960241913795471,
"reward_std": 0.005235916236415505,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.960241824388504,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 773
},
{
"completion_length": 226.5,
"epoch": 2.4736000000000002,
"grad_norm": 0.9646191596984863,
"kl": 0.05224609375,
"learning_rate": 3.375e-08,
"loss": 0.0005,
"reward": 3.9351943731307983,
"reward_std": 0.015792422462254763,
"rewards/answer_entity_reward": 0.9866071343421936,
"rewards/answer_wer_reward": 0.9691915214061737,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9793955981731415,
"step": 774
},
{
"completion_length": 249.53125,
"epoch": 2.4768,
"grad_norm": 2.9048826694488525,
"kl": 0.0540771484375,
"learning_rate": 3.25e-08,
"loss": 0.0005,
"reward": 3.952099561691284,
"reward_std": 0.00629690324421972,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9520994424819946,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 775
},
{
"completion_length": 222.21875,
"epoch": 2.48,
"grad_norm": 1.1555320024490356,
"kl": 0.0548095703125,
"learning_rate": 3.125e-08,
"loss": 0.0005,
"reward": 3.9609912633895874,
"reward_std": 0.017560790292918682,
"rewards/answer_entity_reward": 0.9927884340286255,
"rewards/answer_wer_reward": 0.9682029485702515,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 776
},
{
"completion_length": 245.46875,
"epoch": 2.4832,
"grad_norm": 2.5107345581054688,
"kl": 0.098388671875,
"learning_rate": 3e-08,
"loss": 0.001,
"reward": 3.9294867515563965,
"reward_std": 0.009384696371853352,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.932422935962677,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9970638751983643,
"step": 777
},
{
"completion_length": 175.1875,
"epoch": 2.4864,
"grad_norm": 3.319678783416748,
"kl": 0.06640625,
"learning_rate": 2.875e-08,
"loss": 0.0007,
"reward": 3.9766006469726562,
"reward_std": 0.005283091915771365,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9770888686180115,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.99951171875,
"step": 778
},
{
"completion_length": 215.8125,
"epoch": 2.4896,
"grad_norm": 1.7188315391540527,
"kl": 0.058837890625,
"learning_rate": 2.7499999999999998e-08,
"loss": 0.0006,
"reward": 3.945501208305359,
"reward_std": 0.006351021584123373,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9457343518733978,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9997667968273163,
"step": 779
},
{
"completion_length": 204.28125,
"epoch": 2.4928,
"grad_norm": 1.284071683883667,
"kl": 0.0640869140625,
"learning_rate": 2.6249999999999997e-08,
"loss": 0.0006,
"reward": 3.9768584966659546,
"reward_std": 0.003316762624308467,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9779550433158875,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989035129547119,
"step": 780
},
{
"completion_length": 216.71875,
"epoch": 2.496,
"grad_norm": 1.442418098449707,
"kl": 0.067138671875,
"learning_rate": 2.5e-08,
"loss": 0.0007,
"reward": 3.945361614227295,
"reward_std": 0.03020885493606329,
"rewards/answer_entity_reward": 0.9867424070835114,
"rewards/answer_wer_reward": 0.9586191177368164,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 781
},
{
"completion_length": 199.28125,
"epoch": 2.4992,
"grad_norm": 2.220127582550049,
"kl": 0.071533203125,
"learning_rate": 2.3749999999999998e-08,
"loss": 0.0007,
"reward": 3.945390462875366,
"reward_std": 0.012143698055297136,
"rewards/answer_entity_reward": 0.9965277910232544,
"rewards/answer_wer_reward": 0.9488627314567566,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 782
},
{
"completion_length": 220.6875,
"epoch": 2.5023999999999997,
"grad_norm": 2.2362775802612305,
"kl": 0.0634765625,
"learning_rate": 2.25e-08,
"loss": 0.0006,
"reward": 3.960192322731018,
"reward_std": 0.006831311853602529,
"rewards/answer_entity_reward": 0.9916666746139526,
"rewards/answer_wer_reward": 0.9691977500915527,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9993279576301575,
"step": 783
},
{
"completion_length": 235.5,
"epoch": 2.5056000000000003,
"grad_norm": 0.9817630052566528,
"kl": 0.05224609375,
"learning_rate": 2.1250000000000002e-08,
"loss": 0.0005,
"reward": 3.9702308177948,
"reward_std": 0.006825624033808708,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9809376895427704,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9892931282520294,
"step": 784
},
{
"completion_length": 203.78125,
"epoch": 2.5088,
"grad_norm": 2.859792947769165,
"kl": 0.053955078125,
"learning_rate": 2e-08,
"loss": 0.0005,
"reward": 3.9142426252365112,
"reward_std": 0.01792304962873459,
"rewards/answer_entity_reward": 0.9944852888584137,
"rewards/answer_wer_reward": 0.9792338609695435,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9405233263969421,
"step": 785
},
{
"completion_length": 224.28125,
"epoch": 2.512,
"grad_norm": 3.7338051795959473,
"kl": 0.060791015625,
"learning_rate": 1.875e-08,
"loss": 0.0006,
"reward": 3.948864221572876,
"reward_std": 0.01559874601662159,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9604960083961487,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9883682429790497,
"step": 786
},
{
"completion_length": 175.1875,
"epoch": 2.5152,
"grad_norm": 4.41845703125,
"kl": 0.083740234375,
"learning_rate": 1.75e-08,
"loss": 0.0008,
"reward": 3.949966311454773,
"reward_std": 0.01157908933237195,
"rewards/answer_entity_reward": 0.9914772808551788,
"rewards/answer_wer_reward": 0.9717868566513062,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9867021441459656,
"step": 787
},
{
"completion_length": 259.25,
"epoch": 2.5183999999999997,
"grad_norm": 0.9571487903594971,
"kl": 0.0584716796875,
"learning_rate": 1.625e-08,
"loss": 0.0006,
"reward": 3.853899836540222,
"reward_std": 0.1917457883246243,
"rewards/answer_entity_reward": 0.9654605388641357,
"rewards/answer_wer_reward": 0.9225141406059265,
"rewards/format_reward": 0.96875,
"rewards/think_ocr_reward": 0.9971751868724823,
"step": 788
},
{
"completion_length": 249.375,
"epoch": 2.5216,
"grad_norm": 2.86120867729187,
"kl": 0.1368408203125,
"learning_rate": 1.5e-08,
"loss": 0.0014,
"reward": 3.9423060417175293,
"reward_std": 0.01874951831996441,
"rewards/answer_entity_reward": 0.9955357313156128,
"rewards/answer_wer_reward": 0.9478386044502258,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9989316165447235,
"step": 789
},
{
"completion_length": 198.78125,
"epoch": 2.5248,
"grad_norm": 4.95521879196167,
"kl": 0.0611572265625,
"learning_rate": 1.3749999999999999e-08,
"loss": 0.0006,
"reward": 3.915849447250366,
"reward_std": 0.016107629984617233,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9824348092079163,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9334145784378052,
"step": 790
},
{
"completion_length": 184.1875,
"epoch": 2.528,
"grad_norm": 0.8447386622428894,
"kl": 0.0634765625,
"learning_rate": 1.25e-08,
"loss": 0.0006,
"reward": 3.929018259048462,
"reward_std": 0.009709671430755407,
"rewards/answer_entity_reward": 0.9886363744735718,
"rewards/answer_wer_reward": 0.940733015537262,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9996488690376282,
"step": 791
},
{
"completion_length": 185.59375,
"epoch": 2.5312,
"grad_norm": 2.6198718547821045,
"kl": 0.0439453125,
"learning_rate": 1.125e-08,
"loss": 0.0004,
"reward": 3.9582111835479736,
"reward_std": 0.007002702914178371,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9582110941410065,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 792
},
{
"completion_length": 197.8125,
"epoch": 2.5343999999999998,
"grad_norm": 1.3550831079483032,
"kl": 0.065185546875,
"learning_rate": 1e-08,
"loss": 0.0007,
"reward": 3.8907723426818848,
"reward_std": 0.005525397136807442,
"rewards/answer_entity_reward": 0.9903846085071564,
"rewards/answer_wer_reward": 0.9687470197677612,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.931640625,
"step": 793
},
{
"completion_length": 187.375,
"epoch": 2.5376,
"grad_norm": 1.0252914428710938,
"kl": 0.086181640625,
"learning_rate": 8.75e-09,
"loss": 0.0009,
"reward": 3.86617374420166,
"reward_std": 0.011230799835175276,
"rewards/answer_entity_reward": 0.9981617629528046,
"rewards/answer_wer_reward": 0.9432033002376556,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9248086512088776,
"step": 794
},
{
"completion_length": 220.84375,
"epoch": 2.5408,
"grad_norm": 3.189028739929199,
"kl": 0.05078125,
"learning_rate": 7.5e-09,
"loss": 0.0005,
"reward": 3.9672648906707764,
"reward_std": 0.006707400782033801,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.968046098947525,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.999218761920929,
"step": 795
},
{
"completion_length": 148.875,
"epoch": 2.544,
"grad_norm": 0.518578052520752,
"kl": 0.085693359375,
"learning_rate": 6.25e-09,
"loss": 0.0009,
"reward": 3.8482353687286377,
"reward_std": 0.0038536423817276955,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.8497678339481354,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9984675645828247,
"step": 796
},
{
"completion_length": 193.40625,
"epoch": 2.5472,
"grad_norm": 0.928065299987793,
"kl": 0.081298828125,
"learning_rate": 5e-09,
"loss": 0.0008,
"reward": 3.9669394493103027,
"reward_std": 0.013519858941435814,
"rewards/answer_entity_reward": 0.9937500059604645,
"rewards/answer_wer_reward": 0.9760889112949371,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9971005320549011,
"step": 797
},
{
"completion_length": 220.0625,
"epoch": 2.5504,
"grad_norm": 2.7394306659698486,
"kl": 0.050537109375,
"learning_rate": 3.75e-09,
"loss": 0.0005,
"reward": 3.972287654876709,
"reward_std": 0.0053059973288327456,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9722877740859985,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 798
},
{
"completion_length": 221.4375,
"epoch": 2.5536,
"grad_norm": 3.9942383766174316,
"kl": 0.0673828125,
"learning_rate": 2.5e-09,
"loss": 0.0007,
"reward": 3.9344537258148193,
"reward_std": 0.01906409254297614,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9436750113964081,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 0.9907786846160889,
"step": 799
},
{
"completion_length": 231.1875,
"epoch": 2.5568,
"grad_norm": 2.3216702938079834,
"kl": 0.0462646484375,
"learning_rate": 1.25e-09,
"loss": 0.0005,
"reward": 3.959131956100464,
"reward_std": 0.005453485995531082,
"rewards/answer_entity_reward": 1.0,
"rewards/answer_wer_reward": 0.9591320157051086,
"rewards/format_reward": 1.0,
"rewards/think_ocr_reward": 1.0,
"step": 800
}
],
"logging_steps": 1,
"max_steps": 800,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}