output_0216_2e6_all / checkpoint-2000 /trainer_state.json
Prummn's picture
Add files using upload-large-folder tool
c9c6229 verified
{
"best_metric": 0.43360278,
"best_model_checkpoint": "/data/liuzihang/haobin/pangkaiyu/output/output_step_audio2_mini-encoder+align+llm-whole0130_signal_new1_dpdc-lora-1gpu-bs16_4_gckF_2e6_all/v2-20260215-150801/checkpoint-1200",
"epoch": 0.4219409282700422,
"eval_steps": 200,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002109704641350211,
"grad_norm": 0.76171875,
"learning_rate": 5.263157894736842e-09,
"loss": 0.2671268880367279,
"step": 1,
"token_acc": 0.9268792044457443
},
{
"epoch": 0.0004219409282700422,
"grad_norm": 0.6640625,
"learning_rate": 1.0526315789473684e-08,
"loss": 0.25691983103752136,
"step": 2,
"token_acc": 0.9315326633165829
},
{
"epoch": 0.0006329113924050633,
"grad_norm": 0.58984375,
"learning_rate": 1.5789473684210525e-08,
"loss": 0.20553478598594666,
"step": 3,
"token_acc": 0.9399263247378861
},
{
"epoch": 0.0008438818565400844,
"grad_norm": 0.7578125,
"learning_rate": 2.1052631578947368e-08,
"loss": 0.2874465584754944,
"step": 4,
"token_acc": 0.9190901238122661
},
{
"epoch": 0.0010548523206751054,
"grad_norm": 0.99609375,
"learning_rate": 2.6315789473684208e-08,
"loss": 0.23955097794532776,
"step": 5,
"token_acc": 0.9297171186934966
},
{
"epoch": 0.0012658227848101266,
"grad_norm": 0.6484375,
"learning_rate": 3.157894736842105e-08,
"loss": 0.22314852476119995,
"step": 6,
"token_acc": 0.9365693865396069
},
{
"epoch": 0.0014767932489451476,
"grad_norm": 0.74609375,
"learning_rate": 3.684210526315789e-08,
"loss": 0.23751771450042725,
"step": 7,
"token_acc": 0.9336440910337264
},
{
"epoch": 0.0016877637130801688,
"grad_norm": 0.86328125,
"learning_rate": 4.2105263157894737e-08,
"loss": 0.22823776304721832,
"step": 8,
"token_acc": 0.9342301943198804
},
{
"epoch": 0.0018987341772151898,
"grad_norm": 0.6015625,
"learning_rate": 4.736842105263158e-08,
"loss": 0.22278322279453278,
"step": 9,
"token_acc": 0.9367531331973186
},
{
"epoch": 0.002109704641350211,
"grad_norm": 0.72265625,
"learning_rate": 5.2631578947368416e-08,
"loss": 0.25690096616744995,
"step": 10,
"token_acc": 0.9291217257318952
},
{
"epoch": 0.002320675105485232,
"grad_norm": 1.046875,
"learning_rate": 5.789473684210526e-08,
"loss": 0.3066456913948059,
"step": 11,
"token_acc": 0.9201006605850897
},
{
"epoch": 0.002531645569620253,
"grad_norm": 0.7890625,
"learning_rate": 6.31578947368421e-08,
"loss": 0.2525354027748108,
"step": 12,
"token_acc": 0.9276785714285715
},
{
"epoch": 0.0027426160337552744,
"grad_norm": 0.76953125,
"learning_rate": 6.842105263157895e-08,
"loss": 0.30059731006622314,
"step": 13,
"token_acc": 0.9238062986793092
},
{
"epoch": 0.002953586497890295,
"grad_norm": 0.9921875,
"learning_rate": 7.368421052631577e-08,
"loss": 0.250629723072052,
"step": 14,
"token_acc": 0.9291187739463601
},
{
"epoch": 0.0031645569620253164,
"grad_norm": 0.6796875,
"learning_rate": 7.894736842105262e-08,
"loss": 0.25745123624801636,
"step": 15,
"token_acc": 0.9317230273752013
},
{
"epoch": 0.0033755274261603376,
"grad_norm": 0.85546875,
"learning_rate": 8.421052631578947e-08,
"loss": 0.3376100957393646,
"step": 16,
"token_acc": 0.9077240566037735
},
{
"epoch": 0.003586497890295359,
"grad_norm": 0.703125,
"learning_rate": 8.947368421052631e-08,
"loss": 0.24760955572128296,
"step": 17,
"token_acc": 0.9222253760999148
},
{
"epoch": 0.0037974683544303796,
"grad_norm": 0.703125,
"learning_rate": 9.473684210526316e-08,
"loss": 0.2046602964401245,
"step": 18,
"token_acc": 0.9403993855606759
},
{
"epoch": 0.004008438818565401,
"grad_norm": 0.83203125,
"learning_rate": 1e-07,
"loss": 0.26379868388175964,
"step": 19,
"token_acc": 0.9294478527607362
},
{
"epoch": 0.004219409282700422,
"grad_norm": 0.72265625,
"learning_rate": 1.0526315789473683e-07,
"loss": 0.2657994031906128,
"step": 20,
"token_acc": 0.9254714157437893
},
{
"epoch": 0.004430379746835443,
"grad_norm": 0.6640625,
"learning_rate": 1.1052631578947368e-07,
"loss": 0.28728997707366943,
"step": 21,
"token_acc": 0.9262518968133535
},
{
"epoch": 0.004641350210970464,
"grad_norm": 0.77734375,
"learning_rate": 1.1578947368421052e-07,
"loss": 0.27618926763534546,
"step": 22,
"token_acc": 0.9308072487644151
},
{
"epoch": 0.004852320675105486,
"grad_norm": 0.7109375,
"learning_rate": 1.2105263157894737e-07,
"loss": 0.2314767688512802,
"step": 23,
"token_acc": 0.936124911284599
},
{
"epoch": 0.005063291139240506,
"grad_norm": 0.73828125,
"learning_rate": 1.263157894736842e-07,
"loss": 0.24274200201034546,
"step": 24,
"token_acc": 0.9345039018952063
},
{
"epoch": 0.005274261603375527,
"grad_norm": 0.64453125,
"learning_rate": 1.3157894736842104e-07,
"loss": 0.2632070481777191,
"step": 25,
"token_acc": 0.9223796033994335
},
{
"epoch": 0.005485232067510549,
"grad_norm": 0.7421875,
"learning_rate": 1.368421052631579e-07,
"loss": 0.2736364006996155,
"step": 26,
"token_acc": 0.9269776876267748
},
{
"epoch": 0.00569620253164557,
"grad_norm": 0.96875,
"learning_rate": 1.4210526315789474e-07,
"loss": 0.29377132654190063,
"step": 27,
"token_acc": 0.9181763285024155
},
{
"epoch": 0.00590717299578059,
"grad_norm": 0.7734375,
"learning_rate": 1.4736842105263155e-07,
"loss": 0.25689125061035156,
"step": 28,
"token_acc": 0.9299856527977044
},
{
"epoch": 0.006118143459915612,
"grad_norm": 0.9609375,
"learning_rate": 1.526315789473684e-07,
"loss": 0.24775874614715576,
"step": 29,
"token_acc": 0.9330016583747927
},
{
"epoch": 0.006329113924050633,
"grad_norm": 0.703125,
"learning_rate": 1.5789473684210525e-07,
"loss": 0.25338542461395264,
"step": 30,
"token_acc": 0.925512104283054
},
{
"epoch": 0.006540084388185654,
"grad_norm": 0.6328125,
"learning_rate": 1.631578947368421e-07,
"loss": 0.25087809562683105,
"step": 31,
"token_acc": 0.9313120472229676
},
{
"epoch": 0.006751054852320675,
"grad_norm": 0.67578125,
"learning_rate": 1.6842105263157895e-07,
"loss": 0.2502059042453766,
"step": 32,
"token_acc": 0.9279077218840115
},
{
"epoch": 0.006962025316455696,
"grad_norm": 0.578125,
"learning_rate": 1.7368421052631578e-07,
"loss": 0.18295930325984955,
"step": 33,
"token_acc": 0.9424480628860191
},
{
"epoch": 0.007172995780590718,
"grad_norm": 0.7265625,
"learning_rate": 1.7894736842105262e-07,
"loss": 0.2690507471561432,
"step": 34,
"token_acc": 0.9241155819605725
},
{
"epoch": 0.007383966244725738,
"grad_norm": 0.90234375,
"learning_rate": 1.8421052631578946e-07,
"loss": 0.2535433769226074,
"step": 35,
"token_acc": 0.9346446700507615
},
{
"epoch": 0.007594936708860759,
"grad_norm": 0.73046875,
"learning_rate": 1.8947368421052632e-07,
"loss": 0.26006314158439636,
"step": 36,
"token_acc": 0.9291425420457678
},
{
"epoch": 0.007805907172995781,
"grad_norm": 0.86328125,
"learning_rate": 1.9473684210526315e-07,
"loss": 0.2664929926395416,
"step": 37,
"token_acc": 0.9286128845037724
},
{
"epoch": 0.008016877637130802,
"grad_norm": 0.65234375,
"learning_rate": 2e-07,
"loss": 0.2170935869216919,
"step": 38,
"token_acc": 0.9359218028780885
},
{
"epoch": 0.008227848101265823,
"grad_norm": 0.75390625,
"learning_rate": 2.0526315789473683e-07,
"loss": 0.31706634163856506,
"step": 39,
"token_acc": 0.9133278822567457
},
{
"epoch": 0.008438818565400843,
"grad_norm": 0.6953125,
"learning_rate": 2.1052631578947366e-07,
"loss": 0.23433184623718262,
"step": 40,
"token_acc": 0.9313227829202747
},
{
"epoch": 0.008649789029535865,
"grad_norm": 0.65234375,
"learning_rate": 2.1578947368421053e-07,
"loss": 0.19157642126083374,
"step": 41,
"token_acc": 0.9416859122401847
},
{
"epoch": 0.008860759493670886,
"grad_norm": 0.75390625,
"learning_rate": 2.2105263157894736e-07,
"loss": 0.26239246129989624,
"step": 42,
"token_acc": 0.9240352476450927
},
{
"epoch": 0.009071729957805906,
"grad_norm": 0.97265625,
"learning_rate": 2.263157894736842e-07,
"loss": 0.27333155274391174,
"step": 43,
"token_acc": 0.9244935543278084
},
{
"epoch": 0.009282700421940928,
"grad_norm": 0.671875,
"learning_rate": 2.3157894736842104e-07,
"loss": 0.22739389538764954,
"step": 44,
"token_acc": 0.9345622119815669
},
{
"epoch": 0.00949367088607595,
"grad_norm": 1.359375,
"learning_rate": 2.3684210526315787e-07,
"loss": 0.2970912754535675,
"step": 45,
"token_acc": 0.918966119455117
},
{
"epoch": 0.009704641350210971,
"grad_norm": 0.98046875,
"learning_rate": 2.4210526315789473e-07,
"loss": 0.24367359280586243,
"step": 46,
"token_acc": 0.9307627357162961
},
{
"epoch": 0.009915611814345991,
"grad_norm": 0.69140625,
"learning_rate": 2.4736842105263157e-07,
"loss": 0.24166589975357056,
"step": 47,
"token_acc": 0.925096985974336
},
{
"epoch": 0.010126582278481013,
"grad_norm": 0.8203125,
"learning_rate": 2.526315789473684e-07,
"loss": 0.2498053014278412,
"step": 48,
"token_acc": 0.9309855154785572
},
{
"epoch": 0.010337552742616034,
"grad_norm": 0.81640625,
"learning_rate": 2.578947368421053e-07,
"loss": 0.27882808446884155,
"step": 49,
"token_acc": 0.9232902033271719
},
{
"epoch": 0.010548523206751054,
"grad_norm": 0.7890625,
"learning_rate": 2.631578947368421e-07,
"loss": 0.2516263723373413,
"step": 50,
"token_acc": 0.9283572142619126
},
{
"epoch": 0.010759493670886076,
"grad_norm": 0.65625,
"learning_rate": 2.684210526315789e-07,
"loss": 0.22138270735740662,
"step": 51,
"token_acc": 0.9392942583732058
},
{
"epoch": 0.010970464135021098,
"grad_norm": 1.015625,
"learning_rate": 2.736842105263158e-07,
"loss": 0.25101763010025024,
"step": 52,
"token_acc": 0.9323397913561848
},
{
"epoch": 0.011181434599156118,
"grad_norm": 0.77734375,
"learning_rate": 2.789473684210526e-07,
"loss": 0.2590043842792511,
"step": 53,
"token_acc": 0.9248989023685731
},
{
"epoch": 0.01139240506329114,
"grad_norm": 1.03125,
"learning_rate": 2.842105263157895e-07,
"loss": 0.22629833221435547,
"step": 54,
"token_acc": 0.9388133498145859
},
{
"epoch": 0.011603375527426161,
"grad_norm": 1.2421875,
"learning_rate": 2.894736842105263e-07,
"loss": 0.26315873861312866,
"step": 55,
"token_acc": 0.9215632686526374
},
{
"epoch": 0.01181434599156118,
"grad_norm": 1.390625,
"learning_rate": 2.947368421052631e-07,
"loss": 0.3269142806529999,
"step": 56,
"token_acc": 0.9163541967118546
},
{
"epoch": 0.012025316455696202,
"grad_norm": 0.8125,
"learning_rate": 3e-07,
"loss": 0.2740277945995331,
"step": 57,
"token_acc": 0.9223241590214067
},
{
"epoch": 0.012236286919831224,
"grad_norm": 0.7109375,
"learning_rate": 3.052631578947368e-07,
"loss": 0.2807028889656067,
"step": 58,
"token_acc": 0.9231628946633138
},
{
"epoch": 0.012447257383966244,
"grad_norm": 0.83984375,
"learning_rate": 3.105263157894737e-07,
"loss": 0.3021116256713867,
"step": 59,
"token_acc": 0.9209953343701399
},
{
"epoch": 0.012658227848101266,
"grad_norm": 0.73828125,
"learning_rate": 3.157894736842105e-07,
"loss": 0.2364785373210907,
"step": 60,
"token_acc": 0.9329750237116662
},
{
"epoch": 0.012869198312236287,
"grad_norm": 0.7109375,
"learning_rate": 3.2105263157894733e-07,
"loss": 0.2884541153907776,
"step": 61,
"token_acc": 0.9274515831540117
},
{
"epoch": 0.013080168776371307,
"grad_norm": 0.76953125,
"learning_rate": 3.263157894736842e-07,
"loss": 0.25490057468414307,
"step": 62,
"token_acc": 0.9336579427875837
},
{
"epoch": 0.013291139240506329,
"grad_norm": 0.859375,
"learning_rate": 3.31578947368421e-07,
"loss": 0.27591922879219055,
"step": 63,
"token_acc": 0.9201367308887508
},
{
"epoch": 0.01350210970464135,
"grad_norm": 0.83203125,
"learning_rate": 3.368421052631579e-07,
"loss": 0.2646903693675995,
"step": 64,
"token_acc": 0.9261189454322502
},
{
"epoch": 0.013713080168776372,
"grad_norm": 0.7265625,
"learning_rate": 3.4210526315789473e-07,
"loss": 0.2481774091720581,
"step": 65,
"token_acc": 0.9300921512551636
},
{
"epoch": 0.013924050632911392,
"grad_norm": 0.671875,
"learning_rate": 3.4736842105263157e-07,
"loss": 0.2667776644229889,
"step": 66,
"token_acc": 0.9202678027997565
},
{
"epoch": 0.014135021097046414,
"grad_norm": 0.77734375,
"learning_rate": 3.526315789473684e-07,
"loss": 0.2720962464809418,
"step": 67,
"token_acc": 0.9237356168049238
},
{
"epoch": 0.014345991561181435,
"grad_norm": 0.6171875,
"learning_rate": 3.5789473684210524e-07,
"loss": 0.25555452704429626,
"step": 68,
"token_acc": 0.9306469298245614
},
{
"epoch": 0.014556962025316455,
"grad_norm": 0.640625,
"learning_rate": 3.6315789473684213e-07,
"loss": 0.22453869879245758,
"step": 69,
"token_acc": 0.9388444990780578
},
{
"epoch": 0.014767932489451477,
"grad_norm": 0.75,
"learning_rate": 3.684210526315789e-07,
"loss": 0.28728553652763367,
"step": 70,
"token_acc": 0.9182989690721649
},
{
"epoch": 0.014978902953586498,
"grad_norm": 0.95703125,
"learning_rate": 3.7368421052631575e-07,
"loss": 0.2622889578342438,
"step": 71,
"token_acc": 0.9239098624524437
},
{
"epoch": 0.015189873417721518,
"grad_norm": 0.78125,
"learning_rate": 3.7894736842105264e-07,
"loss": 0.2780531346797943,
"step": 72,
"token_acc": 0.9266362252663622
},
{
"epoch": 0.01540084388185654,
"grad_norm": 0.703125,
"learning_rate": 3.842105263157894e-07,
"loss": 0.2625043988227844,
"step": 73,
"token_acc": 0.9331468531468532
},
{
"epoch": 0.015611814345991562,
"grad_norm": 0.703125,
"learning_rate": 3.894736842105263e-07,
"loss": 0.25795984268188477,
"step": 74,
"token_acc": 0.9242160278745645
},
{
"epoch": 0.015822784810126583,
"grad_norm": 0.71484375,
"learning_rate": 3.9473684210526315e-07,
"loss": 0.2481173276901245,
"step": 75,
"token_acc": 0.9310846176214016
},
{
"epoch": 0.016033755274261603,
"grad_norm": 0.640625,
"learning_rate": 4e-07,
"loss": 0.23631326854228973,
"step": 76,
"token_acc": 0.9369342184671092
},
{
"epoch": 0.016244725738396623,
"grad_norm": 0.69140625,
"learning_rate": 4.052631578947368e-07,
"loss": 0.24659401178359985,
"step": 77,
"token_acc": 0.930279458369346
},
{
"epoch": 0.016455696202531647,
"grad_norm": 0.77734375,
"learning_rate": 4.1052631578947365e-07,
"loss": 0.28330034017562866,
"step": 78,
"token_acc": 0.9203966005665722
},
{
"epoch": 0.016666666666666666,
"grad_norm": 0.96484375,
"learning_rate": 4.1578947368421054e-07,
"loss": 0.2582593560218811,
"step": 79,
"token_acc": 0.9296250768285187
},
{
"epoch": 0.016877637130801686,
"grad_norm": 0.734375,
"learning_rate": 4.2105263157894733e-07,
"loss": 0.2518593370914459,
"step": 80,
"token_acc": 0.9288208434058555
},
{
"epoch": 0.01708860759493671,
"grad_norm": 0.859375,
"learning_rate": 4.2631578947368416e-07,
"loss": 0.30441492795944214,
"step": 81,
"token_acc": 0.9242614707730987
},
{
"epoch": 0.01729957805907173,
"grad_norm": 0.7578125,
"learning_rate": 4.3157894736842105e-07,
"loss": 0.30916911363601685,
"step": 82,
"token_acc": 0.9124253625248792
},
{
"epoch": 0.01751054852320675,
"grad_norm": 0.609375,
"learning_rate": 4.368421052631579e-07,
"loss": 0.28638702630996704,
"step": 83,
"token_acc": 0.9245689655172413
},
{
"epoch": 0.017721518987341773,
"grad_norm": 0.80859375,
"learning_rate": 4.421052631578947e-07,
"loss": 0.2646373510360718,
"step": 84,
"token_acc": 0.9279176201372997
},
{
"epoch": 0.017932489451476793,
"grad_norm": 0.97265625,
"learning_rate": 4.4736842105263156e-07,
"loss": 0.27530571818351746,
"step": 85,
"token_acc": 0.9217687074829932
},
{
"epoch": 0.018143459915611813,
"grad_norm": 0.73828125,
"learning_rate": 4.526315789473684e-07,
"loss": 0.30989915132522583,
"step": 86,
"token_acc": 0.9133137062479555
},
{
"epoch": 0.018354430379746836,
"grad_norm": 0.91015625,
"learning_rate": 4.5789473684210523e-07,
"loss": 0.2850973308086395,
"step": 87,
"token_acc": 0.9211218229623137
},
{
"epoch": 0.018565400843881856,
"grad_norm": 0.72265625,
"learning_rate": 4.6315789473684207e-07,
"loss": 0.2523067593574524,
"step": 88,
"token_acc": 0.9337899543378996
},
{
"epoch": 0.018776371308016876,
"grad_norm": 0.734375,
"learning_rate": 4.6842105263157896e-07,
"loss": 0.2510542869567871,
"step": 89,
"token_acc": 0.9345845983991168
},
{
"epoch": 0.0189873417721519,
"grad_norm": 0.65625,
"learning_rate": 4.7368421052631574e-07,
"loss": 0.27831587195396423,
"step": 90,
"token_acc": 0.9237072619384007
},
{
"epoch": 0.01919831223628692,
"grad_norm": 0.67578125,
"learning_rate": 4.789473684210526e-07,
"loss": 0.2806670665740967,
"step": 91,
"token_acc": 0.9245553643144004
},
{
"epoch": 0.019409282700421943,
"grad_norm": 0.8046875,
"learning_rate": 4.842105263157895e-07,
"loss": 0.2630135416984558,
"step": 92,
"token_acc": 0.9274447949526814
},
{
"epoch": 0.019620253164556962,
"grad_norm": 0.71875,
"learning_rate": 4.894736842105263e-07,
"loss": 0.2662945091724396,
"step": 93,
"token_acc": 0.9303838646714379
},
{
"epoch": 0.019831223628691982,
"grad_norm": 0.703125,
"learning_rate": 4.947368421052631e-07,
"loss": 0.2430751472711563,
"step": 94,
"token_acc": 0.9333737129012719
},
{
"epoch": 0.020042194092827006,
"grad_norm": 0.80078125,
"learning_rate": 5e-07,
"loss": 0.2676389515399933,
"step": 95,
"token_acc": 0.9231901118304885
},
{
"epoch": 0.020253164556962026,
"grad_norm": 0.765625,
"learning_rate": 5.052631578947368e-07,
"loss": 0.24123789370059967,
"step": 96,
"token_acc": 0.9292089873807325
},
{
"epoch": 0.020464135021097046,
"grad_norm": 0.80078125,
"learning_rate": 5.105263157894736e-07,
"loss": 0.24845993518829346,
"step": 97,
"token_acc": 0.9311657879320445
},
{
"epoch": 0.02067510548523207,
"grad_norm": 0.6953125,
"learning_rate": 5.157894736842106e-07,
"loss": 0.2502240836620331,
"step": 98,
"token_acc": 0.9316136772645471
},
{
"epoch": 0.02088607594936709,
"grad_norm": 0.64453125,
"learning_rate": 5.210526315789473e-07,
"loss": 0.2715577483177185,
"step": 99,
"token_acc": 0.9284542172628816
},
{
"epoch": 0.02109704641350211,
"grad_norm": 0.640625,
"learning_rate": 5.263157894736842e-07,
"loss": 0.25573915243148804,
"step": 100,
"token_acc": 0.9303937007874016
},
{
"epoch": 0.021308016877637132,
"grad_norm": 0.5859375,
"learning_rate": 5.31578947368421e-07,
"loss": 0.23634707927703857,
"step": 101,
"token_acc": 0.9278404618210443
},
{
"epoch": 0.021518987341772152,
"grad_norm": 0.79296875,
"learning_rate": 5.368421052631578e-07,
"loss": 0.24541448056697845,
"step": 102,
"token_acc": 0.9269794721407625
},
{
"epoch": 0.021729957805907172,
"grad_norm": 0.8046875,
"learning_rate": 5.421052631578948e-07,
"loss": 0.29984721541404724,
"step": 103,
"token_acc": 0.9175257731958762
},
{
"epoch": 0.021940928270042195,
"grad_norm": 0.703125,
"learning_rate": 5.473684210526316e-07,
"loss": 0.23752300441265106,
"step": 104,
"token_acc": 0.9307146753955264
},
{
"epoch": 0.022151898734177215,
"grad_norm": 0.7578125,
"learning_rate": 5.526315789473684e-07,
"loss": 0.26209786534309387,
"step": 105,
"token_acc": 0.9331191588785047
},
{
"epoch": 0.022362869198312235,
"grad_norm": 0.6015625,
"learning_rate": 5.578947368421052e-07,
"loss": 0.26135504245758057,
"step": 106,
"token_acc": 0.9239098624524437
},
{
"epoch": 0.02257383966244726,
"grad_norm": 0.76171875,
"learning_rate": 5.63157894736842e-07,
"loss": 0.2286645919084549,
"step": 107,
"token_acc": 0.9346016646848989
},
{
"epoch": 0.02278481012658228,
"grad_norm": 0.94921875,
"learning_rate": 5.68421052631579e-07,
"loss": 0.2228844314813614,
"step": 108,
"token_acc": 0.9335610058987892
},
{
"epoch": 0.0229957805907173,
"grad_norm": 0.7265625,
"learning_rate": 5.736842105263158e-07,
"loss": 0.23979443311691284,
"step": 109,
"token_acc": 0.9332460732984293
},
{
"epoch": 0.023206751054852322,
"grad_norm": 0.7734375,
"learning_rate": 5.789473684210526e-07,
"loss": 0.2925586402416229,
"step": 110,
"token_acc": 0.9247163973874184
},
{
"epoch": 0.02341772151898734,
"grad_norm": 0.89453125,
"learning_rate": 5.842105263157895e-07,
"loss": 0.26546645164489746,
"step": 111,
"token_acc": 0.9265167364016736
},
{
"epoch": 0.02362869198312236,
"grad_norm": 0.71875,
"learning_rate": 5.894736842105262e-07,
"loss": 0.23430243134498596,
"step": 112,
"token_acc": 0.9325294286534597
},
{
"epoch": 0.023839662447257385,
"grad_norm": 0.6875,
"learning_rate": 5.947368421052631e-07,
"loss": 0.2555156946182251,
"step": 113,
"token_acc": 0.9277472527472528
},
{
"epoch": 0.024050632911392405,
"grad_norm": 0.625,
"learning_rate": 6e-07,
"loss": 0.27733132243156433,
"step": 114,
"token_acc": 0.9281454979129398
},
{
"epoch": 0.024261603375527425,
"grad_norm": 0.6328125,
"learning_rate": 6.052631578947368e-07,
"loss": 0.2438090741634369,
"step": 115,
"token_acc": 0.9304691916336914
},
{
"epoch": 0.024472573839662448,
"grad_norm": 0.70703125,
"learning_rate": 6.105263157894736e-07,
"loss": 0.2810766100883484,
"step": 116,
"token_acc": 0.9260921603830042
},
{
"epoch": 0.024683544303797468,
"grad_norm": 0.6171875,
"learning_rate": 6.157894736842105e-07,
"loss": 0.21388083696365356,
"step": 117,
"token_acc": 0.9422394320748628
},
{
"epoch": 0.024894514767932488,
"grad_norm": 0.8828125,
"learning_rate": 6.210526315789474e-07,
"loss": 0.304085373878479,
"step": 118,
"token_acc": 0.9148753224419605
},
{
"epoch": 0.02510548523206751,
"grad_norm": 0.8125,
"learning_rate": 6.263157894736842e-07,
"loss": 0.24785375595092773,
"step": 119,
"token_acc": 0.9345686160972785
},
{
"epoch": 0.02531645569620253,
"grad_norm": 0.6015625,
"learning_rate": 6.31578947368421e-07,
"loss": 0.23662757873535156,
"step": 120,
"token_acc": 0.9339464882943144
},
{
"epoch": 0.02552742616033755,
"grad_norm": 0.7890625,
"learning_rate": 6.368421052631578e-07,
"loss": 0.30741703510284424,
"step": 121,
"token_acc": 0.9190517490604221
},
{
"epoch": 0.025738396624472575,
"grad_norm": 0.58984375,
"learning_rate": 6.421052631578947e-07,
"loss": 0.2324860692024231,
"step": 122,
"token_acc": 0.9350512753089666
},
{
"epoch": 0.025949367088607594,
"grad_norm": 0.71875,
"learning_rate": 6.473684210526316e-07,
"loss": 0.22093364596366882,
"step": 123,
"token_acc": 0.9382284382284383
},
{
"epoch": 0.026160337552742614,
"grad_norm": 0.58203125,
"learning_rate": 6.526315789473684e-07,
"loss": 0.26719486713409424,
"step": 124,
"token_acc": 0.9285504263451926
},
{
"epoch": 0.026371308016877638,
"grad_norm": 0.69921875,
"learning_rate": 6.578947368421053e-07,
"loss": 0.26403629779815674,
"step": 125,
"token_acc": 0.9270715096481271
},
{
"epoch": 0.026582278481012658,
"grad_norm": 1.046875,
"learning_rate": 6.63157894736842e-07,
"loss": 0.24764738976955414,
"step": 126,
"token_acc": 0.9328358208955224
},
{
"epoch": 0.02679324894514768,
"grad_norm": 0.80078125,
"learning_rate": 6.684210526315788e-07,
"loss": 0.2644532322883606,
"step": 127,
"token_acc": 0.9267714201008005
},
{
"epoch": 0.0270042194092827,
"grad_norm": 0.80859375,
"learning_rate": 6.736842105263158e-07,
"loss": 0.2985777258872986,
"step": 128,
"token_acc": 0.9214629997164729
},
{
"epoch": 0.02721518987341772,
"grad_norm": 0.62109375,
"learning_rate": 6.789473684210526e-07,
"loss": 0.23605869710445404,
"step": 129,
"token_acc": 0.9335576114381834
},
{
"epoch": 0.027426160337552744,
"grad_norm": 0.72265625,
"learning_rate": 6.842105263157895e-07,
"loss": 0.254613995552063,
"step": 130,
"token_acc": 0.9289265867212635
},
{
"epoch": 0.027637130801687764,
"grad_norm": 0.91015625,
"learning_rate": 6.894736842105263e-07,
"loss": 0.32649004459381104,
"step": 131,
"token_acc": 0.9107457428068115
},
{
"epoch": 0.027848101265822784,
"grad_norm": 0.66796875,
"learning_rate": 6.947368421052631e-07,
"loss": 0.2345716655254364,
"step": 132,
"token_acc": 0.9308671922377199
},
{
"epoch": 0.028059071729957807,
"grad_norm": 0.796875,
"learning_rate": 7e-07,
"loss": 0.2658767104148865,
"step": 133,
"token_acc": 0.9226377390807879
},
{
"epoch": 0.028270042194092827,
"grad_norm": 0.70703125,
"learning_rate": 7.052631578947368e-07,
"loss": 0.2791082561016083,
"step": 134,
"token_acc": 0.9298836497244336
},
{
"epoch": 0.028481012658227847,
"grad_norm": 0.75390625,
"learning_rate": 7.105263157894736e-07,
"loss": 0.28342780470848083,
"step": 135,
"token_acc": 0.9173340961098398
},
{
"epoch": 0.02869198312236287,
"grad_norm": 0.53125,
"learning_rate": 7.157894736842105e-07,
"loss": 0.19060048460960388,
"step": 136,
"token_acc": 0.9426000620539869
},
{
"epoch": 0.02890295358649789,
"grad_norm": 0.76171875,
"learning_rate": 7.210526315789473e-07,
"loss": 0.24044275283813477,
"step": 137,
"token_acc": 0.9315665883931566
},
{
"epoch": 0.02911392405063291,
"grad_norm": 0.69921875,
"learning_rate": 7.263157894736843e-07,
"loss": 0.22750994563102722,
"step": 138,
"token_acc": 0.9367798193709125
},
{
"epoch": 0.029324894514767934,
"grad_norm": 0.65234375,
"learning_rate": 7.315789473684211e-07,
"loss": 0.2409280240535736,
"step": 139,
"token_acc": 0.9357879234167894
},
{
"epoch": 0.029535864978902954,
"grad_norm": 0.74609375,
"learning_rate": 7.368421052631578e-07,
"loss": 0.24030432105064392,
"step": 140,
"token_acc": 0.928141912206855
},
{
"epoch": 0.029746835443037974,
"grad_norm": 0.703125,
"learning_rate": 7.421052631578947e-07,
"loss": 0.255402147769928,
"step": 141,
"token_acc": 0.9247853124074622
},
{
"epoch": 0.029957805907172997,
"grad_norm": 0.74609375,
"learning_rate": 7.473684210526315e-07,
"loss": 0.21290147304534912,
"step": 142,
"token_acc": 0.9362466327446872
},
{
"epoch": 0.030168776371308017,
"grad_norm": 0.88671875,
"learning_rate": 7.526315789473684e-07,
"loss": 0.2536450922489166,
"step": 143,
"token_acc": 0.9314112291350531
},
{
"epoch": 0.030379746835443037,
"grad_norm": 0.6015625,
"learning_rate": 7.578947368421053e-07,
"loss": 0.20192307233810425,
"step": 144,
"token_acc": 0.9473519272955186
},
{
"epoch": 0.03059071729957806,
"grad_norm": 0.7734375,
"learning_rate": 7.631578947368421e-07,
"loss": 0.27158498764038086,
"step": 145,
"token_acc": 0.9229805886036319
},
{
"epoch": 0.03080168776371308,
"grad_norm": 0.93359375,
"learning_rate": 7.684210526315788e-07,
"loss": 0.2483355551958084,
"step": 146,
"token_acc": 0.9249113760876571
},
{
"epoch": 0.0310126582278481,
"grad_norm": 0.546875,
"learning_rate": 7.736842105263157e-07,
"loss": 0.2523292303085327,
"step": 147,
"token_acc": 0.9309432853364679
},
{
"epoch": 0.031223628691983123,
"grad_norm": 0.9375,
"learning_rate": 7.789473684210526e-07,
"loss": 0.28833281993865967,
"step": 148,
"token_acc": 0.9170937594211637
},
{
"epoch": 0.03143459915611815,
"grad_norm": 0.7421875,
"learning_rate": 7.842105263157895e-07,
"loss": 0.250460147857666,
"step": 149,
"token_acc": 0.9285714285714286
},
{
"epoch": 0.03164556962025317,
"grad_norm": 0.9453125,
"learning_rate": 7.894736842105263e-07,
"loss": 0.2840534746646881,
"step": 150,
"token_acc": 0.9216349108789182
},
{
"epoch": 0.03185654008438819,
"grad_norm": 0.74609375,
"learning_rate": 7.947368421052631e-07,
"loss": 0.2967279851436615,
"step": 151,
"token_acc": 0.9164603960396039
},
{
"epoch": 0.032067510548523206,
"grad_norm": 0.83203125,
"learning_rate": 8e-07,
"loss": 0.25097131729125977,
"step": 152,
"token_acc": 0.929299572509043
},
{
"epoch": 0.032278481012658226,
"grad_norm": 0.6953125,
"learning_rate": 8.052631578947368e-07,
"loss": 0.23201878368854523,
"step": 153,
"token_acc": 0.9383025367992484
},
{
"epoch": 0.032489451476793246,
"grad_norm": 0.65625,
"learning_rate": 8.105263157894736e-07,
"loss": 0.2245524525642395,
"step": 154,
"token_acc": 0.9406554472984943
},
{
"epoch": 0.03270042194092827,
"grad_norm": 0.75,
"learning_rate": 8.157894736842105e-07,
"loss": 0.22958879172801971,
"step": 155,
"token_acc": 0.9288455860643637
},
{
"epoch": 0.03291139240506329,
"grad_norm": 0.70703125,
"learning_rate": 8.210526315789473e-07,
"loss": 0.2539287805557251,
"step": 156,
"token_acc": 0.9255386565272496
},
{
"epoch": 0.03312236286919831,
"grad_norm": 0.6875,
"learning_rate": 8.263157894736841e-07,
"loss": 0.245978444814682,
"step": 157,
"token_acc": 0.9341576506955178
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.66796875,
"learning_rate": 8.315789473684211e-07,
"loss": 0.26281115412712097,
"step": 158,
"token_acc": 0.927784222737819
},
{
"epoch": 0.03354430379746835,
"grad_norm": 1.0,
"learning_rate": 8.368421052631579e-07,
"loss": 0.229181706905365,
"step": 159,
"token_acc": 0.9329073482428115
},
{
"epoch": 0.03375527426160337,
"grad_norm": 0.73046875,
"learning_rate": 8.421052631578947e-07,
"loss": 0.2598130702972412,
"step": 160,
"token_acc": 0.9302388707926167
},
{
"epoch": 0.0339662447257384,
"grad_norm": 0.75,
"learning_rate": 8.473684210526315e-07,
"loss": 0.25113117694854736,
"step": 161,
"token_acc": 0.9281177829099307
},
{
"epoch": 0.03417721518987342,
"grad_norm": 0.6015625,
"learning_rate": 8.526315789473683e-07,
"loss": 0.20510195195674896,
"step": 162,
"token_acc": 0.936978417266187
},
{
"epoch": 0.03438818565400844,
"grad_norm": 0.75,
"learning_rate": 8.578947368421053e-07,
"loss": 0.25259485840797424,
"step": 163,
"token_acc": 0.9289311695579183
},
{
"epoch": 0.03459915611814346,
"grad_norm": 0.6484375,
"learning_rate": 8.631578947368421e-07,
"loss": 0.2823118567466736,
"step": 164,
"token_acc": 0.9225122349102773
},
{
"epoch": 0.03481012658227848,
"grad_norm": 0.84375,
"learning_rate": 8.684210526315789e-07,
"loss": 0.2576562762260437,
"step": 165,
"token_acc": 0.931045050566963
},
{
"epoch": 0.0350210970464135,
"grad_norm": 0.6796875,
"learning_rate": 8.736842105263158e-07,
"loss": 0.2349683940410614,
"step": 166,
"token_acc": 0.9357770372614359
},
{
"epoch": 0.035232067510548526,
"grad_norm": 0.89453125,
"learning_rate": 8.789473684210525e-07,
"loss": 0.19951120018959045,
"step": 167,
"token_acc": 0.9381918819188192
},
{
"epoch": 0.035443037974683546,
"grad_norm": 0.58984375,
"learning_rate": 8.842105263157895e-07,
"loss": 0.24796079099178314,
"step": 168,
"token_acc": 0.9311111111111111
},
{
"epoch": 0.035654008438818566,
"grad_norm": 0.73828125,
"learning_rate": 8.894736842105263e-07,
"loss": 0.2532733082771301,
"step": 169,
"token_acc": 0.9297912713472486
},
{
"epoch": 0.035864978902953586,
"grad_norm": 0.98046875,
"learning_rate": 8.947368421052631e-07,
"loss": 0.25062763690948486,
"step": 170,
"token_acc": 0.9289099526066351
},
{
"epoch": 0.036075949367088606,
"grad_norm": 0.6484375,
"learning_rate": 9e-07,
"loss": 0.2228512465953827,
"step": 171,
"token_acc": 0.9302030456852792
},
{
"epoch": 0.036286919831223625,
"grad_norm": 0.55859375,
"learning_rate": 9.052631578947368e-07,
"loss": 0.20684444904327393,
"step": 172,
"token_acc": 0.9388583019414662
},
{
"epoch": 0.03649789029535865,
"grad_norm": 0.68359375,
"learning_rate": 9.105263157894737e-07,
"loss": 0.2119678407907486,
"step": 173,
"token_acc": 0.938135593220339
},
{
"epoch": 0.03670886075949367,
"grad_norm": 0.828125,
"learning_rate": 9.157894736842105e-07,
"loss": 0.25168293714523315,
"step": 174,
"token_acc": 0.9275818639798489
},
{
"epoch": 0.03691983122362869,
"grad_norm": 2.046875,
"learning_rate": 9.210526315789473e-07,
"loss": 0.2518053650856018,
"step": 175,
"token_acc": 0.9275232105420784
},
{
"epoch": 0.03713080168776371,
"grad_norm": 0.6171875,
"learning_rate": 9.263157894736841e-07,
"loss": 0.25257354974746704,
"step": 176,
"token_acc": 0.9272777932571747
},
{
"epoch": 0.03734177215189873,
"grad_norm": 1.0390625,
"learning_rate": 9.31578947368421e-07,
"loss": 0.24908028542995453,
"step": 177,
"token_acc": 0.9316982303632412
},
{
"epoch": 0.03755274261603375,
"grad_norm": 0.796875,
"learning_rate": 9.368421052631579e-07,
"loss": 0.2594815194606781,
"step": 178,
"token_acc": 0.9222846441947565
},
{
"epoch": 0.03776371308016878,
"grad_norm": 1.734375,
"learning_rate": 9.421052631578948e-07,
"loss": 0.301219642162323,
"step": 179,
"token_acc": 0.9261637239165329
},
{
"epoch": 0.0379746835443038,
"grad_norm": 0.6171875,
"learning_rate": 9.473684210526315e-07,
"loss": 0.2224687933921814,
"step": 180,
"token_acc": 0.9298196166854565
},
{
"epoch": 0.03818565400843882,
"grad_norm": 0.76953125,
"learning_rate": 9.526315789473683e-07,
"loss": 0.2755109667778015,
"step": 181,
"token_acc": 0.9245460237946149
},
{
"epoch": 0.03839662447257384,
"grad_norm": 0.6796875,
"learning_rate": 9.578947368421053e-07,
"loss": 0.24350810050964355,
"step": 182,
"token_acc": 0.9347500748278958
},
{
"epoch": 0.03860759493670886,
"grad_norm": 0.70703125,
"learning_rate": 9.63157894736842e-07,
"loss": 0.26835542917251587,
"step": 183,
"token_acc": 0.9246247205365697
},
{
"epoch": 0.038818565400843885,
"grad_norm": 0.703125,
"learning_rate": 9.68421052631579e-07,
"loss": 0.25252771377563477,
"step": 184,
"token_acc": 0.9330130016958734
},
{
"epoch": 0.039029535864978905,
"grad_norm": 1.21875,
"learning_rate": 9.736842105263158e-07,
"loss": 0.27294090390205383,
"step": 185,
"token_acc": 0.9220917822838848
},
{
"epoch": 0.039240506329113925,
"grad_norm": 0.9140625,
"learning_rate": 9.789473684210526e-07,
"loss": 0.257973313331604,
"step": 186,
"token_acc": 0.9298401420959147
},
{
"epoch": 0.039451476793248945,
"grad_norm": 0.71875,
"learning_rate": 9.842105263157894e-07,
"loss": 0.20286661386489868,
"step": 187,
"token_acc": 0.9379893517068587
},
{
"epoch": 0.039662447257383965,
"grad_norm": 1.0859375,
"learning_rate": 9.894736842105263e-07,
"loss": 0.30547526478767395,
"step": 188,
"token_acc": 0.9224360815857512
},
{
"epoch": 0.039873417721518985,
"grad_norm": 0.6796875,
"learning_rate": 9.947368421052631e-07,
"loss": 0.2581551671028137,
"step": 189,
"token_acc": 0.9263346257083209
},
{
"epoch": 0.04008438818565401,
"grad_norm": 0.99609375,
"learning_rate": 1e-06,
"loss": 0.267391562461853,
"step": 190,
"token_acc": 0.9272947591638897
},
{
"epoch": 0.04029535864978903,
"grad_norm": 0.7265625,
"learning_rate": 9.99999971410384e-07,
"loss": 0.22360718250274658,
"step": 191,
"token_acc": 0.9367160775370581
},
{
"epoch": 0.04050632911392405,
"grad_norm": 0.71875,
"learning_rate": 9.999998856415392e-07,
"loss": 0.2589290738105774,
"step": 192,
"token_acc": 0.9256432004523607
},
{
"epoch": 0.04071729957805907,
"grad_norm": 0.65234375,
"learning_rate": 9.999997426934757e-07,
"loss": 0.2469128668308258,
"step": 193,
"token_acc": 0.9290098745663197
},
{
"epoch": 0.04092827004219409,
"grad_norm": 1.203125,
"learning_rate": 9.999995425662095e-07,
"loss": 0.2602100968360901,
"step": 194,
"token_acc": 0.9284731774415406
},
{
"epoch": 0.04113924050632911,
"grad_norm": 0.61328125,
"learning_rate": 9.999992852597638e-07,
"loss": 0.2579442858695984,
"step": 195,
"token_acc": 0.9280116110304789
},
{
"epoch": 0.04135021097046414,
"grad_norm": 0.75,
"learning_rate": 9.999989707741678e-07,
"loss": 0.266757071018219,
"step": 196,
"token_acc": 0.923786841321822
},
{
"epoch": 0.04156118143459916,
"grad_norm": 0.59765625,
"learning_rate": 9.999985991094577e-07,
"loss": 0.22912907600402832,
"step": 197,
"token_acc": 0.9336933693369337
},
{
"epoch": 0.04177215189873418,
"grad_norm": 0.73046875,
"learning_rate": 9.999981702656756e-07,
"loss": 0.23296543955802917,
"step": 198,
"token_acc": 0.931110498759989
},
{
"epoch": 0.0419831223628692,
"grad_norm": 0.75390625,
"learning_rate": 9.999976842428708e-07,
"loss": 0.27944594621658325,
"step": 199,
"token_acc": 0.9281785829828535
},
{
"epoch": 0.04219409282700422,
"grad_norm": 0.6328125,
"learning_rate": 9.99997141041099e-07,
"loss": 0.2449186146259308,
"step": 200,
"token_acc": 0.9309120699071546
},
{
"epoch": 0.04219409282700422,
"eval_loss": 0.43372446298599243,
"eval_runtime": 245.8313,
"eval_samples_per_second": 137.106,
"eval_steps_per_second": 2.144,
"eval_token_acc": 0.8990801399982051,
"step": 200
},
{
"epoch": 0.04240506329113924,
"grad_norm": 0.765625,
"learning_rate": 9.99996540660422e-07,
"loss": 0.2534567713737488,
"step": 201,
"token_acc": 0.9282550930026572
},
{
"epoch": 0.042616033755274264,
"grad_norm": 1.0,
"learning_rate": 9.999958831009087e-07,
"loss": 0.2861325144767761,
"step": 202,
"token_acc": 0.9250295159386068
},
{
"epoch": 0.042827004219409284,
"grad_norm": 0.671875,
"learning_rate": 9.999951683626345e-07,
"loss": 0.24760206043720245,
"step": 203,
"token_acc": 0.9320360151031077
},
{
"epoch": 0.043037974683544304,
"grad_norm": 0.62109375,
"learning_rate": 9.999943964456805e-07,
"loss": 0.2488883137702942,
"step": 204,
"token_acc": 0.9300783604581073
},
{
"epoch": 0.043248945147679324,
"grad_norm": 0.80078125,
"learning_rate": 9.999935673501355e-07,
"loss": 0.257844477891922,
"step": 205,
"token_acc": 0.9278959810874704
},
{
"epoch": 0.043459915611814344,
"grad_norm": 0.71875,
"learning_rate": 9.99992681076094e-07,
"loss": 0.21238219738006592,
"step": 206,
"token_acc": 0.937776467118844
},
{
"epoch": 0.043670886075949364,
"grad_norm": 0.67578125,
"learning_rate": 9.999917376236578e-07,
"loss": 0.21476256847381592,
"step": 207,
"token_acc": 0.9357326478149101
},
{
"epoch": 0.04388185654008439,
"grad_norm": 0.97265625,
"learning_rate": 9.999907369929344e-07,
"loss": 0.24194155633449554,
"step": 208,
"token_acc": 0.9311714096624751
},
{
"epoch": 0.04409282700421941,
"grad_norm": 0.765625,
"learning_rate": 9.999896791840383e-07,
"loss": 0.2757856249809265,
"step": 209,
"token_acc": 0.9243792325056434
},
{
"epoch": 0.04430379746835443,
"grad_norm": 0.66015625,
"learning_rate": 9.999885641970906e-07,
"loss": 0.2318935990333557,
"step": 210,
"token_acc": 0.9357142857142857
},
{
"epoch": 0.04451476793248945,
"grad_norm": 0.64453125,
"learning_rate": 9.999873920322186e-07,
"loss": 0.2802179157733917,
"step": 211,
"token_acc": 0.9227716727716728
},
{
"epoch": 0.04472573839662447,
"grad_norm": 0.6796875,
"learning_rate": 9.999861626895565e-07,
"loss": 0.2558714747428894,
"step": 212,
"token_acc": 0.9255022321428571
},
{
"epoch": 0.04493670886075949,
"grad_norm": 0.890625,
"learning_rate": 9.99984876169245e-07,
"loss": 0.29882046580314636,
"step": 213,
"token_acc": 0.9213449414590213
},
{
"epoch": 0.04514767932489452,
"grad_norm": 0.76171875,
"learning_rate": 9.999835324714307e-07,
"loss": 0.24097202718257904,
"step": 214,
"token_acc": 0.9359388774610637
},
{
"epoch": 0.04535864978902954,
"grad_norm": 1.703125,
"learning_rate": 9.99982131596268e-07,
"loss": 0.28899186849594116,
"step": 215,
"token_acc": 0.9218701937865272
},
{
"epoch": 0.04556962025316456,
"grad_norm": 0.68359375,
"learning_rate": 9.999806735439165e-07,
"loss": 0.2710872292518616,
"step": 216,
"token_acc": 0.9248780487804878
},
{
"epoch": 0.04578059071729958,
"grad_norm": 0.6484375,
"learning_rate": 9.999791583145433e-07,
"loss": 0.2295331209897995,
"step": 217,
"token_acc": 0.9357304643261608
},
{
"epoch": 0.0459915611814346,
"grad_norm": 0.9921875,
"learning_rate": 9.999775859083216e-07,
"loss": 0.2171935886144638,
"step": 218,
"token_acc": 0.9356940509915014
},
{
"epoch": 0.046202531645569624,
"grad_norm": 0.94921875,
"learning_rate": 9.99975956325431e-07,
"loss": 0.2726757526397705,
"step": 219,
"token_acc": 0.9209691375829248
},
{
"epoch": 0.046413502109704644,
"grad_norm": 0.703125,
"learning_rate": 9.99974269566058e-07,
"loss": 0.27028149366378784,
"step": 220,
"token_acc": 0.9284016636957814
},
{
"epoch": 0.04662447257383966,
"grad_norm": 0.56640625,
"learning_rate": 9.999725256303957e-07,
"loss": 0.20975014567375183,
"step": 221,
"token_acc": 0.9344503233392122
},
{
"epoch": 0.04683544303797468,
"grad_norm": 0.80078125,
"learning_rate": 9.999707245186434e-07,
"loss": 0.3065168261528015,
"step": 222,
"token_acc": 0.9186879823594267
},
{
"epoch": 0.0470464135021097,
"grad_norm": 0.66796875,
"learning_rate": 9.999688662310072e-07,
"loss": 0.20764990150928497,
"step": 223,
"token_acc": 0.9452255418863503
},
{
"epoch": 0.04725738396624472,
"grad_norm": 0.89453125,
"learning_rate": 9.99966950767699e-07,
"loss": 0.2654411196708679,
"step": 224,
"token_acc": 0.9302244039270687
},
{
"epoch": 0.04746835443037975,
"grad_norm": 0.6953125,
"learning_rate": 9.999649781289385e-07,
"loss": 0.2514041066169739,
"step": 225,
"token_acc": 0.933082271147161
},
{
"epoch": 0.04767932489451477,
"grad_norm": 0.6328125,
"learning_rate": 9.99962948314951e-07,
"loss": 0.21037127077579498,
"step": 226,
"token_acc": 0.9351134846461949
},
{
"epoch": 0.04789029535864979,
"grad_norm": 0.59765625,
"learning_rate": 9.99960861325969e-07,
"loss": 0.21236909925937653,
"step": 227,
"token_acc": 0.940097449125824
},
{
"epoch": 0.04810126582278481,
"grad_norm": 0.6640625,
"learning_rate": 9.999587171622305e-07,
"loss": 0.21992863714694977,
"step": 228,
"token_acc": 0.9344711978055471
},
{
"epoch": 0.04831223628691983,
"grad_norm": 0.95703125,
"learning_rate": 9.999565158239812e-07,
"loss": 0.26401764154434204,
"step": 229,
"token_acc": 0.9244654262704805
},
{
"epoch": 0.04852320675105485,
"grad_norm": 0.73828125,
"learning_rate": 9.999542573114728e-07,
"loss": 0.24087585508823395,
"step": 230,
"token_acc": 0.926786751888437
},
{
"epoch": 0.048734177215189876,
"grad_norm": 0.703125,
"learning_rate": 9.999519416249634e-07,
"loss": 0.2533552646636963,
"step": 231,
"token_acc": 0.9275784028451342
},
{
"epoch": 0.048945147679324896,
"grad_norm": 1.4296875,
"learning_rate": 9.999495687647178e-07,
"loss": 0.2529897689819336,
"step": 232,
"token_acc": 0.9269195189639223
},
{
"epoch": 0.049156118143459916,
"grad_norm": 0.81640625,
"learning_rate": 9.999471387310077e-07,
"loss": 0.2788076400756836,
"step": 233,
"token_acc": 0.9202168861347793
},
{
"epoch": 0.049367088607594936,
"grad_norm": 0.68359375,
"learning_rate": 9.999446515241108e-07,
"loss": 0.2300492525100708,
"step": 234,
"token_acc": 0.9325668116842759
},
{
"epoch": 0.049578059071729956,
"grad_norm": 0.85546875,
"learning_rate": 9.999421071443115e-07,
"loss": 0.2711006700992584,
"step": 235,
"token_acc": 0.9220738900962434
},
{
"epoch": 0.049789029535864976,
"grad_norm": 0.66796875,
"learning_rate": 9.999395055919007e-07,
"loss": 0.24382656812667847,
"step": 236,
"token_acc": 0.9297777777777778
},
{
"epoch": 0.05,
"grad_norm": 0.76953125,
"learning_rate": 9.999368468671758e-07,
"loss": 0.2818126380443573,
"step": 237,
"token_acc": 0.9211531781868705
},
{
"epoch": 0.05021097046413502,
"grad_norm": 0.79296875,
"learning_rate": 9.999341309704413e-07,
"loss": 0.29420921206474304,
"step": 238,
"token_acc": 0.9187301587301587
},
{
"epoch": 0.05042194092827004,
"grad_norm": 0.6796875,
"learning_rate": 9.999313579020074e-07,
"loss": 0.24233081936836243,
"step": 239,
"token_acc": 0.9322516367776829
},
{
"epoch": 0.05063291139240506,
"grad_norm": 0.93359375,
"learning_rate": 9.999285276621913e-07,
"loss": 0.22199922800064087,
"step": 240,
"token_acc": 0.9347892956013534
},
{
"epoch": 0.05084388185654008,
"grad_norm": 0.6796875,
"learning_rate": 9.999256402513168e-07,
"loss": 0.2756049931049347,
"step": 241,
"token_acc": 0.9229754682141915
},
{
"epoch": 0.0510548523206751,
"grad_norm": 1.7421875,
"learning_rate": 9.999226956697138e-07,
"loss": 0.2459474354982376,
"step": 242,
"token_acc": 0.9287122207621551
},
{
"epoch": 0.05126582278481013,
"grad_norm": 0.7265625,
"learning_rate": 9.999196939177195e-07,
"loss": 0.26543667912483215,
"step": 243,
"token_acc": 0.9251445086705202
},
{
"epoch": 0.05147679324894515,
"grad_norm": 0.69921875,
"learning_rate": 9.999166349956768e-07,
"loss": 0.29306668043136597,
"step": 244,
"token_acc": 0.922089552238806
},
{
"epoch": 0.05168776371308017,
"grad_norm": 0.78515625,
"learning_rate": 9.999135189039356e-07,
"loss": 0.232993021607399,
"step": 245,
"token_acc": 0.933374460209747
},
{
"epoch": 0.05189873417721519,
"grad_norm": 0.703125,
"learning_rate": 9.999103456428522e-07,
"loss": 0.29452502727508545,
"step": 246,
"token_acc": 0.9255251432208784
},
{
"epoch": 0.05210970464135021,
"grad_norm": 0.75,
"learning_rate": 9.999071152127897e-07,
"loss": 0.2289431095123291,
"step": 247,
"token_acc": 0.9372047791053071
},
{
"epoch": 0.05232067510548523,
"grad_norm": 0.8046875,
"learning_rate": 9.999038276141175e-07,
"loss": 0.3194141983985901,
"step": 248,
"token_acc": 0.914375
},
{
"epoch": 0.052531645569620256,
"grad_norm": 0.68359375,
"learning_rate": 9.999004828472112e-07,
"loss": 0.24136003851890564,
"step": 249,
"token_acc": 0.9315025252525253
},
{
"epoch": 0.052742616033755275,
"grad_norm": 0.74609375,
"learning_rate": 9.998970809124537e-07,
"loss": 0.31663718819618225,
"step": 250,
"token_acc": 0.9186367823150138
},
{
"epoch": 0.052953586497890295,
"grad_norm": 0.765625,
"learning_rate": 9.998936218102338e-07,
"loss": 0.2638603448867798,
"step": 251,
"token_acc": 0.9242610837438424
},
{
"epoch": 0.053164556962025315,
"grad_norm": 0.62109375,
"learning_rate": 9.998901055409474e-07,
"loss": 0.26234734058380127,
"step": 252,
"token_acc": 0.9283480238839921
},
{
"epoch": 0.053375527426160335,
"grad_norm": 0.78125,
"learning_rate": 9.99886532104996e-07,
"loss": 0.27683377265930176,
"step": 253,
"token_acc": 0.9228208232445521
},
{
"epoch": 0.05358649789029536,
"grad_norm": 0.65234375,
"learning_rate": 9.99882901502789e-07,
"loss": 0.20958667993545532,
"step": 254,
"token_acc": 0.9367622259696459
},
{
"epoch": 0.05379746835443038,
"grad_norm": 0.765625,
"learning_rate": 9.998792137347412e-07,
"loss": 0.2769642174243927,
"step": 255,
"token_acc": 0.9259877573734001
},
{
"epoch": 0.0540084388185654,
"grad_norm": 0.7890625,
"learning_rate": 9.998754688012744e-07,
"loss": 0.291420578956604,
"step": 256,
"token_acc": 0.9195469067673541
},
{
"epoch": 0.05421940928270042,
"grad_norm": 0.7109375,
"learning_rate": 9.998716667028166e-07,
"loss": 0.2671175003051758,
"step": 257,
"token_acc": 0.9248520710059172
},
{
"epoch": 0.05443037974683544,
"grad_norm": 0.640625,
"learning_rate": 9.99867807439803e-07,
"loss": 0.2104148268699646,
"step": 258,
"token_acc": 0.9412735070933685
},
{
"epoch": 0.05464135021097046,
"grad_norm": 0.8125,
"learning_rate": 9.99863891012675e-07,
"loss": 0.25562331080436707,
"step": 259,
"token_acc": 0.9300422386483632
},
{
"epoch": 0.05485232067510549,
"grad_norm": 0.93359375,
"learning_rate": 9.998599174218797e-07,
"loss": 0.25945645570755005,
"step": 260,
"token_acc": 0.9278557114228457
},
{
"epoch": 0.05506329113924051,
"grad_norm": 0.76953125,
"learning_rate": 9.998558866678726e-07,
"loss": 0.2575325667858124,
"step": 261,
"token_acc": 0.9237336368810473
},
{
"epoch": 0.05527426160337553,
"grad_norm": 0.79296875,
"learning_rate": 9.998517987511139e-07,
"loss": 0.21312668919563293,
"step": 262,
"token_acc": 0.9391352244560727
},
{
"epoch": 0.05548523206751055,
"grad_norm": 1.5390625,
"learning_rate": 9.998476536720712e-07,
"loss": 0.27397406101226807,
"step": 263,
"token_acc": 0.9302109181141439
},
{
"epoch": 0.05569620253164557,
"grad_norm": 0.7265625,
"learning_rate": 9.998434514312187e-07,
"loss": 0.27095240354537964,
"step": 264,
"token_acc": 0.9266853059956508
},
{
"epoch": 0.05590717299578059,
"grad_norm": 0.76953125,
"learning_rate": 9.99839192029037e-07,
"loss": 0.237601175904274,
"step": 265,
"token_acc": 0.9364719228587635
},
{
"epoch": 0.056118143459915615,
"grad_norm": 0.7421875,
"learning_rate": 9.998348754660129e-07,
"loss": 0.2851409316062927,
"step": 266,
"token_acc": 0.9176470588235294
},
{
"epoch": 0.056329113924050635,
"grad_norm": 0.796875,
"learning_rate": 9.998305017426403e-07,
"loss": 0.26605701446533203,
"step": 267,
"token_acc": 0.9261460101867572
},
{
"epoch": 0.056540084388185655,
"grad_norm": 0.67578125,
"learning_rate": 9.998260708594192e-07,
"loss": 0.26237568259239197,
"step": 268,
"token_acc": 0.9257142857142857
},
{
"epoch": 0.056751054852320675,
"grad_norm": 0.859375,
"learning_rate": 9.998215828168566e-07,
"loss": 0.2315206527709961,
"step": 269,
"token_acc": 0.9332755632582322
},
{
"epoch": 0.056962025316455694,
"grad_norm": 0.70703125,
"learning_rate": 9.998170376154654e-07,
"loss": 0.26748204231262207,
"step": 270,
"token_acc": 0.9308067757680161
},
{
"epoch": 0.057172995780590714,
"grad_norm": 0.9609375,
"learning_rate": 9.998124352557655e-07,
"loss": 0.33397209644317627,
"step": 271,
"token_acc": 0.9161147902869757
},
{
"epoch": 0.05738396624472574,
"grad_norm": 0.71484375,
"learning_rate": 9.998077757382835e-07,
"loss": 0.2637864351272583,
"step": 272,
"token_acc": 0.9291949563530553
},
{
"epoch": 0.05759493670886076,
"grad_norm": 0.734375,
"learning_rate": 9.998030590635517e-07,
"loss": 0.2878430485725403,
"step": 273,
"token_acc": 0.919882100750268
},
{
"epoch": 0.05780590717299578,
"grad_norm": 0.65625,
"learning_rate": 9.997982852321099e-07,
"loss": 0.2438146024942398,
"step": 274,
"token_acc": 0.9312955692652832
},
{
"epoch": 0.0580168776371308,
"grad_norm": 0.74609375,
"learning_rate": 9.99793454244504e-07,
"loss": 0.2523839771747589,
"step": 275,
"token_acc": 0.9254349627174814
},
{
"epoch": 0.05822784810126582,
"grad_norm": 0.609375,
"learning_rate": 9.997885661012865e-07,
"loss": 0.23295487463474274,
"step": 276,
"token_acc": 0.937351934719663
},
{
"epoch": 0.05843881856540084,
"grad_norm": 0.77734375,
"learning_rate": 9.99783620803016e-07,
"loss": 0.287087619304657,
"step": 277,
"token_acc": 0.9255798969072165
},
{
"epoch": 0.05864978902953587,
"grad_norm": 0.65234375,
"learning_rate": 9.997786183502584e-07,
"loss": 0.23424138128757477,
"step": 278,
"token_acc": 0.924812030075188
},
{
"epoch": 0.05886075949367089,
"grad_norm": 0.9296875,
"learning_rate": 9.997735587435858e-07,
"loss": 0.25225332379341125,
"step": 279,
"token_acc": 0.9257325210327821
},
{
"epoch": 0.05907172995780591,
"grad_norm": 0.62890625,
"learning_rate": 9.997684419835767e-07,
"loss": 0.24867427349090576,
"step": 280,
"token_acc": 0.9290465631929047
},
{
"epoch": 0.05928270042194093,
"grad_norm": 1.265625,
"learning_rate": 9.997632680708163e-07,
"loss": 0.2555754482746124,
"step": 281,
"token_acc": 0.9300212056952438
},
{
"epoch": 0.05949367088607595,
"grad_norm": 0.6796875,
"learning_rate": 9.99758037005896e-07,
"loss": 0.25050830841064453,
"step": 282,
"token_acc": 0.9328039095907147
},
{
"epoch": 0.05970464135021097,
"grad_norm": 0.77734375,
"learning_rate": 9.997527487894144e-07,
"loss": 0.264704167842865,
"step": 283,
"token_acc": 0.9269878805793674
},
{
"epoch": 0.059915611814345994,
"grad_norm": 0.765625,
"learning_rate": 9.997474034219762e-07,
"loss": 0.29550492763519287,
"step": 284,
"token_acc": 0.9211438474870017
},
{
"epoch": 0.060126582278481014,
"grad_norm": 0.7109375,
"learning_rate": 9.997420009041927e-07,
"loss": 0.264403373003006,
"step": 285,
"token_acc": 0.9260048721071864
},
{
"epoch": 0.060337552742616034,
"grad_norm": 0.78125,
"learning_rate": 9.997365412366812e-07,
"loss": 0.2595897316932678,
"step": 286,
"token_acc": 0.9286173633440514
},
{
"epoch": 0.060548523206751054,
"grad_norm": 0.70703125,
"learning_rate": 9.997310244200667e-07,
"loss": 0.23976776003837585,
"step": 287,
"token_acc": 0.9318894271872328
},
{
"epoch": 0.060759493670886074,
"grad_norm": 0.9140625,
"learning_rate": 9.997254504549799e-07,
"loss": 0.26183438301086426,
"step": 288,
"token_acc": 0.9322949777495232
},
{
"epoch": 0.0609704641350211,
"grad_norm": 1.0078125,
"learning_rate": 9.99719819342058e-07,
"loss": 0.24600914120674133,
"step": 289,
"token_acc": 0.9311287236949987
},
{
"epoch": 0.06118143459915612,
"grad_norm": 0.83984375,
"learning_rate": 9.997141310819454e-07,
"loss": 0.3296029567718506,
"step": 290,
"token_acc": 0.9126184834123223
},
{
"epoch": 0.06139240506329114,
"grad_norm": 0.85546875,
"learning_rate": 9.997083856752923e-07,
"loss": 0.2794192433357239,
"step": 291,
"token_acc": 0.9190751445086706
},
{
"epoch": 0.06160337552742616,
"grad_norm": 0.6953125,
"learning_rate": 9.997025831227557e-07,
"loss": 0.23178298771381378,
"step": 292,
"token_acc": 0.9380645161290323
},
{
"epoch": 0.06181434599156118,
"grad_norm": 0.87109375,
"learning_rate": 9.996967234249994e-07,
"loss": 0.2989250123500824,
"step": 293,
"token_acc": 0.9201101928374655
},
{
"epoch": 0.0620253164556962,
"grad_norm": 0.625,
"learning_rate": 9.996908065826935e-07,
"loss": 0.20801636576652527,
"step": 294,
"token_acc": 0.9374828626268166
},
{
"epoch": 0.06223628691983123,
"grad_norm": 0.68359375,
"learning_rate": 9.996848325965142e-07,
"loss": 0.2513968050479889,
"step": 295,
"token_acc": 0.9286151960784313
},
{
"epoch": 0.06244725738396625,
"grad_norm": 0.77734375,
"learning_rate": 9.99678801467145e-07,
"loss": 0.23670442402362823,
"step": 296,
"token_acc": 0.9297945205479452
},
{
"epoch": 0.06265822784810127,
"grad_norm": 0.71875,
"learning_rate": 9.99672713195276e-07,
"loss": 0.3005760908126831,
"step": 297,
"token_acc": 0.9181008902077151
},
{
"epoch": 0.0628691983122363,
"grad_norm": 0.64453125,
"learning_rate": 9.996665677816027e-07,
"loss": 0.2198331356048584,
"step": 298,
"token_acc": 0.934411226357535
},
{
"epoch": 0.0630801687763713,
"grad_norm": 0.7421875,
"learning_rate": 9.996603652268283e-07,
"loss": 0.22930385172367096,
"step": 299,
"token_acc": 0.9343891402714932
},
{
"epoch": 0.06329113924050633,
"grad_norm": 0.6953125,
"learning_rate": 9.99654105531662e-07,
"loss": 0.26769182085990906,
"step": 300,
"token_acc": 0.9295731707317073
},
{
"epoch": 0.06350210970464135,
"grad_norm": 0.58203125,
"learning_rate": 9.9964778869682e-07,
"loss": 0.2113886922597885,
"step": 301,
"token_acc": 0.9383966244725739
},
{
"epoch": 0.06371308016877637,
"grad_norm": 0.78515625,
"learning_rate": 9.996414147230242e-07,
"loss": 0.2549387812614441,
"step": 302,
"token_acc": 0.9245056920311564
},
{
"epoch": 0.06392405063291139,
"grad_norm": 0.83203125,
"learning_rate": 9.996349836110035e-07,
"loss": 0.24877741932868958,
"step": 303,
"token_acc": 0.9278890600924499
},
{
"epoch": 0.06413502109704641,
"grad_norm": 0.85546875,
"learning_rate": 9.996284953614938e-07,
"loss": 0.2965357303619385,
"step": 304,
"token_acc": 0.9167351410572446
},
{
"epoch": 0.06434599156118144,
"grad_norm": 0.6484375,
"learning_rate": 9.996219499752365e-07,
"loss": 0.21444806456565857,
"step": 305,
"token_acc": 0.938101788170564
},
{
"epoch": 0.06455696202531645,
"grad_norm": 0.671875,
"learning_rate": 9.996153474529807e-07,
"loss": 0.24650560319423676,
"step": 306,
"token_acc": 0.928284854563691
},
{
"epoch": 0.06476793248945148,
"grad_norm": 0.73046875,
"learning_rate": 9.996086877954812e-07,
"loss": 0.26447594165802,
"step": 307,
"token_acc": 0.9272459499263623
},
{
"epoch": 0.06497890295358649,
"grad_norm": 0.6484375,
"learning_rate": 9.996019710034997e-07,
"loss": 0.22312676906585693,
"step": 308,
"token_acc": 0.9304399524375743
},
{
"epoch": 0.06518987341772152,
"grad_norm": 0.75,
"learning_rate": 9.99595197077804e-07,
"loss": 0.3124806582927704,
"step": 309,
"token_acc": 0.9134506242905789
},
{
"epoch": 0.06540084388185655,
"grad_norm": 0.6640625,
"learning_rate": 9.99588366019169e-07,
"loss": 0.21681739389896393,
"step": 310,
"token_acc": 0.9393859879296772
},
{
"epoch": 0.06561181434599156,
"grad_norm": 0.7734375,
"learning_rate": 9.99581477828376e-07,
"loss": 0.262542188167572,
"step": 311,
"token_acc": 0.9339063426200356
},
{
"epoch": 0.06582278481012659,
"grad_norm": 0.77734375,
"learning_rate": 9.995745325062126e-07,
"loss": 0.24062800407409668,
"step": 312,
"token_acc": 0.9331594391913922
},
{
"epoch": 0.0660337552742616,
"grad_norm": 0.71484375,
"learning_rate": 9.995675300534729e-07,
"loss": 0.26486438512802124,
"step": 313,
"token_acc": 0.9250824093497153
},
{
"epoch": 0.06624472573839663,
"grad_norm": 0.54296875,
"learning_rate": 9.995604704709578e-07,
"loss": 0.18465927243232727,
"step": 314,
"token_acc": 0.9464985994397759
},
{
"epoch": 0.06645569620253164,
"grad_norm": 0.91015625,
"learning_rate": 9.99553353759475e-07,
"loss": 0.2520803213119507,
"step": 315,
"token_acc": 0.9257213014119091
},
{
"epoch": 0.06666666666666667,
"grad_norm": 0.65625,
"learning_rate": 9.995461799198378e-07,
"loss": 0.29753151535987854,
"step": 316,
"token_acc": 0.9256516587677726
},
{
"epoch": 0.06687763713080169,
"grad_norm": 0.71875,
"learning_rate": 9.995389489528667e-07,
"loss": 0.2546613812446594,
"step": 317,
"token_acc": 0.9310240048617442
},
{
"epoch": 0.0670886075949367,
"grad_norm": 1.4140625,
"learning_rate": 9.995316608593886e-07,
"loss": 0.24808946251869202,
"step": 318,
"token_acc": 0.9316290130796671
},
{
"epoch": 0.06729957805907173,
"grad_norm": 0.73046875,
"learning_rate": 9.995243156402374e-07,
"loss": 0.2512444853782654,
"step": 319,
"token_acc": 0.9294703723125328
},
{
"epoch": 0.06751054852320675,
"grad_norm": 0.921875,
"learning_rate": 9.995169132962527e-07,
"loss": 0.2597760260105133,
"step": 320,
"token_acc": 0.9277310924369748
},
{
"epoch": 0.06772151898734177,
"grad_norm": 0.65234375,
"learning_rate": 9.99509453828281e-07,
"loss": 0.2552398443222046,
"step": 321,
"token_acc": 0.9278263321116437
},
{
"epoch": 0.0679324894514768,
"grad_norm": 0.75390625,
"learning_rate": 9.995019372371754e-07,
"loss": 0.29060834646224976,
"step": 322,
"token_acc": 0.9247988807275271
},
{
"epoch": 0.06814345991561181,
"grad_norm": 0.57421875,
"learning_rate": 9.994943635237955e-07,
"loss": 0.21358612179756165,
"step": 323,
"token_acc": 0.9360210341805434
},
{
"epoch": 0.06835443037974684,
"grad_norm": 0.8671875,
"learning_rate": 9.994867326890078e-07,
"loss": 0.2634425759315491,
"step": 324,
"token_acc": 0.9219944937289691
},
{
"epoch": 0.06856540084388185,
"grad_norm": 0.82421875,
"learning_rate": 9.994790447336842e-07,
"loss": 0.3185754120349884,
"step": 325,
"token_acc": 0.9133514986376022
},
{
"epoch": 0.06877637130801688,
"grad_norm": 0.73828125,
"learning_rate": 9.994712996587044e-07,
"loss": 0.29746031761169434,
"step": 326,
"token_acc": 0.9256002705444707
},
{
"epoch": 0.0689873417721519,
"grad_norm": 0.76953125,
"learning_rate": 9.994634974649541e-07,
"loss": 0.29588472843170166,
"step": 327,
"token_acc": 0.9272846380609236
},
{
"epoch": 0.06919831223628692,
"grad_norm": 0.875,
"learning_rate": 9.994556381533252e-07,
"loss": 0.277068167924881,
"step": 328,
"token_acc": 0.922756981580511
},
{
"epoch": 0.06940928270042195,
"grad_norm": 0.73046875,
"learning_rate": 9.994477217247168e-07,
"loss": 0.27129507064819336,
"step": 329,
"token_acc": 0.9270741068792442
},
{
"epoch": 0.06962025316455696,
"grad_norm": 0.8671875,
"learning_rate": 9.994397481800342e-07,
"loss": 0.24473360180854797,
"step": 330,
"token_acc": 0.9313361611876988
},
{
"epoch": 0.06983122362869199,
"grad_norm": 0.69140625,
"learning_rate": 9.994317175201893e-07,
"loss": 0.22818127274513245,
"step": 331,
"token_acc": 0.9380833851897946
},
{
"epoch": 0.070042194092827,
"grad_norm": 0.6953125,
"learning_rate": 9.994236297461003e-07,
"loss": 0.262783944606781,
"step": 332,
"token_acc": 0.9235968263297091
},
{
"epoch": 0.07025316455696203,
"grad_norm": 0.63671875,
"learning_rate": 9.994154848586919e-07,
"loss": 0.24861930310726166,
"step": 333,
"token_acc": 0.930621342992477
},
{
"epoch": 0.07046413502109705,
"grad_norm": 0.62109375,
"learning_rate": 9.99407282858896e-07,
"loss": 0.26165515184402466,
"step": 334,
"token_acc": 0.9247956403269755
},
{
"epoch": 0.07067510548523206,
"grad_norm": 0.64453125,
"learning_rate": 9.993990237476504e-07,
"loss": 0.23681169748306274,
"step": 335,
"token_acc": 0.9285078611687927
},
{
"epoch": 0.07088607594936709,
"grad_norm": 1.65625,
"learning_rate": 9.993907075258994e-07,
"loss": 0.2824210524559021,
"step": 336,
"token_acc": 0.925273390036452
},
{
"epoch": 0.0710970464135021,
"grad_norm": 0.6796875,
"learning_rate": 9.993823341945942e-07,
"loss": 0.2578677535057068,
"step": 337,
"token_acc": 0.9232230059685296
},
{
"epoch": 0.07130801687763713,
"grad_norm": 0.69140625,
"learning_rate": 9.993739037546924e-07,
"loss": 0.25358960032463074,
"step": 338,
"token_acc": 0.9312054539820267
},
{
"epoch": 0.07151898734177216,
"grad_norm": 0.83984375,
"learning_rate": 9.99365416207158e-07,
"loss": 0.2980523705482483,
"step": 339,
"token_acc": 0.9155807365439094
},
{
"epoch": 0.07172995780590717,
"grad_norm": 0.9296875,
"learning_rate": 9.993568715529616e-07,
"loss": 0.29448622465133667,
"step": 340,
"token_acc": 0.9224402207234825
},
{
"epoch": 0.0719409282700422,
"grad_norm": 0.6328125,
"learning_rate": 9.993482697930805e-07,
"loss": 0.2686302065849304,
"step": 341,
"token_acc": 0.9284134881149807
},
{
"epoch": 0.07215189873417721,
"grad_norm": 0.75,
"learning_rate": 9.993396109284985e-07,
"loss": 0.2800794839859009,
"step": 342,
"token_acc": 0.9271758436944938
},
{
"epoch": 0.07236286919831224,
"grad_norm": 0.765625,
"learning_rate": 9.993308949602054e-07,
"loss": 0.2576884329319,
"step": 343,
"token_acc": 0.9227618490345231
},
{
"epoch": 0.07257383966244725,
"grad_norm": 0.65625,
"learning_rate": 9.993221218891982e-07,
"loss": 0.24451857805252075,
"step": 344,
"token_acc": 0.933461117196057
},
{
"epoch": 0.07278481012658228,
"grad_norm": 0.95703125,
"learning_rate": 9.993132917164801e-07,
"loss": 0.2957763075828552,
"step": 345,
"token_acc": 0.9123943661971831
},
{
"epoch": 0.0729957805907173,
"grad_norm": 1.6875,
"learning_rate": 9.99304404443061e-07,
"loss": 0.253467321395874,
"step": 346,
"token_acc": 0.9318112633181126
},
{
"epoch": 0.07320675105485232,
"grad_norm": 0.8046875,
"learning_rate": 9.99295460069957e-07,
"loss": 0.2847754955291748,
"step": 347,
"token_acc": 0.9224688355123137
},
{
"epoch": 0.07341772151898734,
"grad_norm": 0.73046875,
"learning_rate": 9.992864585981913e-07,
"loss": 0.25176408886909485,
"step": 348,
"token_acc": 0.9317745035233824
},
{
"epoch": 0.07362869198312236,
"grad_norm": 0.96484375,
"learning_rate": 9.99277400028793e-07,
"loss": 0.282976359128952,
"step": 349,
"token_acc": 0.9220568335588634
},
{
"epoch": 0.07383966244725738,
"grad_norm": 0.64453125,
"learning_rate": 9.992682843627984e-07,
"loss": 0.2807369530200958,
"step": 350,
"token_acc": 0.9215148188803512
},
{
"epoch": 0.07405063291139241,
"grad_norm": 0.7265625,
"learning_rate": 9.992591116012495e-07,
"loss": 0.2882058322429657,
"step": 351,
"token_acc": 0.9216602528862012
},
{
"epoch": 0.07426160337552742,
"grad_norm": 0.73828125,
"learning_rate": 9.992498817451955e-07,
"loss": 0.27112358808517456,
"step": 352,
"token_acc": 0.9312214611872146
},
{
"epoch": 0.07447257383966245,
"grad_norm": 0.77734375,
"learning_rate": 9.99240594795692e-07,
"loss": 0.25564056634902954,
"step": 353,
"token_acc": 0.9308086560364465
},
{
"epoch": 0.07468354430379746,
"grad_norm": 0.66796875,
"learning_rate": 9.99231250753801e-07,
"loss": 0.21060852706432343,
"step": 354,
"token_acc": 0.9366489046773239
},
{
"epoch": 0.07489451476793249,
"grad_norm": 0.5546875,
"learning_rate": 9.992218496205908e-07,
"loss": 0.23291520774364471,
"step": 355,
"token_acc": 0.9379619852164731
},
{
"epoch": 0.0751054852320675,
"grad_norm": 0.703125,
"learning_rate": 9.99212391397137e-07,
"loss": 0.23014740645885468,
"step": 356,
"token_acc": 0.930849478390462
},
{
"epoch": 0.07531645569620253,
"grad_norm": 0.73046875,
"learning_rate": 9.992028760845207e-07,
"loss": 0.2653324604034424,
"step": 357,
"token_acc": 0.9264833574529667
},
{
"epoch": 0.07552742616033756,
"grad_norm": 0.76171875,
"learning_rate": 9.991933036838303e-07,
"loss": 0.23712849617004395,
"step": 358,
"token_acc": 0.9348139601961349
},
{
"epoch": 0.07573839662447257,
"grad_norm": 0.61328125,
"learning_rate": 9.991836741961605e-07,
"loss": 0.24832651019096375,
"step": 359,
"token_acc": 0.9297736506094022
},
{
"epoch": 0.0759493670886076,
"grad_norm": 2.21875,
"learning_rate": 9.991739876226127e-07,
"loss": 0.30170413851737976,
"step": 360,
"token_acc": 0.9175753688261706
},
{
"epoch": 0.07616033755274261,
"grad_norm": 0.6875,
"learning_rate": 9.991642439642944e-07,
"loss": 0.2096886932849884,
"step": 361,
"token_acc": 0.9416713404374649
},
{
"epoch": 0.07637130801687764,
"grad_norm": 0.609375,
"learning_rate": 9.991544432223198e-07,
"loss": 0.24230161309242249,
"step": 362,
"token_acc": 0.9317358595709111
},
{
"epoch": 0.07658227848101266,
"grad_norm": 0.79296875,
"learning_rate": 9.991445853978098e-07,
"loss": 0.2464846670627594,
"step": 363,
"token_acc": 0.9277708592777086
},
{
"epoch": 0.07679324894514768,
"grad_norm": 0.99609375,
"learning_rate": 9.991346704918918e-07,
"loss": 0.25032496452331543,
"step": 364,
"token_acc": 0.931261207411835
},
{
"epoch": 0.0770042194092827,
"grad_norm": 1.0,
"learning_rate": 9.991246985056995e-07,
"loss": 0.3197912871837616,
"step": 365,
"token_acc": 0.9187062937062938
},
{
"epoch": 0.07721518987341772,
"grad_norm": 0.6953125,
"learning_rate": 9.991146694403733e-07,
"loss": 0.2740510404109955,
"step": 366,
"token_acc": 0.9192671056398511
},
{
"epoch": 0.07742616033755274,
"grad_norm": 0.875,
"learning_rate": 9.991045832970603e-07,
"loss": 0.29503384232521057,
"step": 367,
"token_acc": 0.919302394324564
},
{
"epoch": 0.07763713080168777,
"grad_norm": 0.796875,
"learning_rate": 9.990944400769138e-07,
"loss": 0.27579015493392944,
"step": 368,
"token_acc": 0.9176502882239912
},
{
"epoch": 0.07784810126582278,
"grad_norm": 0.8828125,
"learning_rate": 9.99084239781094e-07,
"loss": 0.2694048583507538,
"step": 369,
"token_acc": 0.925148762918885
},
{
"epoch": 0.07805907172995781,
"grad_norm": 0.7578125,
"learning_rate": 9.990739824107669e-07,
"loss": 0.2885046601295471,
"step": 370,
"token_acc": 0.9219858156028369
},
{
"epoch": 0.07827004219409282,
"grad_norm": 1.109375,
"learning_rate": 9.99063667967106e-07,
"loss": 0.2373015433549881,
"step": 371,
"token_acc": 0.9304477611940298
},
{
"epoch": 0.07848101265822785,
"grad_norm": 0.93359375,
"learning_rate": 9.990532964512901e-07,
"loss": 0.29645416140556335,
"step": 372,
"token_acc": 0.918646080760095
},
{
"epoch": 0.07869198312236286,
"grad_norm": 0.66796875,
"learning_rate": 9.990428678645062e-07,
"loss": 0.24266409873962402,
"step": 373,
"token_acc": 0.9363528715216104
},
{
"epoch": 0.07890295358649789,
"grad_norm": 0.8046875,
"learning_rate": 9.990323822079464e-07,
"loss": 0.2219400256872177,
"step": 374,
"token_acc": 0.9366262814538676
},
{
"epoch": 0.07911392405063292,
"grad_norm": 0.609375,
"learning_rate": 9.9902183948281e-07,
"loss": 0.2171144187450409,
"step": 375,
"token_acc": 0.937206572769953
},
{
"epoch": 0.07932489451476793,
"grad_norm": 0.61328125,
"learning_rate": 9.990112396903027e-07,
"loss": 0.23284628987312317,
"step": 376,
"token_acc": 0.9356833642547928
},
{
"epoch": 0.07953586497890296,
"grad_norm": 0.765625,
"learning_rate": 9.990005828316363e-07,
"loss": 0.26610440015792847,
"step": 377,
"token_acc": 0.9276832460732984
},
{
"epoch": 0.07974683544303797,
"grad_norm": 0.55859375,
"learning_rate": 9.989898689080299e-07,
"loss": 0.19865182042121887,
"step": 378,
"token_acc": 0.946978672985782
},
{
"epoch": 0.079957805907173,
"grad_norm": 0.73046875,
"learning_rate": 9.989790979207085e-07,
"loss": 0.2547116279602051,
"step": 379,
"token_acc": 0.9301753306674869
},
{
"epoch": 0.08016877637130802,
"grad_norm": 0.71484375,
"learning_rate": 9.98968269870904e-07,
"loss": 0.2702917456626892,
"step": 380,
"token_acc": 0.9255110613273593
},
{
"epoch": 0.08037974683544304,
"grad_norm": 0.67578125,
"learning_rate": 9.989573847598545e-07,
"loss": 0.24545586109161377,
"step": 381,
"token_acc": 0.9353233830845771
},
{
"epoch": 0.08059071729957806,
"grad_norm": 0.70703125,
"learning_rate": 9.98946442588805e-07,
"loss": 0.25997835397720337,
"step": 382,
"token_acc": 0.9318181818181818
},
{
"epoch": 0.08080168776371308,
"grad_norm": 0.921875,
"learning_rate": 9.989354433590067e-07,
"loss": 0.2865683436393738,
"step": 383,
"token_acc": 0.9229352164568622
},
{
"epoch": 0.0810126582278481,
"grad_norm": 0.93359375,
"learning_rate": 9.989243870717174e-07,
"loss": 0.25773969292640686,
"step": 384,
"token_acc": 0.9284097340124505
},
{
"epoch": 0.08122362869198312,
"grad_norm": 0.87890625,
"learning_rate": 9.989132737282015e-07,
"loss": 0.2665586471557617,
"step": 385,
"token_acc": 0.92599672310213
},
{
"epoch": 0.08143459915611814,
"grad_norm": 0.7890625,
"learning_rate": 9.989021033297302e-07,
"loss": 0.29251331090927124,
"step": 386,
"token_acc": 0.9255349500713267
},
{
"epoch": 0.08164556962025317,
"grad_norm": 0.90625,
"learning_rate": 9.988908758775807e-07,
"loss": 0.31350889801979065,
"step": 387,
"token_acc": 0.9161179501860864
},
{
"epoch": 0.08185654008438818,
"grad_norm": 0.77734375,
"learning_rate": 9.98879591373037e-07,
"loss": 0.2779198884963989,
"step": 388,
"token_acc": 0.9238483234095894
},
{
"epoch": 0.08206751054852321,
"grad_norm": 0.60546875,
"learning_rate": 9.988682498173895e-07,
"loss": 0.22718225419521332,
"step": 389,
"token_acc": 0.9420247204237787
},
{
"epoch": 0.08227848101265822,
"grad_norm": 0.609375,
"learning_rate": 9.98856851211935e-07,
"loss": 0.22414159774780273,
"step": 390,
"token_acc": 0.9357366771159875
},
{
"epoch": 0.08248945147679325,
"grad_norm": 0.74609375,
"learning_rate": 9.988453955579776e-07,
"loss": 0.2700081467628479,
"step": 391,
"token_acc": 0.9235555555555556
},
{
"epoch": 0.08270042194092828,
"grad_norm": 0.66796875,
"learning_rate": 9.98833882856827e-07,
"loss": 0.24140852689743042,
"step": 392,
"token_acc": 0.9277988101676582
},
{
"epoch": 0.08291139240506329,
"grad_norm": 0.8359375,
"learning_rate": 9.988223131097996e-07,
"loss": 0.28851526975631714,
"step": 393,
"token_acc": 0.9173528514791095
},
{
"epoch": 0.08312236286919832,
"grad_norm": 0.6796875,
"learning_rate": 9.98810686318219e-07,
"loss": 0.2673947215080261,
"step": 394,
"token_acc": 0.9220706930141943
},
{
"epoch": 0.08333333333333333,
"grad_norm": 0.66796875,
"learning_rate": 9.98799002483414e-07,
"loss": 0.27388495206832886,
"step": 395,
"token_acc": 0.9260257562144355
},
{
"epoch": 0.08354430379746836,
"grad_norm": 0.6484375,
"learning_rate": 9.987872616067216e-07,
"loss": 0.2556672692298889,
"step": 396,
"token_acc": 0.9255903349807798
},
{
"epoch": 0.08375527426160338,
"grad_norm": 0.96875,
"learning_rate": 9.987754636894843e-07,
"loss": 0.32614314556121826,
"step": 397,
"token_acc": 0.9155054847316929
},
{
"epoch": 0.0839662447257384,
"grad_norm": 0.60546875,
"learning_rate": 9.987636087330509e-07,
"loss": 0.23008616268634796,
"step": 398,
"token_acc": 0.9367875647668393
},
{
"epoch": 0.08417721518987342,
"grad_norm": 0.73828125,
"learning_rate": 9.987516967387775e-07,
"loss": 0.2754250764846802,
"step": 399,
"token_acc": 0.9202546998180715
},
{
"epoch": 0.08438818565400844,
"grad_norm": 0.671875,
"learning_rate": 9.98739727708026e-07,
"loss": 0.23332414031028748,
"step": 400,
"token_acc": 0.9356028368794326
},
{
"epoch": 0.08438818565400844,
"eval_loss": 0.43364420533180237,
"eval_runtime": 245.8014,
"eval_samples_per_second": 137.123,
"eval_steps_per_second": 2.144,
"eval_token_acc": 0.8990711657542853,
"step": 400
},
{
"epoch": 0.08459915611814346,
"grad_norm": 0.7578125,
"learning_rate": 9.987277016421654e-07,
"loss": 0.2699899673461914,
"step": 401,
"token_acc": 0.9288135593220339
},
{
"epoch": 0.08481012658227848,
"grad_norm": 0.78515625,
"learning_rate": 9.98715618542571e-07,
"loss": 0.25560492277145386,
"step": 402,
"token_acc": 0.9252772913018097
},
{
"epoch": 0.0850210970464135,
"grad_norm": 0.7890625,
"learning_rate": 9.987034784106244e-07,
"loss": 0.3024590015411377,
"step": 403,
"token_acc": 0.9186206896551724
},
{
"epoch": 0.08523206751054853,
"grad_norm": 0.703125,
"learning_rate": 9.98691281247714e-07,
"loss": 0.2880774438381195,
"step": 404,
"token_acc": 0.9188865609099072
},
{
"epoch": 0.08544303797468354,
"grad_norm": 0.859375,
"learning_rate": 9.986790270552347e-07,
"loss": 0.2641194760799408,
"step": 405,
"token_acc": 0.9306480920654149
},
{
"epoch": 0.08565400843881857,
"grad_norm": 0.72265625,
"learning_rate": 9.98666715834588e-07,
"loss": 0.25727787613868713,
"step": 406,
"token_acc": 0.9288548752834467
},
{
"epoch": 0.08586497890295358,
"grad_norm": 0.61328125,
"learning_rate": 9.986543475871818e-07,
"loss": 0.2398534119129181,
"step": 407,
"token_acc": 0.9377962085308057
},
{
"epoch": 0.08607594936708861,
"grad_norm": 0.671875,
"learning_rate": 9.986419223144302e-07,
"loss": 0.25430333614349365,
"step": 408,
"token_acc": 0.9305912596401028
},
{
"epoch": 0.08628691983122364,
"grad_norm": 0.60546875,
"learning_rate": 9.986294400177544e-07,
"loss": 0.20513233542442322,
"step": 409,
"token_acc": 0.9459538416593631
},
{
"epoch": 0.08649789029535865,
"grad_norm": 0.5625,
"learning_rate": 9.986169006985817e-07,
"loss": 0.20912200212478638,
"step": 410,
"token_acc": 0.9390818128310771
},
{
"epoch": 0.08670886075949367,
"grad_norm": 0.67578125,
"learning_rate": 9.986043043583462e-07,
"loss": 0.2466573864221573,
"step": 411,
"token_acc": 0.9298196948682386
},
{
"epoch": 0.08691983122362869,
"grad_norm": 0.7578125,
"learning_rate": 9.98591650998488e-07,
"loss": 0.23664775490760803,
"step": 412,
"token_acc": 0.9329004329004329
},
{
"epoch": 0.08713080168776371,
"grad_norm": 0.640625,
"learning_rate": 9.985789406204547e-07,
"loss": 0.23415768146514893,
"step": 413,
"token_acc": 0.9317912218268091
},
{
"epoch": 0.08734177215189873,
"grad_norm": 0.75,
"learning_rate": 9.985661732256998e-07,
"loss": 0.2954852283000946,
"step": 414,
"token_acc": 0.9218203033838973
},
{
"epoch": 0.08755274261603375,
"grad_norm": 0.703125,
"learning_rate": 9.98553348815683e-07,
"loss": 0.2565650939941406,
"step": 415,
"token_acc": 0.9298298906439855
},
{
"epoch": 0.08776371308016878,
"grad_norm": 0.7578125,
"learning_rate": 9.98540467391871e-07,
"loss": 0.22425369918346405,
"step": 416,
"token_acc": 0.9359098228663446
},
{
"epoch": 0.0879746835443038,
"grad_norm": 0.73046875,
"learning_rate": 9.98527528955737e-07,
"loss": 0.2637927532196045,
"step": 417,
"token_acc": 0.9261443414771132
},
{
"epoch": 0.08818565400843882,
"grad_norm": 0.70703125,
"learning_rate": 9.985145335087605e-07,
"loss": 0.27013063430786133,
"step": 418,
"token_acc": 0.9248719408081958
},
{
"epoch": 0.08839662447257383,
"grad_norm": 0.62890625,
"learning_rate": 9.985014810524278e-07,
"loss": 0.25381606817245483,
"step": 419,
"token_acc": 0.9342265529841657
},
{
"epoch": 0.08860759493670886,
"grad_norm": 1.59375,
"learning_rate": 9.984883715882315e-07,
"loss": 0.2093265801668167,
"step": 420,
"token_acc": 0.936447410231967
},
{
"epoch": 0.08881856540084389,
"grad_norm": 0.65625,
"learning_rate": 9.984752051176707e-07,
"loss": 0.2633010447025299,
"step": 421,
"token_acc": 0.9257308401369502
},
{
"epoch": 0.0890295358649789,
"grad_norm": 0.6796875,
"learning_rate": 9.98461981642251e-07,
"loss": 0.28037169575691223,
"step": 422,
"token_acc": 0.9264617239300783
},
{
"epoch": 0.08924050632911393,
"grad_norm": 0.640625,
"learning_rate": 9.984487011634848e-07,
"loss": 0.23874756693840027,
"step": 423,
"token_acc": 0.9345043167609407
},
{
"epoch": 0.08945147679324894,
"grad_norm": 4.9375,
"learning_rate": 9.984353636828908e-07,
"loss": 0.2935020923614502,
"step": 424,
"token_acc": 0.9228658536585366
},
{
"epoch": 0.08966244725738397,
"grad_norm": 0.62890625,
"learning_rate": 9.984219692019943e-07,
"loss": 0.2578403949737549,
"step": 425,
"token_acc": 0.9282193468884782
},
{
"epoch": 0.08987341772151898,
"grad_norm": 0.73828125,
"learning_rate": 9.98408517722327e-07,
"loss": 0.24426788091659546,
"step": 426,
"token_acc": 0.9371653987038603
},
{
"epoch": 0.09008438818565401,
"grad_norm": 0.80859375,
"learning_rate": 9.983950092454272e-07,
"loss": 0.2677040994167328,
"step": 427,
"token_acc": 0.9215219976218787
},
{
"epoch": 0.09029535864978903,
"grad_norm": 0.73828125,
"learning_rate": 9.983814437728396e-07,
"loss": 0.2604065239429474,
"step": 428,
"token_acc": 0.9289383561643836
},
{
"epoch": 0.09050632911392405,
"grad_norm": 1.3125,
"learning_rate": 9.983678213061157e-07,
"loss": 0.24889585375785828,
"step": 429,
"token_acc": 0.9251644736842105
},
{
"epoch": 0.09071729957805907,
"grad_norm": 0.796875,
"learning_rate": 9.983541418468134e-07,
"loss": 0.2805905342102051,
"step": 430,
"token_acc": 0.9248591108328115
},
{
"epoch": 0.09092827004219409,
"grad_norm": 0.7734375,
"learning_rate": 9.983404053964967e-07,
"loss": 0.2725668251514435,
"step": 431,
"token_acc": 0.9280293116985082
},
{
"epoch": 0.09113924050632911,
"grad_norm": 0.84375,
"learning_rate": 9.98326611956737e-07,
"loss": 0.3275222182273865,
"step": 432,
"token_acc": 0.9095477386934674
},
{
"epoch": 0.09135021097046414,
"grad_norm": 0.91015625,
"learning_rate": 9.98312761529111e-07,
"loss": 0.27813225984573364,
"step": 433,
"token_acc": 0.9216018048505358
},
{
"epoch": 0.09156118143459915,
"grad_norm": 0.74609375,
"learning_rate": 9.982988541152036e-07,
"loss": 0.2637915015220642,
"step": 434,
"token_acc": 0.9244929797191888
},
{
"epoch": 0.09177215189873418,
"grad_norm": 1.0703125,
"learning_rate": 9.982848897166042e-07,
"loss": 0.2686794102191925,
"step": 435,
"token_acc": 0.9231661351116266
},
{
"epoch": 0.0919831223628692,
"grad_norm": 0.62890625,
"learning_rate": 9.982708683349105e-07,
"loss": 0.24819687008857727,
"step": 436,
"token_acc": 0.9376739009460211
},
{
"epoch": 0.09219409282700422,
"grad_norm": 0.75,
"learning_rate": 9.982567899717256e-07,
"loss": 0.25789859890937805,
"step": 437,
"token_acc": 0.9243027888446215
},
{
"epoch": 0.09240506329113925,
"grad_norm": 0.65625,
"learning_rate": 9.982426546286596e-07,
"loss": 0.2573246359825134,
"step": 438,
"token_acc": 0.9290590679726922
},
{
"epoch": 0.09261603375527426,
"grad_norm": 0.7578125,
"learning_rate": 9.98228462307329e-07,
"loss": 0.2979215383529663,
"step": 439,
"token_acc": 0.9225908372827805
},
{
"epoch": 0.09282700421940929,
"grad_norm": 0.703125,
"learning_rate": 9.982142130093566e-07,
"loss": 0.2403128445148468,
"step": 440,
"token_acc": 0.9290377519159807
},
{
"epoch": 0.0930379746835443,
"grad_norm": 0.7890625,
"learning_rate": 9.98199906736372e-07,
"loss": 0.2767883241176605,
"step": 441,
"token_acc": 0.9238754325259516
},
{
"epoch": 0.09324894514767933,
"grad_norm": 0.609375,
"learning_rate": 9.981855434900115e-07,
"loss": 0.25662270188331604,
"step": 442,
"token_acc": 0.9294367050272562
},
{
"epoch": 0.09345991561181434,
"grad_norm": 0.7109375,
"learning_rate": 9.981711232719175e-07,
"loss": 0.24665901064872742,
"step": 443,
"token_acc": 0.9237830319888735
},
{
"epoch": 0.09367088607594937,
"grad_norm": 0.7890625,
"learning_rate": 9.98156646083739e-07,
"loss": 0.23571115732192993,
"step": 444,
"token_acc": 0.9281145293938471
},
{
"epoch": 0.0938818565400844,
"grad_norm": 1.21875,
"learning_rate": 9.981421119271316e-07,
"loss": 0.2607622742652893,
"step": 445,
"token_acc": 0.9253941441441441
},
{
"epoch": 0.0940928270042194,
"grad_norm": 0.73046875,
"learning_rate": 9.981275208037575e-07,
"loss": 0.2898206114768982,
"step": 446,
"token_acc": 0.918967587034814
},
{
"epoch": 0.09430379746835443,
"grad_norm": 0.6640625,
"learning_rate": 9.981128727152854e-07,
"loss": 0.2372782677412033,
"step": 447,
"token_acc": 0.9295408605255558
},
{
"epoch": 0.09451476793248945,
"grad_norm": 0.65234375,
"learning_rate": 9.980981676633903e-07,
"loss": 0.22987963259220123,
"step": 448,
"token_acc": 0.9354383986467437
},
{
"epoch": 0.09472573839662447,
"grad_norm": 0.95703125,
"learning_rate": 9.980834056497538e-07,
"loss": 0.26702481508255005,
"step": 449,
"token_acc": 0.9252548131370328
},
{
"epoch": 0.0949367088607595,
"grad_norm": 0.76171875,
"learning_rate": 9.98068586676064e-07,
"loss": 0.27268826961517334,
"step": 450,
"token_acc": 0.9244391971664699
},
{
"epoch": 0.09514767932489451,
"grad_norm": 0.93359375,
"learning_rate": 9.98053710744016e-07,
"loss": 0.22254578769207,
"step": 451,
"token_acc": 0.9397944199706314
},
{
"epoch": 0.09535864978902954,
"grad_norm": 0.6484375,
"learning_rate": 9.980387778553103e-07,
"loss": 0.2529526948928833,
"step": 452,
"token_acc": 0.9301044083526682
},
{
"epoch": 0.09556962025316455,
"grad_norm": 0.93359375,
"learning_rate": 9.980237880116553e-07,
"loss": 0.2600526809692383,
"step": 453,
"token_acc": 0.9255893212155638
},
{
"epoch": 0.09578059071729958,
"grad_norm": 0.74609375,
"learning_rate": 9.980087412147648e-07,
"loss": 0.2552299499511719,
"step": 454,
"token_acc": 0.9276672694394213
},
{
"epoch": 0.09599156118143459,
"grad_norm": 0.984375,
"learning_rate": 9.979936374663595e-07,
"loss": 0.28409841656684875,
"step": 455,
"token_acc": 0.9230544177881802
},
{
"epoch": 0.09620253164556962,
"grad_norm": 0.83984375,
"learning_rate": 9.979784767681668e-07,
"loss": 0.256397545337677,
"step": 456,
"token_acc": 0.9331357048748353
},
{
"epoch": 0.09641350210970465,
"grad_norm": 0.69140625,
"learning_rate": 9.979632591219207e-07,
"loss": 0.2313995659351349,
"step": 457,
"token_acc": 0.9336188436830836
},
{
"epoch": 0.09662447257383966,
"grad_norm": 0.87890625,
"learning_rate": 9.97947984529361e-07,
"loss": 0.2967644929885864,
"step": 458,
"token_acc": 0.9195630585898709
},
{
"epoch": 0.09683544303797469,
"grad_norm": 0.76953125,
"learning_rate": 9.979326529922348e-07,
"loss": 0.30269140005111694,
"step": 459,
"token_acc": 0.9189985272459499
},
{
"epoch": 0.0970464135021097,
"grad_norm": 0.68359375,
"learning_rate": 9.97917264512295e-07,
"loss": 0.2591363787651062,
"step": 460,
"token_acc": 0.9290578887627696
},
{
"epoch": 0.09725738396624473,
"grad_norm": 0.84375,
"learning_rate": 9.979018190913018e-07,
"loss": 0.32560282945632935,
"step": 461,
"token_acc": 0.9178757980266976
},
{
"epoch": 0.09746835443037975,
"grad_norm": 0.83984375,
"learning_rate": 9.978863167310213e-07,
"loss": 0.2893942892551422,
"step": 462,
"token_acc": 0.924191063174114
},
{
"epoch": 0.09767932489451477,
"grad_norm": 0.625,
"learning_rate": 9.978707574332266e-07,
"loss": 0.2492993026971817,
"step": 463,
"token_acc": 0.9310970081595649
},
{
"epoch": 0.09789029535864979,
"grad_norm": 0.734375,
"learning_rate": 9.978551411996967e-07,
"loss": 0.27646076679229736,
"step": 464,
"token_acc": 0.9283480238839921
},
{
"epoch": 0.0981012658227848,
"grad_norm": 0.87890625,
"learning_rate": 9.978394680322176e-07,
"loss": 0.22209137678146362,
"step": 465,
"token_acc": 0.9360902255639098
},
{
"epoch": 0.09831223628691983,
"grad_norm": 0.84375,
"learning_rate": 9.978237379325818e-07,
"loss": 0.2588399648666382,
"step": 466,
"token_acc": 0.9257776408992916
},
{
"epoch": 0.09852320675105486,
"grad_norm": 0.88671875,
"learning_rate": 9.978079509025878e-07,
"loss": 0.3038383722305298,
"step": 467,
"token_acc": 0.9133605600933489
},
{
"epoch": 0.09873417721518987,
"grad_norm": 0.8125,
"learning_rate": 9.977921069440415e-07,
"loss": 0.24923110008239746,
"step": 468,
"token_acc": 0.9317915690866511
},
{
"epoch": 0.0989451476793249,
"grad_norm": 0.8515625,
"learning_rate": 9.97776206058754e-07,
"loss": 0.23833706974983215,
"step": 469,
"token_acc": 0.9329593267882188
},
{
"epoch": 0.09915611814345991,
"grad_norm": 0.66015625,
"learning_rate": 9.977602482485445e-07,
"loss": 0.25747478008270264,
"step": 470,
"token_acc": 0.9295946357817738
},
{
"epoch": 0.09936708860759494,
"grad_norm": 0.68359375,
"learning_rate": 9.977442335152377e-07,
"loss": 0.2688140571117401,
"step": 471,
"token_acc": 0.9248041775456919
},
{
"epoch": 0.09957805907172995,
"grad_norm": 0.65625,
"learning_rate": 9.977281618606649e-07,
"loss": 0.19290462136268616,
"step": 472,
"token_acc": 0.9412288512911843
},
{
"epoch": 0.09978902953586498,
"grad_norm": 0.66015625,
"learning_rate": 9.977120332866638e-07,
"loss": 0.24847334623336792,
"step": 473,
"token_acc": 0.9335578689528475
},
{
"epoch": 0.1,
"grad_norm": 0.73828125,
"learning_rate": 9.976958477950794e-07,
"loss": 0.24599069356918335,
"step": 474,
"token_acc": 0.9284507042253521
},
{
"epoch": 0.10021097046413502,
"grad_norm": 0.83203125,
"learning_rate": 9.976796053877622e-07,
"loss": 0.2468043714761734,
"step": 475,
"token_acc": 0.9286099137931034
},
{
"epoch": 0.10042194092827005,
"grad_norm": 0.73828125,
"learning_rate": 9.976633060665697e-07,
"loss": 0.2741178572177887,
"step": 476,
"token_acc": 0.9224250325945241
},
{
"epoch": 0.10063291139240506,
"grad_norm": 0.6484375,
"learning_rate": 9.97646949833366e-07,
"loss": 0.23784510791301727,
"step": 477,
"token_acc": 0.9328483491885842
},
{
"epoch": 0.10084388185654009,
"grad_norm": 0.66015625,
"learning_rate": 9.976305366900216e-07,
"loss": 0.23838309943675995,
"step": 478,
"token_acc": 0.9320939839917377
},
{
"epoch": 0.10105485232067511,
"grad_norm": 0.86328125,
"learning_rate": 9.976140666384134e-07,
"loss": 0.2787632346153259,
"step": 479,
"token_acc": 0.9210836277974087
},
{
"epoch": 0.10126582278481013,
"grad_norm": 0.7265625,
"learning_rate": 9.97597539680425e-07,
"loss": 0.2557927370071411,
"step": 480,
"token_acc": 0.9304747320061256
},
{
"epoch": 0.10147679324894515,
"grad_norm": 0.8359375,
"learning_rate": 9.975809558179463e-07,
"loss": 0.2617788314819336,
"step": 481,
"token_acc": 0.9297163995067818
},
{
"epoch": 0.10168776371308016,
"grad_norm": 0.77734375,
"learning_rate": 9.975643150528737e-07,
"loss": 0.24790287017822266,
"step": 482,
"token_acc": 0.9321890827236916
},
{
"epoch": 0.10189873417721519,
"grad_norm": 0.62109375,
"learning_rate": 9.975476173871102e-07,
"loss": 0.22530625760555267,
"step": 483,
"token_acc": 0.9375520399666945
},
{
"epoch": 0.1021097046413502,
"grad_norm": 0.68359375,
"learning_rate": 9.975308628225657e-07,
"loss": 0.23794007301330566,
"step": 484,
"token_acc": 0.9279202279202279
},
{
"epoch": 0.10232067510548523,
"grad_norm": 0.71875,
"learning_rate": 9.975140513611558e-07,
"loss": 0.2270554155111313,
"step": 485,
"token_acc": 0.9365750528541226
},
{
"epoch": 0.10253164556962026,
"grad_norm": 0.77734375,
"learning_rate": 9.974971830048033e-07,
"loss": 0.23995614051818848,
"step": 486,
"token_acc": 0.9316101238556812
},
{
"epoch": 0.10274261603375527,
"grad_norm": 0.58203125,
"learning_rate": 9.974802577554372e-07,
"loss": 0.2599806487560272,
"step": 487,
"token_acc": 0.9297071129707113
},
{
"epoch": 0.1029535864978903,
"grad_norm": 0.703125,
"learning_rate": 9.974632756149928e-07,
"loss": 0.2610231935977936,
"step": 488,
"token_acc": 0.9277639922801213
},
{
"epoch": 0.10316455696202531,
"grad_norm": 0.69921875,
"learning_rate": 9.974462365854124e-07,
"loss": 0.2433297038078308,
"step": 489,
"token_acc": 0.9279082468596396
},
{
"epoch": 0.10337552742616034,
"grad_norm": 0.765625,
"learning_rate": 9.974291406686446e-07,
"loss": 0.21656793355941772,
"step": 490,
"token_acc": 0.9369074861065708
},
{
"epoch": 0.10358649789029536,
"grad_norm": 0.77734375,
"learning_rate": 9.974119878666442e-07,
"loss": 0.2721899151802063,
"step": 491,
"token_acc": 0.9287462605384824
},
{
"epoch": 0.10379746835443038,
"grad_norm": 0.87890625,
"learning_rate": 9.973947781813731e-07,
"loss": 0.25939926505088806,
"step": 492,
"token_acc": 0.9284844796104686
},
{
"epoch": 0.1040084388185654,
"grad_norm": 0.73828125,
"learning_rate": 9.973775116147992e-07,
"loss": 0.2712750732898712,
"step": 493,
"token_acc": 0.9242995689655172
},
{
"epoch": 0.10421940928270042,
"grad_norm": 0.8203125,
"learning_rate": 9.97360188168897e-07,
"loss": 0.2513953447341919,
"step": 494,
"token_acc": 0.929803328290469
},
{
"epoch": 0.10443037974683544,
"grad_norm": 0.62890625,
"learning_rate": 9.973428078456475e-07,
"loss": 0.2344273030757904,
"step": 495,
"token_acc": 0.9309220278683664
},
{
"epoch": 0.10464135021097046,
"grad_norm": 0.6796875,
"learning_rate": 9.973253706470388e-07,
"loss": 0.24709591269493103,
"step": 496,
"token_acc": 0.9282845668387837
},
{
"epoch": 0.10485232067510548,
"grad_norm": 0.734375,
"learning_rate": 9.973078765750644e-07,
"loss": 0.26154980063438416,
"step": 497,
"token_acc": 0.9249655172413793
},
{
"epoch": 0.10506329113924051,
"grad_norm": 0.671875,
"learning_rate": 9.972903256317251e-07,
"loss": 0.2260134369134903,
"step": 498,
"token_acc": 0.9395458566794456
},
{
"epoch": 0.10527426160337552,
"grad_norm": 0.80859375,
"learning_rate": 9.972727178190281e-07,
"loss": 0.33081650733947754,
"step": 499,
"token_acc": 0.9097568121886903
},
{
"epoch": 0.10548523206751055,
"grad_norm": 0.7109375,
"learning_rate": 9.97255053138987e-07,
"loss": 0.23815643787384033,
"step": 500,
"token_acc": 0.929639889196676
},
{
"epoch": 0.10569620253164556,
"grad_norm": 0.73046875,
"learning_rate": 9.972373315936218e-07,
"loss": 0.2648988962173462,
"step": 501,
"token_acc": 0.9245283018867925
},
{
"epoch": 0.10590717299578059,
"grad_norm": 0.765625,
"learning_rate": 9.972195531849592e-07,
"loss": 0.2421625256538391,
"step": 502,
"token_acc": 0.9339531901250401
},
{
"epoch": 0.10611814345991562,
"grad_norm": 0.53125,
"learning_rate": 9.97201717915032e-07,
"loss": 0.2174941599369049,
"step": 503,
"token_acc": 0.9379822806516147
},
{
"epoch": 0.10632911392405063,
"grad_norm": 0.9375,
"learning_rate": 9.971838257858804e-07,
"loss": 0.24187928438186646,
"step": 504,
"token_acc": 0.924503742271396
},
{
"epoch": 0.10654008438818566,
"grad_norm": 0.6640625,
"learning_rate": 9.9716587679955e-07,
"loss": 0.2514258921146393,
"step": 505,
"token_acc": 0.9230990783410138
},
{
"epoch": 0.10675105485232067,
"grad_norm": 0.69921875,
"learning_rate": 9.971478709580937e-07,
"loss": 0.282311350107193,
"step": 506,
"token_acc": 0.9201725997842503
},
{
"epoch": 0.1069620253164557,
"grad_norm": 0.66015625,
"learning_rate": 9.971298082635705e-07,
"loss": 0.2298332005739212,
"step": 507,
"token_acc": 0.936689779921616
},
{
"epoch": 0.10717299578059072,
"grad_norm": 0.7109375,
"learning_rate": 9.971116887180461e-07,
"loss": 0.26396387815475464,
"step": 508,
"token_acc": 0.9267986176562991
},
{
"epoch": 0.10738396624472574,
"grad_norm": 1.3046875,
"learning_rate": 9.970935123235926e-07,
"loss": 0.2639835476875305,
"step": 509,
"token_acc": 0.9236704326260677
},
{
"epoch": 0.10759493670886076,
"grad_norm": 0.75390625,
"learning_rate": 9.970752790822886e-07,
"loss": 0.27394697070121765,
"step": 510,
"token_acc": 0.9261565836298933
},
{
"epoch": 0.10780590717299578,
"grad_norm": 0.73828125,
"learning_rate": 9.97056988996219e-07,
"loss": 0.2229761779308319,
"step": 511,
"token_acc": 0.9351882160392798
},
{
"epoch": 0.1080168776371308,
"grad_norm": 0.73828125,
"learning_rate": 9.970386420674758e-07,
"loss": 0.26045358180999756,
"step": 512,
"token_acc": 0.9279547484370348
},
{
"epoch": 0.10822784810126582,
"grad_norm": 0.66015625,
"learning_rate": 9.97020238298157e-07,
"loss": 0.23026743531227112,
"step": 513,
"token_acc": 0.9363579080025205
},
{
"epoch": 0.10843881856540084,
"grad_norm": 0.76171875,
"learning_rate": 9.970017776903671e-07,
"loss": 0.2587951421737671,
"step": 514,
"token_acc": 0.9307073030477285
},
{
"epoch": 0.10864978902953587,
"grad_norm": 0.87109375,
"learning_rate": 9.969832602462174e-07,
"loss": 0.22050908207893372,
"step": 515,
"token_acc": 0.9343434343434344
},
{
"epoch": 0.10886075949367088,
"grad_norm": 0.7421875,
"learning_rate": 9.969646859678256e-07,
"loss": 0.25485992431640625,
"step": 516,
"token_acc": 0.9255828808687321
},
{
"epoch": 0.10907172995780591,
"grad_norm": 0.671875,
"learning_rate": 9.969460548573156e-07,
"loss": 0.24492983520030975,
"step": 517,
"token_acc": 0.9348314606741573
},
{
"epoch": 0.10928270042194092,
"grad_norm": 0.7265625,
"learning_rate": 9.96927366916818e-07,
"loss": 0.28373780846595764,
"step": 518,
"token_acc": 0.9258015267175572
},
{
"epoch": 0.10949367088607595,
"grad_norm": 0.75,
"learning_rate": 9.969086221484701e-07,
"loss": 0.2899026870727539,
"step": 519,
"token_acc": 0.9206824304100568
},
{
"epoch": 0.10970464135021098,
"grad_norm": 0.80078125,
"learning_rate": 9.968898205544153e-07,
"loss": 0.2812725305557251,
"step": 520,
"token_acc": 0.9226502311248074
},
{
"epoch": 0.10991561181434599,
"grad_norm": 0.6171875,
"learning_rate": 9.968709621368041e-07,
"loss": 0.24981510639190674,
"step": 521,
"token_acc": 0.9326704545454545
},
{
"epoch": 0.11012658227848102,
"grad_norm": 0.79296875,
"learning_rate": 9.96852046897793e-07,
"loss": 0.3039143681526184,
"step": 522,
"token_acc": 0.9203347799132052
},
{
"epoch": 0.11033755274261603,
"grad_norm": 0.92578125,
"learning_rate": 9.968330748395448e-07,
"loss": 0.2633418142795563,
"step": 523,
"token_acc": 0.9283835135925168
},
{
"epoch": 0.11054852320675106,
"grad_norm": 1.4609375,
"learning_rate": 9.968140459642294e-07,
"loss": 0.24586576223373413,
"step": 524,
"token_acc": 0.9281601316150261
},
{
"epoch": 0.11075949367088607,
"grad_norm": 0.8515625,
"learning_rate": 9.967949602740228e-07,
"loss": 0.2739730477333069,
"step": 525,
"token_acc": 0.9166417687481326
},
{
"epoch": 0.1109704641350211,
"grad_norm": 0.796875,
"learning_rate": 9.967758177711076e-07,
"loss": 0.2627703845500946,
"step": 526,
"token_acc": 0.9227409227409228
},
{
"epoch": 0.11118143459915612,
"grad_norm": 0.80859375,
"learning_rate": 9.967566184576732e-07,
"loss": 0.26023009419441223,
"step": 527,
"token_acc": 0.927381745502998
},
{
"epoch": 0.11139240506329114,
"grad_norm": 0.78125,
"learning_rate": 9.967373623359148e-07,
"loss": 0.24462240934371948,
"step": 528,
"token_acc": 0.9283416203568294
},
{
"epoch": 0.11160337552742616,
"grad_norm": 0.72265625,
"learning_rate": 9.967180494080347e-07,
"loss": 0.24981698393821716,
"step": 529,
"token_acc": 0.9291265153870065
},
{
"epoch": 0.11181434599156118,
"grad_norm": 0.78125,
"learning_rate": 9.966986796762414e-07,
"loss": 0.2446298450231552,
"step": 530,
"token_acc": 0.9370728929384966
},
{
"epoch": 0.1120253164556962,
"grad_norm": 2.421875,
"learning_rate": 9.9667925314275e-07,
"loss": 0.24656617641448975,
"step": 531,
"token_acc": 0.9356594110115237
},
{
"epoch": 0.11223628691983123,
"grad_norm": 0.8203125,
"learning_rate": 9.966597698097823e-07,
"loss": 0.2559359073638916,
"step": 532,
"token_acc": 0.9327158812312721
},
{
"epoch": 0.11244725738396624,
"grad_norm": 0.6640625,
"learning_rate": 9.966402296795661e-07,
"loss": 0.2284064143896103,
"step": 533,
"token_acc": 0.9354838709677419
},
{
"epoch": 0.11265822784810127,
"grad_norm": 0.77734375,
"learning_rate": 9.966206327543362e-07,
"loss": 0.2628895938396454,
"step": 534,
"token_acc": 0.923578751164958
},
{
"epoch": 0.11286919831223628,
"grad_norm": 0.6796875,
"learning_rate": 9.966009790363337e-07,
"loss": 0.2363075464963913,
"step": 535,
"token_acc": 0.9275167785234899
},
{
"epoch": 0.11308016877637131,
"grad_norm": 0.90625,
"learning_rate": 9.965812685278059e-07,
"loss": 0.2766547203063965,
"step": 536,
"token_acc": 0.9212160836874795
},
{
"epoch": 0.11329113924050632,
"grad_norm": 0.6875,
"learning_rate": 9.96561501231007e-07,
"loss": 0.24981704354286194,
"step": 537,
"token_acc": 0.9287794545935928
},
{
"epoch": 0.11350210970464135,
"grad_norm": 0.72265625,
"learning_rate": 9.965416771481975e-07,
"loss": 0.2477213591337204,
"step": 538,
"token_acc": 0.9247878255779924
},
{
"epoch": 0.11371308016877638,
"grad_norm": 0.97265625,
"learning_rate": 9.965217962816446e-07,
"loss": 0.2585391104221344,
"step": 539,
"token_acc": 0.9276463963963963
},
{
"epoch": 0.11392405063291139,
"grad_norm": 0.703125,
"learning_rate": 9.965018586336218e-07,
"loss": 0.24559935927391052,
"step": 540,
"token_acc": 0.9349939246658566
},
{
"epoch": 0.11413502109704642,
"grad_norm": 0.72265625,
"learning_rate": 9.96481864206409e-07,
"loss": 0.22781570255756378,
"step": 541,
"token_acc": 0.9306022623051055
},
{
"epoch": 0.11434599156118143,
"grad_norm": 1.0390625,
"learning_rate": 9.964618130022931e-07,
"loss": 0.2166275829076767,
"step": 542,
"token_acc": 0.9374454466104161
},
{
"epoch": 0.11455696202531646,
"grad_norm": 0.703125,
"learning_rate": 9.964417050235665e-07,
"loss": 0.267704039812088,
"step": 543,
"token_acc": 0.9261410788381743
},
{
"epoch": 0.11476793248945148,
"grad_norm": 0.8203125,
"learning_rate": 9.964215402725294e-07,
"loss": 0.23303918540477753,
"step": 544,
"token_acc": 0.9341683658607631
},
{
"epoch": 0.1149789029535865,
"grad_norm": 0.921875,
"learning_rate": 9.964013187514872e-07,
"loss": 0.33097875118255615,
"step": 545,
"token_acc": 0.9122987324426174
},
{
"epoch": 0.11518987341772152,
"grad_norm": 1.9453125,
"learning_rate": 9.963810404627529e-07,
"loss": 0.2524172067642212,
"step": 546,
"token_acc": 0.9373441396508728
},
{
"epoch": 0.11540084388185654,
"grad_norm": 0.60546875,
"learning_rate": 9.963607054086453e-07,
"loss": 0.25729498267173767,
"step": 547,
"token_acc": 0.9215870040612308
},
{
"epoch": 0.11561181434599156,
"grad_norm": 0.75,
"learning_rate": 9.963403135914898e-07,
"loss": 0.2928774356842041,
"step": 548,
"token_acc": 0.9192731605600238
},
{
"epoch": 0.11582278481012659,
"grad_norm": 0.6953125,
"learning_rate": 9.963198650136184e-07,
"loss": 0.25337544083595276,
"step": 549,
"token_acc": 0.9240650870682272
},
{
"epoch": 0.1160337552742616,
"grad_norm": 0.7109375,
"learning_rate": 9.962993596773697e-07,
"loss": 0.27310362458229065,
"step": 550,
"token_acc": 0.9247853124074622
},
{
"epoch": 0.11624472573839663,
"grad_norm": 0.640625,
"learning_rate": 9.962787975850886e-07,
"loss": 0.22571566700935364,
"step": 551,
"token_acc": 0.9384902143522833
},
{
"epoch": 0.11645569620253164,
"grad_norm": 0.8671875,
"learning_rate": 9.962581787391265e-07,
"loss": 0.25049251317977905,
"step": 552,
"token_acc": 0.9287239722370528
},
{
"epoch": 0.11666666666666667,
"grad_norm": 1.2734375,
"learning_rate": 9.962375031418413e-07,
"loss": 0.24676430225372314,
"step": 553,
"token_acc": 0.9325946445060018
},
{
"epoch": 0.11687763713080168,
"grad_norm": 0.75,
"learning_rate": 9.962167707955977e-07,
"loss": 0.22018642723560333,
"step": 554,
"token_acc": 0.9440233236151604
},
{
"epoch": 0.11708860759493671,
"grad_norm": 0.7421875,
"learning_rate": 9.96195981702766e-07,
"loss": 0.2333768904209137,
"step": 555,
"token_acc": 0.9352249928346231
},
{
"epoch": 0.11729957805907174,
"grad_norm": 0.703125,
"learning_rate": 9.961751358657244e-07,
"loss": 0.2830660939216614,
"step": 556,
"token_acc": 0.9188869153345175
},
{
"epoch": 0.11751054852320675,
"grad_norm": 0.81640625,
"learning_rate": 9.961542332868564e-07,
"loss": 0.26290833950042725,
"step": 557,
"token_acc": 0.9261704681872749
},
{
"epoch": 0.11772151898734177,
"grad_norm": 0.7421875,
"learning_rate": 9.961332739685523e-07,
"loss": 0.2768633961677551,
"step": 558,
"token_acc": 0.9245508982035928
},
{
"epoch": 0.11793248945147679,
"grad_norm": 0.80078125,
"learning_rate": 9.96112257913209e-07,
"loss": 0.2084287852048874,
"step": 559,
"token_acc": 0.9381261048909841
},
{
"epoch": 0.11814345991561181,
"grad_norm": 0.796875,
"learning_rate": 9.960911851232301e-07,
"loss": 0.2791953682899475,
"step": 560,
"token_acc": 0.924936025021325
},
{
"epoch": 0.11835443037974684,
"grad_norm": 1.34375,
"learning_rate": 9.960700556010253e-07,
"loss": 0.319602370262146,
"step": 561,
"token_acc": 0.9191286183228887
},
{
"epoch": 0.11856540084388185,
"grad_norm": 0.8515625,
"learning_rate": 9.960488693490108e-07,
"loss": 0.21053284406661987,
"step": 562,
"token_acc": 0.9407194244604317
},
{
"epoch": 0.11877637130801688,
"grad_norm": 0.7421875,
"learning_rate": 9.960276263696097e-07,
"loss": 0.27438345551490784,
"step": 563,
"token_acc": 0.9290875033449291
},
{
"epoch": 0.1189873417721519,
"grad_norm": 0.6328125,
"learning_rate": 9.960063266652512e-07,
"loss": 0.2969055771827698,
"step": 564,
"token_acc": 0.918200408997955
},
{
"epoch": 0.11919831223628692,
"grad_norm": 1.0234375,
"learning_rate": 9.95984970238371e-07,
"loss": 0.3027820587158203,
"step": 565,
"token_acc": 0.924516531503431
},
{
"epoch": 0.11940928270042193,
"grad_norm": 0.69921875,
"learning_rate": 9.959635570914115e-07,
"loss": 0.26206478476524353,
"step": 566,
"token_acc": 0.9267332727823191
},
{
"epoch": 0.11962025316455696,
"grad_norm": 0.8359375,
"learning_rate": 9.959420872268214e-07,
"loss": 0.22268003225326538,
"step": 567,
"token_acc": 0.9377475947934352
},
{
"epoch": 0.11983122362869199,
"grad_norm": 0.72265625,
"learning_rate": 9.95920560647056e-07,
"loss": 0.2821509838104248,
"step": 568,
"token_acc": 0.9257203277821835
},
{
"epoch": 0.120042194092827,
"grad_norm": 0.8125,
"learning_rate": 9.958989773545772e-07,
"loss": 0.2774399518966675,
"step": 569,
"token_acc": 0.9208851167020309
},
{
"epoch": 0.12025316455696203,
"grad_norm": 0.73046875,
"learning_rate": 9.95877337351853e-07,
"loss": 0.20950725674629211,
"step": 570,
"token_acc": 0.9383966244725739
},
{
"epoch": 0.12046413502109704,
"grad_norm": 2.375,
"learning_rate": 9.95855640641358e-07,
"loss": 0.24889153242111206,
"step": 571,
"token_acc": 0.9338555265448216
},
{
"epoch": 0.12067510548523207,
"grad_norm": 0.7265625,
"learning_rate": 9.958338872255738e-07,
"loss": 0.27347537875175476,
"step": 572,
"token_acc": 0.9261669024045261
},
{
"epoch": 0.1208860759493671,
"grad_norm": 0.625,
"learning_rate": 9.958120771069878e-07,
"loss": 0.2640995383262634,
"step": 573,
"token_acc": 0.9278178789561354
},
{
"epoch": 0.12109704641350211,
"grad_norm": 0.609375,
"learning_rate": 9.957902102880945e-07,
"loss": 0.23652713000774384,
"step": 574,
"token_acc": 0.9364988558352403
},
{
"epoch": 0.12130801687763713,
"grad_norm": 0.8359375,
"learning_rate": 9.957682867713942e-07,
"loss": 0.291990727186203,
"step": 575,
"token_acc": 0.9223826714801444
},
{
"epoch": 0.12151898734177215,
"grad_norm": 0.6328125,
"learning_rate": 9.95746306559394e-07,
"loss": 0.21727707982063293,
"step": 576,
"token_acc": 0.9308996088657105
},
{
"epoch": 0.12172995780590717,
"grad_norm": 0.66796875,
"learning_rate": 9.957242696546077e-07,
"loss": 0.2906607985496521,
"step": 577,
"token_acc": 0.9158725837190308
},
{
"epoch": 0.1219409282700422,
"grad_norm": 0.7265625,
"learning_rate": 9.957021760595556e-07,
"loss": 0.226593479514122,
"step": 578,
"token_acc": 0.9271303824149353
},
{
"epoch": 0.12215189873417721,
"grad_norm": 0.7265625,
"learning_rate": 9.956800257767639e-07,
"loss": 0.26656001806259155,
"step": 579,
"token_acc": 0.930952380952381
},
{
"epoch": 0.12236286919831224,
"grad_norm": 0.6875,
"learning_rate": 9.956578188087658e-07,
"loss": 0.2880259156227112,
"step": 580,
"token_acc": 0.9256572982774252
},
{
"epoch": 0.12257383966244725,
"grad_norm": 0.83984375,
"learning_rate": 9.95635555158101e-07,
"loss": 0.24349641799926758,
"step": 581,
"token_acc": 0.9316569954867827
},
{
"epoch": 0.12278481012658228,
"grad_norm": 0.7109375,
"learning_rate": 9.956132348273157e-07,
"loss": 0.24496349692344666,
"step": 582,
"token_acc": 0.9309711286089238
},
{
"epoch": 0.1229957805907173,
"grad_norm": 0.75390625,
"learning_rate": 9.955908578189619e-07,
"loss": 0.2652456760406494,
"step": 583,
"token_acc": 0.9269628727936701
},
{
"epoch": 0.12320675105485232,
"grad_norm": 0.87109375,
"learning_rate": 9.955684241355988e-07,
"loss": 0.2777090072631836,
"step": 584,
"token_acc": 0.9167887489012599
},
{
"epoch": 0.12341772151898735,
"grad_norm": 1.2109375,
"learning_rate": 9.95545933779792e-07,
"loss": 0.3164791464805603,
"step": 585,
"token_acc": 0.919442072302875
},
{
"epoch": 0.12362869198312236,
"grad_norm": 0.67578125,
"learning_rate": 9.955233867541134e-07,
"loss": 0.26809951663017273,
"step": 586,
"token_acc": 0.9227027027027027
},
{
"epoch": 0.12383966244725739,
"grad_norm": 0.70703125,
"learning_rate": 9.955007830611414e-07,
"loss": 0.25988298654556274,
"step": 587,
"token_acc": 0.9277679100059206
},
{
"epoch": 0.1240506329113924,
"grad_norm": 1.453125,
"learning_rate": 9.954781227034612e-07,
"loss": 0.22518092393875122,
"step": 588,
"token_acc": 0.935474701534963
},
{
"epoch": 0.12426160337552743,
"grad_norm": 0.62890625,
"learning_rate": 9.954554056836637e-07,
"loss": 0.23757173120975494,
"step": 589,
"token_acc": 0.929593589009731
},
{
"epoch": 0.12447257383966245,
"grad_norm": 0.7734375,
"learning_rate": 9.954326320043472e-07,
"loss": 0.2483949363231659,
"step": 590,
"token_acc": 0.9315320847405588
},
{
"epoch": 0.12468354430379747,
"grad_norm": 0.82421875,
"learning_rate": 9.95409801668116e-07,
"loss": 0.26530349254608154,
"step": 591,
"token_acc": 0.9214407260351674
},
{
"epoch": 0.1248945147679325,
"grad_norm": 0.703125,
"learning_rate": 9.953869146775806e-07,
"loss": 0.2826001048088074,
"step": 592,
"token_acc": 0.9162200282087447
},
{
"epoch": 0.12510548523206752,
"grad_norm": 0.59375,
"learning_rate": 9.953639710353589e-07,
"loss": 0.1961941421031952,
"step": 593,
"token_acc": 0.9426091825307951
},
{
"epoch": 0.12531645569620253,
"grad_norm": 0.82421875,
"learning_rate": 9.953409707440742e-07,
"loss": 0.26104363799095154,
"step": 594,
"token_acc": 0.9215813350615684
},
{
"epoch": 0.12552742616033755,
"grad_norm": 0.828125,
"learning_rate": 9.95317913806357e-07,
"loss": 0.30334994196891785,
"step": 595,
"token_acc": 0.9229497354497355
},
{
"epoch": 0.1257383966244726,
"grad_norm": 0.8125,
"learning_rate": 9.95294800224844e-07,
"loss": 0.27462461590766907,
"step": 596,
"token_acc": 0.9222963177732676
},
{
"epoch": 0.1259493670886076,
"grad_norm": 0.68359375,
"learning_rate": 9.952716300021784e-07,
"loss": 0.25919121503829956,
"step": 597,
"token_acc": 0.9274289099526066
},
{
"epoch": 0.1261603375527426,
"grad_norm": 0.6328125,
"learning_rate": 9.952484031410102e-07,
"loss": 0.24202126264572144,
"step": 598,
"token_acc": 0.9379543094496365
},
{
"epoch": 0.12637130801687763,
"grad_norm": 0.703125,
"learning_rate": 9.95225119643995e-07,
"loss": 0.22879423201084137,
"step": 599,
"token_acc": 0.9303405572755418
},
{
"epoch": 0.12658227848101267,
"grad_norm": 0.66015625,
"learning_rate": 9.952017795137962e-07,
"loss": 0.2557697892189026,
"step": 600,
"token_acc": 0.9307598039215687
},
{
"epoch": 0.12658227848101267,
"eval_loss": 0.4336538016796112,
"eval_runtime": 246.0329,
"eval_samples_per_second": 136.994,
"eval_steps_per_second": 2.142,
"eval_token_acc": 0.899098088486045,
"step": 600
},
{
"epoch": 0.12679324894514768,
"grad_norm": 0.62109375,
"learning_rate": 9.951783827530821e-07,
"loss": 0.24460609257221222,
"step": 601,
"token_acc": 0.935226264418811
},
{
"epoch": 0.1270042194092827,
"grad_norm": 0.65625,
"learning_rate": 9.951549293645292e-07,
"loss": 0.24832656979560852,
"step": 602,
"token_acc": 0.9309408926417371
},
{
"epoch": 0.12721518987341773,
"grad_norm": 0.69140625,
"learning_rate": 9.95131419350819e-07,
"loss": 0.23030316829681396,
"step": 603,
"token_acc": 0.9332206255283179
},
{
"epoch": 0.12742616033755275,
"grad_norm": 0.73046875,
"learning_rate": 9.951078527146403e-07,
"loss": 0.2880566418170929,
"step": 604,
"token_acc": 0.9192907367777438
},
{
"epoch": 0.12763713080168776,
"grad_norm": 0.6796875,
"learning_rate": 9.95084229458688e-07,
"loss": 0.24888081848621368,
"step": 605,
"token_acc": 0.929957805907173
},
{
"epoch": 0.12784810126582277,
"grad_norm": 0.609375,
"learning_rate": 9.950605495856637e-07,
"loss": 0.2833850681781769,
"step": 606,
"token_acc": 0.9213352685050799
},
{
"epoch": 0.1280590717299578,
"grad_norm": 0.875,
"learning_rate": 9.950368130982755e-07,
"loss": 0.26693737506866455,
"step": 607,
"token_acc": 0.9250070482097548
},
{
"epoch": 0.12827004219409283,
"grad_norm": 0.62890625,
"learning_rate": 9.950130199992377e-07,
"loss": 0.23543164134025574,
"step": 608,
"token_acc": 0.9348515422311905
},
{
"epoch": 0.12848101265822784,
"grad_norm": 0.7265625,
"learning_rate": 9.949891702912712e-07,
"loss": 0.22989103198051453,
"step": 609,
"token_acc": 0.9318626082099972
},
{
"epoch": 0.12869198312236288,
"grad_norm": 0.74609375,
"learning_rate": 9.949652639771036e-07,
"loss": 0.24984115362167358,
"step": 610,
"token_acc": 0.9261559696342305
},
{
"epoch": 0.1289029535864979,
"grad_norm": 0.70703125,
"learning_rate": 9.94941301059469e-07,
"loss": 0.2549693286418915,
"step": 611,
"token_acc": 0.9259363559560687
},
{
"epoch": 0.1291139240506329,
"grad_norm": 0.66015625,
"learning_rate": 9.94917281541107e-07,
"loss": 0.2592216432094574,
"step": 612,
"token_acc": 0.9255747126436782
},
{
"epoch": 0.12932489451476795,
"grad_norm": 0.8046875,
"learning_rate": 9.948932054247652e-07,
"loss": 0.2784273624420166,
"step": 613,
"token_acc": 0.9198324022346369
},
{
"epoch": 0.12953586497890296,
"grad_norm": 1.359375,
"learning_rate": 9.948690727131965e-07,
"loss": 0.2754824161529541,
"step": 614,
"token_acc": 0.9211413748378728
},
{
"epoch": 0.12974683544303797,
"grad_norm": 0.62890625,
"learning_rate": 9.948448834091608e-07,
"loss": 0.22421778738498688,
"step": 615,
"token_acc": 0.9337539432176656
},
{
"epoch": 0.12995780590717299,
"grad_norm": 0.6640625,
"learning_rate": 9.948206375154244e-07,
"loss": 0.22916918992996216,
"step": 616,
"token_acc": 0.933944374209861
},
{
"epoch": 0.13016877637130803,
"grad_norm": 0.640625,
"learning_rate": 9.947963350347598e-07,
"loss": 0.23158694803714752,
"step": 617,
"token_acc": 0.9291338582677166
},
{
"epoch": 0.13037974683544304,
"grad_norm": 0.85546875,
"learning_rate": 9.947719759699466e-07,
"loss": 0.2788570523262024,
"step": 618,
"token_acc": 0.9231003039513678
},
{
"epoch": 0.13059071729957805,
"grad_norm": 0.69140625,
"learning_rate": 9.947475603237702e-07,
"loss": 0.28004133701324463,
"step": 619,
"token_acc": 0.9180544541369283
},
{
"epoch": 0.1308016877637131,
"grad_norm": 0.71484375,
"learning_rate": 9.947230880990227e-07,
"loss": 0.2773160934448242,
"step": 620,
"token_acc": 0.9248013620885358
},
{
"epoch": 0.1310126582278481,
"grad_norm": 0.71484375,
"learning_rate": 9.946985592985028e-07,
"loss": 0.2508021593093872,
"step": 621,
"token_acc": 0.9334818586887333
},
{
"epoch": 0.13122362869198312,
"grad_norm": 0.8046875,
"learning_rate": 9.946739739250156e-07,
"loss": 0.23769596219062805,
"step": 622,
"token_acc": 0.9302249755461363
},
{
"epoch": 0.13143459915611813,
"grad_norm": 1.2109375,
"learning_rate": 9.946493319813725e-07,
"loss": 0.21937592327594757,
"step": 623,
"token_acc": 0.9373860182370821
},
{
"epoch": 0.13164556962025317,
"grad_norm": 0.62109375,
"learning_rate": 9.946246334703916e-07,
"loss": 0.27754154801368713,
"step": 624,
"token_acc": 0.9261245159368484
},
{
"epoch": 0.13185654008438819,
"grad_norm": 0.78515625,
"learning_rate": 9.945998783948975e-07,
"loss": 0.2924942672252655,
"step": 625,
"token_acc": 0.9139688249400479
},
{
"epoch": 0.1320675105485232,
"grad_norm": 0.64453125,
"learning_rate": 9.945750667577209e-07,
"loss": 0.2303755283355713,
"step": 626,
"token_acc": 0.9341597796143251
},
{
"epoch": 0.13227848101265824,
"grad_norm": 0.7734375,
"learning_rate": 9.945501985616995e-07,
"loss": 0.20424559712409973,
"step": 627,
"token_acc": 0.9430594900849858
},
{
"epoch": 0.13248945147679325,
"grad_norm": 0.64453125,
"learning_rate": 9.94525273809677e-07,
"loss": 0.2620590031147003,
"step": 628,
"token_acc": 0.9274099883855982
},
{
"epoch": 0.13270042194092826,
"grad_norm": 0.76171875,
"learning_rate": 9.945002925045038e-07,
"loss": 0.2684752643108368,
"step": 629,
"token_acc": 0.9270194986072423
},
{
"epoch": 0.13291139240506328,
"grad_norm": 0.70703125,
"learning_rate": 9.944752546490367e-07,
"loss": 0.23374760150909424,
"step": 630,
"token_acc": 0.9325842696629213
},
{
"epoch": 0.13312236286919832,
"grad_norm": 1.1953125,
"learning_rate": 9.94450160246139e-07,
"loss": 0.22228139638900757,
"step": 631,
"token_acc": 0.9354534005037783
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.73046875,
"learning_rate": 9.944250092986807e-07,
"loss": 0.2018851488828659,
"step": 632,
"token_acc": 0.9405840886203424
},
{
"epoch": 0.13354430379746834,
"grad_norm": 1.1796875,
"learning_rate": 9.943998018095377e-07,
"loss": 0.26342812180519104,
"step": 633,
"token_acc": 0.9279495646952867
},
{
"epoch": 0.13375527426160339,
"grad_norm": 0.734375,
"learning_rate": 9.943745377815927e-07,
"loss": 0.24731256067752838,
"step": 634,
"token_acc": 0.9329608938547486
},
{
"epoch": 0.1339662447257384,
"grad_norm": 0.72265625,
"learning_rate": 9.94349217217735e-07,
"loss": 0.22763285040855408,
"step": 635,
"token_acc": 0.9298531810766721
},
{
"epoch": 0.1341772151898734,
"grad_norm": 0.72265625,
"learning_rate": 9.943238401208602e-07,
"loss": 0.25396978855133057,
"step": 636,
"token_acc": 0.9292196007259528
},
{
"epoch": 0.13438818565400845,
"grad_norm": 0.7421875,
"learning_rate": 9.942984064938705e-07,
"loss": 0.30096304416656494,
"step": 637,
"token_acc": 0.9151027703306523
},
{
"epoch": 0.13459915611814346,
"grad_norm": 0.6484375,
"learning_rate": 9.942729163396741e-07,
"loss": 0.29584643244743347,
"step": 638,
"token_acc": 0.919442072302875
},
{
"epoch": 0.13481012658227848,
"grad_norm": 0.65625,
"learning_rate": 9.942473696611862e-07,
"loss": 0.2260875701904297,
"step": 639,
"token_acc": 0.934560327198364
},
{
"epoch": 0.1350210970464135,
"grad_norm": 0.703125,
"learning_rate": 9.942217664613284e-07,
"loss": 0.25592464208602905,
"step": 640,
"token_acc": 0.926865671641791
},
{
"epoch": 0.13523206751054853,
"grad_norm": 0.6171875,
"learning_rate": 9.941961067430285e-07,
"loss": 0.24965469539165497,
"step": 641,
"token_acc": 0.92850705917693
},
{
"epoch": 0.13544303797468354,
"grad_norm": 0.7109375,
"learning_rate": 9.94170390509221e-07,
"loss": 0.2692350149154663,
"step": 642,
"token_acc": 0.9312614259597807
},
{
"epoch": 0.13565400843881856,
"grad_norm": 0.78125,
"learning_rate": 9.941446177628467e-07,
"loss": 0.2497376799583435,
"step": 643,
"token_acc": 0.925979262672811
},
{
"epoch": 0.1358649789029536,
"grad_norm": 1.046875,
"learning_rate": 9.94118788506853e-07,
"loss": 0.27541112899780273,
"step": 644,
"token_acc": 0.9211567732115677
},
{
"epoch": 0.1360759493670886,
"grad_norm": 0.6953125,
"learning_rate": 9.940929027441936e-07,
"loss": 0.2414344996213913,
"step": 645,
"token_acc": 0.9313432835820895
},
{
"epoch": 0.13628691983122362,
"grad_norm": 0.62109375,
"learning_rate": 9.940669604778288e-07,
"loss": 0.2977514863014221,
"step": 646,
"token_acc": 0.922911547911548
},
{
"epoch": 0.13649789029535864,
"grad_norm": 0.71484375,
"learning_rate": 9.940409617107252e-07,
"loss": 0.2718917727470398,
"step": 647,
"token_acc": 0.9186241610738255
},
{
"epoch": 0.13670886075949368,
"grad_norm": 0.765625,
"learning_rate": 9.940149064458563e-07,
"loss": 0.2234637290239334,
"step": 648,
"token_acc": 0.9408866995073891
},
{
"epoch": 0.1369198312236287,
"grad_norm": 0.640625,
"learning_rate": 9.939887946862017e-07,
"loss": 0.2774735689163208,
"step": 649,
"token_acc": 0.9290869155946031
},
{
"epoch": 0.1371308016877637,
"grad_norm": 0.90625,
"learning_rate": 9.93962626434747e-07,
"loss": 0.2615464925765991,
"step": 650,
"token_acc": 0.9272910881090634
},
{
"epoch": 0.13734177215189874,
"grad_norm": 0.65625,
"learning_rate": 9.939364016944852e-07,
"loss": 0.23573726415634155,
"step": 651,
"token_acc": 0.9329399141630901
},
{
"epoch": 0.13755274261603376,
"grad_norm": 0.6328125,
"learning_rate": 9.939101204684151e-07,
"loss": 0.2487039864063263,
"step": 652,
"token_acc": 0.932972972972973
},
{
"epoch": 0.13776371308016877,
"grad_norm": 0.6875,
"learning_rate": 9.938837827595424e-07,
"loss": 0.26214438676834106,
"step": 653,
"token_acc": 0.9278832116788321
},
{
"epoch": 0.1379746835443038,
"grad_norm": 0.69921875,
"learning_rate": 9.938573885708792e-07,
"loss": 0.24997225403785706,
"step": 654,
"token_acc": 0.9322553666016169
},
{
"epoch": 0.13818565400843882,
"grad_norm": 0.76953125,
"learning_rate": 9.938309379054433e-07,
"loss": 0.28863316774368286,
"step": 655,
"token_acc": 0.9217944831767202
},
{
"epoch": 0.13839662447257384,
"grad_norm": 0.625,
"learning_rate": 9.9380443076626e-07,
"loss": 0.23938237130641937,
"step": 656,
"token_acc": 0.9307647740440325
},
{
"epoch": 0.13860759493670885,
"grad_norm": 0.703125,
"learning_rate": 9.937778671563606e-07,
"loss": 0.26946017146110535,
"step": 657,
"token_acc": 0.9250278706800446
},
{
"epoch": 0.1388185654008439,
"grad_norm": 0.64453125,
"learning_rate": 9.937512470787827e-07,
"loss": 0.25879329442977905,
"step": 658,
"token_acc": 0.9263622974963182
},
{
"epoch": 0.1390295358649789,
"grad_norm": 0.75390625,
"learning_rate": 9.937245705365707e-07,
"loss": 0.26367712020874023,
"step": 659,
"token_acc": 0.9273255813953488
},
{
"epoch": 0.13924050632911392,
"grad_norm": 0.73828125,
"learning_rate": 9.93697837532775e-07,
"loss": 0.27003130316734314,
"step": 660,
"token_acc": 0.9255381035485748
},
{
"epoch": 0.13945147679324896,
"grad_norm": 0.77734375,
"learning_rate": 9.936710480704531e-07,
"loss": 0.3241864740848541,
"step": 661,
"token_acc": 0.9117466174661747
},
{
"epoch": 0.13966244725738397,
"grad_norm": 0.765625,
"learning_rate": 9.936442021526685e-07,
"loss": 0.254525363445282,
"step": 662,
"token_acc": 0.9274787535410765
},
{
"epoch": 0.13987341772151898,
"grad_norm": 0.85546875,
"learning_rate": 9.936172997824912e-07,
"loss": 0.22039127349853516,
"step": 663,
"token_acc": 0.9337885985748219
},
{
"epoch": 0.140084388185654,
"grad_norm": 0.71484375,
"learning_rate": 9.935903409629977e-07,
"loss": 0.26330018043518066,
"step": 664,
"token_acc": 0.9245566576495341
},
{
"epoch": 0.14029535864978904,
"grad_norm": 0.79296875,
"learning_rate": 9.93563325697271e-07,
"loss": 0.25053250789642334,
"step": 665,
"token_acc": 0.9321644150617994
},
{
"epoch": 0.14050632911392405,
"grad_norm": 1.015625,
"learning_rate": 9.935362539884004e-07,
"loss": 0.23359492421150208,
"step": 666,
"token_acc": 0.9295649600473513
},
{
"epoch": 0.14071729957805906,
"grad_norm": 0.80078125,
"learning_rate": 9.935091258394821e-07,
"loss": 0.3050011098384857,
"step": 667,
"token_acc": 0.9222160044767768
},
{
"epoch": 0.1409282700421941,
"grad_norm": 0.65625,
"learning_rate": 9.93481941253618e-07,
"loss": 0.24552345275878906,
"step": 668,
"token_acc": 0.9327153110047847
},
{
"epoch": 0.14113924050632912,
"grad_norm": 0.6328125,
"learning_rate": 9.934547002339174e-07,
"loss": 0.2593832015991211,
"step": 669,
"token_acc": 0.9236835410836938
},
{
"epoch": 0.14135021097046413,
"grad_norm": 0.765625,
"learning_rate": 9.93427402783495e-07,
"loss": 0.2546125054359436,
"step": 670,
"token_acc": 0.9320175438596491
},
{
"epoch": 0.14156118143459914,
"grad_norm": 0.84765625,
"learning_rate": 9.93400048905473e-07,
"loss": 0.27444595098495483,
"step": 671,
"token_acc": 0.9244940321743643
},
{
"epoch": 0.14177215189873418,
"grad_norm": 0.79296875,
"learning_rate": 9.93372638602979e-07,
"loss": 0.2675279378890991,
"step": 672,
"token_acc": 0.9254424136930665
},
{
"epoch": 0.1419831223628692,
"grad_norm": 0.63671875,
"learning_rate": 9.933451718791481e-07,
"loss": 0.22922030091285706,
"step": 673,
"token_acc": 0.9329147389292796
},
{
"epoch": 0.1421940928270042,
"grad_norm": 1.015625,
"learning_rate": 9.933176487371213e-07,
"loss": 0.3030126094818115,
"step": 674,
"token_acc": 0.9166399487015069
},
{
"epoch": 0.14240506329113925,
"grad_norm": 0.76171875,
"learning_rate": 9.932900691800457e-07,
"loss": 0.2756281793117523,
"step": 675,
"token_acc": 0.921146953405018
},
{
"epoch": 0.14261603375527426,
"grad_norm": 0.6875,
"learning_rate": 9.932624332110758e-07,
"loss": 0.23098278045654297,
"step": 676,
"token_acc": 0.937677859988617
},
{
"epoch": 0.14282700421940928,
"grad_norm": 0.58203125,
"learning_rate": 9.932347408333715e-07,
"loss": 0.22887524962425232,
"step": 677,
"token_acc": 0.9341611319665031
},
{
"epoch": 0.14303797468354432,
"grad_norm": 0.98046875,
"learning_rate": 9.932069920501e-07,
"loss": 0.2759955823421478,
"step": 678,
"token_acc": 0.9240579710144927
},
{
"epoch": 0.14324894514767933,
"grad_norm": 0.59375,
"learning_rate": 9.931791868644341e-07,
"loss": 0.2028590440750122,
"step": 679,
"token_acc": 0.9378352792679079
},
{
"epoch": 0.14345991561181434,
"grad_norm": 0.8203125,
"learning_rate": 9.931513252795543e-07,
"loss": 0.30346542596817017,
"step": 680,
"token_acc": 0.9129104062326099
},
{
"epoch": 0.14367088607594936,
"grad_norm": 0.98828125,
"learning_rate": 9.931234072986466e-07,
"loss": 0.27435851097106934,
"step": 681,
"token_acc": 0.9276353276353276
},
{
"epoch": 0.1438818565400844,
"grad_norm": 0.83203125,
"learning_rate": 9.930954329249032e-07,
"loss": 0.2799455523490906,
"step": 682,
"token_acc": 0.9241399588356366
},
{
"epoch": 0.1440928270042194,
"grad_norm": 0.82421875,
"learning_rate": 9.930674021615237e-07,
"loss": 0.28436923027038574,
"step": 683,
"token_acc": 0.9250146455770357
},
{
"epoch": 0.14430379746835442,
"grad_norm": 0.69921875,
"learning_rate": 9.930393150117133e-07,
"loss": 0.29506832361221313,
"step": 684,
"token_acc": 0.9232
},
{
"epoch": 0.14451476793248946,
"grad_norm": 0.66796875,
"learning_rate": 9.930111714786844e-07,
"loss": 0.27069365978240967,
"step": 685,
"token_acc": 0.926836079307456
},
{
"epoch": 0.14472573839662448,
"grad_norm": 0.78125,
"learning_rate": 9.92982971565655e-07,
"loss": 0.21447613835334778,
"step": 686,
"token_acc": 0.9363425925925926
},
{
"epoch": 0.1449367088607595,
"grad_norm": 0.66796875,
"learning_rate": 9.929547152758505e-07,
"loss": 0.2686905264854431,
"step": 687,
"token_acc": 0.9303255282695603
},
{
"epoch": 0.1451476793248945,
"grad_norm": 0.71484375,
"learning_rate": 9.929264026125017e-07,
"loss": 0.27938973903656006,
"step": 688,
"token_acc": 0.92536881689326
},
{
"epoch": 0.14535864978902954,
"grad_norm": 0.6953125,
"learning_rate": 9.928980335788469e-07,
"loss": 0.2390938103199005,
"step": 689,
"token_acc": 0.9279638490164805
},
{
"epoch": 0.14556962025316456,
"grad_norm": 0.69921875,
"learning_rate": 9.928696081781299e-07,
"loss": 0.2756063640117645,
"step": 690,
"token_acc": 0.920317553660688
},
{
"epoch": 0.14578059071729957,
"grad_norm": 0.76171875,
"learning_rate": 9.928411264136017e-07,
"loss": 0.23743261396884918,
"step": 691,
"token_acc": 0.9318757921419518
},
{
"epoch": 0.1459915611814346,
"grad_norm": 0.70703125,
"learning_rate": 9.928125882885193e-07,
"loss": 0.2753446102142334,
"step": 692,
"token_acc": 0.9266131265577402
},
{
"epoch": 0.14620253164556962,
"grad_norm": 0.8359375,
"learning_rate": 9.927839938061461e-07,
"loss": 0.24559064209461212,
"step": 693,
"token_acc": 0.9276517473942366
},
{
"epoch": 0.14641350210970464,
"grad_norm": 0.6640625,
"learning_rate": 9.927553429697526e-07,
"loss": 0.24906222522258759,
"step": 694,
"token_acc": 0.9354838709677419
},
{
"epoch": 0.14662447257383968,
"grad_norm": 0.73828125,
"learning_rate": 9.92726635782615e-07,
"loss": 0.26196521520614624,
"step": 695,
"token_acc": 0.9294971487817522
},
{
"epoch": 0.1468354430379747,
"grad_norm": 0.69921875,
"learning_rate": 9.92697872248016e-07,
"loss": 0.28846031427383423,
"step": 696,
"token_acc": 0.9216428779493154
},
{
"epoch": 0.1470464135021097,
"grad_norm": 0.71875,
"learning_rate": 9.926690523692454e-07,
"loss": 0.2781599164009094,
"step": 697,
"token_acc": 0.9191949534394713
},
{
"epoch": 0.14725738396624471,
"grad_norm": 0.6953125,
"learning_rate": 9.926401761495986e-07,
"loss": 0.24464154243469238,
"step": 698,
"token_acc": 0.9295774647887324
},
{
"epoch": 0.14746835443037976,
"grad_norm": 0.6328125,
"learning_rate": 9.926112435923778e-07,
"loss": 0.24627582728862762,
"step": 699,
"token_acc": 0.9308590242442383
},
{
"epoch": 0.14767932489451477,
"grad_norm": 0.80078125,
"learning_rate": 9.92582254700892e-07,
"loss": 0.27795839309692383,
"step": 700,
"token_acc": 0.9238838084991932
},
{
"epoch": 0.14789029535864978,
"grad_norm": 0.62890625,
"learning_rate": 9.925532094784563e-07,
"loss": 0.24588271975517273,
"step": 701,
"token_acc": 0.9284253578732107
},
{
"epoch": 0.14810126582278482,
"grad_norm": 0.66796875,
"learning_rate": 9.92524107928392e-07,
"loss": 0.24630197882652283,
"step": 702,
"token_acc": 0.9327267714364489
},
{
"epoch": 0.14831223628691984,
"grad_norm": 0.7421875,
"learning_rate": 9.924949500540275e-07,
"loss": 0.2578659653663635,
"step": 703,
"token_acc": 0.9267192784667418
},
{
"epoch": 0.14852320675105485,
"grad_norm": 0.671875,
"learning_rate": 9.924657358586967e-07,
"loss": 0.25091686844825745,
"step": 704,
"token_acc": 0.9329545454545455
},
{
"epoch": 0.14873417721518986,
"grad_norm": 0.71484375,
"learning_rate": 9.924364653457411e-07,
"loss": 0.2511135935783386,
"step": 705,
"token_acc": 0.9301768055139347
},
{
"epoch": 0.1489451476793249,
"grad_norm": 0.78125,
"learning_rate": 9.924071385185075e-07,
"loss": 0.2616545259952545,
"step": 706,
"token_acc": 0.927741935483871
},
{
"epoch": 0.14915611814345991,
"grad_norm": 0.68359375,
"learning_rate": 9.9237775538035e-07,
"loss": 0.2902517318725586,
"step": 707,
"token_acc": 0.9226774379688402
},
{
"epoch": 0.14936708860759493,
"grad_norm": 0.7421875,
"learning_rate": 9.92348315934629e-07,
"loss": 0.27046293020248413,
"step": 708,
"token_acc": 0.9296824368114064
},
{
"epoch": 0.14957805907172997,
"grad_norm": 0.62109375,
"learning_rate": 9.923188201847107e-07,
"loss": 0.20588457584381104,
"step": 709,
"token_acc": 0.9350493864112541
},
{
"epoch": 0.14978902953586498,
"grad_norm": 0.83984375,
"learning_rate": 9.92289268133968e-07,
"loss": 0.25359445810317993,
"step": 710,
"token_acc": 0.9285503395335105
},
{
"epoch": 0.15,
"grad_norm": 0.77734375,
"learning_rate": 9.922596597857811e-07,
"loss": 0.267612099647522,
"step": 711,
"token_acc": 0.9265569917743831
},
{
"epoch": 0.150210970464135,
"grad_norm": 0.9140625,
"learning_rate": 9.922299951435357e-07,
"loss": 0.2501794993877411,
"step": 712,
"token_acc": 0.9300184162062615
},
{
"epoch": 0.15042194092827005,
"grad_norm": 0.63671875,
"learning_rate": 9.922002742106242e-07,
"loss": 0.2614431381225586,
"step": 713,
"token_acc": 0.9250471825289835
},
{
"epoch": 0.15063291139240506,
"grad_norm": 0.61328125,
"learning_rate": 9.921704969904453e-07,
"loss": 0.2227068841457367,
"step": 714,
"token_acc": 0.934162192709805
},
{
"epoch": 0.15084388185654007,
"grad_norm": 0.95703125,
"learning_rate": 9.92140663486404e-07,
"loss": 0.2870650887489319,
"step": 715,
"token_acc": 0.9198347107438016
},
{
"epoch": 0.15105485232067511,
"grad_norm": 0.59765625,
"learning_rate": 9.92110773701913e-07,
"loss": 0.24414213001728058,
"step": 716,
"token_acc": 0.9290909090909091
},
{
"epoch": 0.15126582278481013,
"grad_norm": 0.6328125,
"learning_rate": 9.920808276403893e-07,
"loss": 0.27001482248306274,
"step": 717,
"token_acc": 0.9276070094800345
},
{
"epoch": 0.15147679324894514,
"grad_norm": 0.7265625,
"learning_rate": 9.920508253052584e-07,
"loss": 0.24048057198524475,
"step": 718,
"token_acc": 0.9305245535714286
},
{
"epoch": 0.15168776371308018,
"grad_norm": 0.9921875,
"learning_rate": 9.92020766699951e-07,
"loss": 0.267702579498291,
"step": 719,
"token_acc": 0.9254603916983338
},
{
"epoch": 0.1518987341772152,
"grad_norm": 0.66015625,
"learning_rate": 9.919906518279043e-07,
"loss": 0.23744544386863708,
"step": 720,
"token_acc": 0.9312857886517438
},
{
"epoch": 0.1521097046413502,
"grad_norm": 0.85546875,
"learning_rate": 9.919604806925623e-07,
"loss": 0.2514658570289612,
"step": 721,
"token_acc": 0.9293015332197615
},
{
"epoch": 0.15232067510548522,
"grad_norm": 0.7109375,
"learning_rate": 9.919302532973754e-07,
"loss": 0.2536316215991974,
"step": 722,
"token_acc": 0.9287620064034151
},
{
"epoch": 0.15253164556962026,
"grad_norm": 0.8359375,
"learning_rate": 9.918999696458006e-07,
"loss": 0.23538361489772797,
"step": 723,
"token_acc": 0.9311200744647844
},
{
"epoch": 0.15274261603375527,
"grad_norm": 0.64453125,
"learning_rate": 9.918696297413008e-07,
"loss": 0.2112676054239273,
"step": 724,
"token_acc": 0.9377406931964056
},
{
"epoch": 0.1529535864978903,
"grad_norm": 0.6171875,
"learning_rate": 9.918392335873457e-07,
"loss": 0.22141136229038239,
"step": 725,
"token_acc": 0.9383989145183175
},
{
"epoch": 0.15316455696202533,
"grad_norm": 0.8046875,
"learning_rate": 9.91808781187411e-07,
"loss": 0.27773499488830566,
"step": 726,
"token_acc": 0.9278820375335121
},
{
"epoch": 0.15337552742616034,
"grad_norm": 0.94921875,
"learning_rate": 9.917782725449799e-07,
"loss": 0.32096052169799805,
"step": 727,
"token_acc": 0.9175288205734555
},
{
"epoch": 0.15358649789029535,
"grad_norm": 0.71875,
"learning_rate": 9.91747707663541e-07,
"loss": 0.2451501190662384,
"step": 728,
"token_acc": 0.9271503803393798
},
{
"epoch": 0.15379746835443037,
"grad_norm": 1.546875,
"learning_rate": 9.917170865465894e-07,
"loss": 0.29911404848098755,
"step": 729,
"token_acc": 0.9180237372343362
},
{
"epoch": 0.1540084388185654,
"grad_norm": 0.953125,
"learning_rate": 9.91686409197627e-07,
"loss": 0.3019820749759674,
"step": 730,
"token_acc": 0.9172510518934082
},
{
"epoch": 0.15421940928270042,
"grad_norm": 1.1484375,
"learning_rate": 9.916556756201624e-07,
"loss": 0.281706839799881,
"step": 731,
"token_acc": 0.9272495213784301
},
{
"epoch": 0.15443037974683543,
"grad_norm": 0.93359375,
"learning_rate": 9.916248858177099e-07,
"loss": 0.27722233533859253,
"step": 732,
"token_acc": 0.9146039603960396
},
{
"epoch": 0.15464135021097047,
"grad_norm": 0.73046875,
"learning_rate": 9.915940397937906e-07,
"loss": 0.29295605421066284,
"step": 733,
"token_acc": 0.9225071225071225
},
{
"epoch": 0.1548523206751055,
"grad_norm": 0.80078125,
"learning_rate": 9.91563137551932e-07,
"loss": 0.24334131181240082,
"step": 734,
"token_acc": 0.9294417682062908
},
{
"epoch": 0.1550632911392405,
"grad_norm": 0.81640625,
"learning_rate": 9.91532179095668e-07,
"loss": 0.2572481334209442,
"step": 735,
"token_acc": 0.9311695579182988
},
{
"epoch": 0.15527426160337554,
"grad_norm": 0.75,
"learning_rate": 9.915011644285391e-07,
"loss": 0.26280131936073303,
"step": 736,
"token_acc": 0.9256795835743205
},
{
"epoch": 0.15548523206751055,
"grad_norm": 0.62890625,
"learning_rate": 9.91470093554092e-07,
"loss": 0.23156148195266724,
"step": 737,
"token_acc": 0.9377058999700509
},
{
"epoch": 0.15569620253164557,
"grad_norm": 0.60546875,
"learning_rate": 9.914389664758799e-07,
"loss": 0.24967870116233826,
"step": 738,
"token_acc": 0.9284906726964387
},
{
"epoch": 0.15590717299578058,
"grad_norm": 0.6484375,
"learning_rate": 9.914077831974626e-07,
"loss": 0.24829944968223572,
"step": 739,
"token_acc": 0.931044267877412
},
{
"epoch": 0.15611814345991562,
"grad_norm": 0.61328125,
"learning_rate": 9.91376543722406e-07,
"loss": 0.24180346727371216,
"step": 740,
"token_acc": 0.9371293001186239
},
{
"epoch": 0.15632911392405063,
"grad_norm": 0.84765625,
"learning_rate": 9.913452480542825e-07,
"loss": 0.26637858152389526,
"step": 741,
"token_acc": 0.9162153552086651
},
{
"epoch": 0.15654008438818565,
"grad_norm": 0.5625,
"learning_rate": 9.913138961966715e-07,
"loss": 0.22019389271736145,
"step": 742,
"token_acc": 0.9336415556159913
},
{
"epoch": 0.1567510548523207,
"grad_norm": 0.83203125,
"learning_rate": 9.912824881531577e-07,
"loss": 0.2972027361392975,
"step": 743,
"token_acc": 0.9233983286908078
},
{
"epoch": 0.1569620253164557,
"grad_norm": 0.6953125,
"learning_rate": 9.912510239273332e-07,
"loss": 0.26124250888824463,
"step": 744,
"token_acc": 0.9267277268942548
},
{
"epoch": 0.1571729957805907,
"grad_norm": 0.95703125,
"learning_rate": 9.912195035227964e-07,
"loss": 0.32723483443260193,
"step": 745,
"token_acc": 0.9195718654434251
},
{
"epoch": 0.15738396624472573,
"grad_norm": 0.71484375,
"learning_rate": 9.911879269431517e-07,
"loss": 0.23630690574645996,
"step": 746,
"token_acc": 0.9347326049453709
},
{
"epoch": 0.15759493670886077,
"grad_norm": 0.6796875,
"learning_rate": 9.911562941920099e-07,
"loss": 0.21784129738807678,
"step": 747,
"token_acc": 0.9337892446378614
},
{
"epoch": 0.15780590717299578,
"grad_norm": 0.76171875,
"learning_rate": 9.911246052729891e-07,
"loss": 0.26233282685279846,
"step": 748,
"token_acc": 0.9323260937991816
},
{
"epoch": 0.1580168776371308,
"grad_norm": 0.6328125,
"learning_rate": 9.910928601897126e-07,
"loss": 0.2327466756105423,
"step": 749,
"token_acc": 0.9362054681027341
},
{
"epoch": 0.15822784810126583,
"grad_norm": 0.7578125,
"learning_rate": 9.91061058945811e-07,
"loss": 0.27062827348709106,
"step": 750,
"token_acc": 0.918719909374115
},
{
"epoch": 0.15843881856540085,
"grad_norm": 0.671875,
"learning_rate": 9.910292015449211e-07,
"loss": 0.20303724706172943,
"step": 751,
"token_acc": 0.9412310547479121
},
{
"epoch": 0.15864978902953586,
"grad_norm": 0.65234375,
"learning_rate": 9.909972879906858e-07,
"loss": 0.24677664041519165,
"step": 752,
"token_acc": 0.925
},
{
"epoch": 0.15886075949367087,
"grad_norm": 0.7578125,
"learning_rate": 9.90965318286755e-07,
"loss": 0.2709593176841736,
"step": 753,
"token_acc": 0.9230769230769231
},
{
"epoch": 0.1590717299578059,
"grad_norm": 0.99609375,
"learning_rate": 9.909332924367846e-07,
"loss": 0.265384703874588,
"step": 754,
"token_acc": 0.9230769230769231
},
{
"epoch": 0.15928270042194093,
"grad_norm": 0.80078125,
"learning_rate": 9.909012104444368e-07,
"loss": 0.2868095636367798,
"step": 755,
"token_acc": 0.920038228735266
},
{
"epoch": 0.15949367088607594,
"grad_norm": 0.81640625,
"learning_rate": 9.908690723133807e-07,
"loss": 0.24986404180526733,
"step": 756,
"token_acc": 0.9256695756846224
},
{
"epoch": 0.15970464135021098,
"grad_norm": 0.62109375,
"learning_rate": 9.908368780472916e-07,
"loss": 0.20347082614898682,
"step": 757,
"token_acc": 0.9389263902282224
},
{
"epoch": 0.159915611814346,
"grad_norm": 0.59375,
"learning_rate": 9.908046276498511e-07,
"loss": 0.2612215578556061,
"step": 758,
"token_acc": 0.9279416235937975
},
{
"epoch": 0.160126582278481,
"grad_norm": 0.68359375,
"learning_rate": 9.907723211247472e-07,
"loss": 0.23647598922252655,
"step": 759,
"token_acc": 0.9354395604395604
},
{
"epoch": 0.16033755274261605,
"grad_norm": 0.72265625,
"learning_rate": 9.907399584756744e-07,
"loss": 0.28146815299987793,
"step": 760,
"token_acc": 0.92171219374824
},
{
"epoch": 0.16054852320675106,
"grad_norm": 0.72265625,
"learning_rate": 9.90707539706334e-07,
"loss": 0.2895510494709015,
"step": 761,
"token_acc": 0.9175170068027211
},
{
"epoch": 0.16075949367088607,
"grad_norm": 0.64453125,
"learning_rate": 9.90675064820433e-07,
"loss": 0.25281795859336853,
"step": 762,
"token_acc": 0.9276477832512315
},
{
"epoch": 0.16097046413502109,
"grad_norm": 0.67578125,
"learning_rate": 9.906425338216852e-07,
"loss": 0.2702397108078003,
"step": 763,
"token_acc": 0.9337220006136852
},
{
"epoch": 0.16118143459915613,
"grad_norm": 0.82421875,
"learning_rate": 9.906099467138111e-07,
"loss": 0.3201596736907959,
"step": 764,
"token_acc": 0.9149093599704032
},
{
"epoch": 0.16139240506329114,
"grad_norm": 0.8359375,
"learning_rate": 9.90577303500537e-07,
"loss": 0.26031017303466797,
"step": 765,
"token_acc": 0.922656699252444
},
{
"epoch": 0.16160337552742615,
"grad_norm": 0.84765625,
"learning_rate": 9.90544604185596e-07,
"loss": 0.2320261150598526,
"step": 766,
"token_acc": 0.933118216485773
},
{
"epoch": 0.1618143459915612,
"grad_norm": 0.69921875,
"learning_rate": 9.905118487727277e-07,
"loss": 0.2794190049171448,
"step": 767,
"token_acc": 0.9201467268623025
},
{
"epoch": 0.1620253164556962,
"grad_norm": 0.84375,
"learning_rate": 9.904790372656778e-07,
"loss": 0.2765384018421173,
"step": 768,
"token_acc": 0.9225014961101137
},
{
"epoch": 0.16223628691983122,
"grad_norm": 0.84765625,
"learning_rate": 9.904461696681984e-07,
"loss": 0.3068510890007019,
"step": 769,
"token_acc": 0.9177502267916541
},
{
"epoch": 0.16244725738396623,
"grad_norm": 0.71875,
"learning_rate": 9.904132459840485e-07,
"loss": 0.28465330600738525,
"step": 770,
"token_acc": 0.9240544629349471
},
{
"epoch": 0.16265822784810127,
"grad_norm": 0.75,
"learning_rate": 9.903802662169932e-07,
"loss": 0.2329617142677307,
"step": 771,
"token_acc": 0.9319875776397516
},
{
"epoch": 0.16286919831223629,
"grad_norm": 0.64453125,
"learning_rate": 9.903472303708038e-07,
"loss": 0.2284744679927826,
"step": 772,
"token_acc": 0.931237721021611
},
{
"epoch": 0.1630801687763713,
"grad_norm": 1.140625,
"learning_rate": 9.903141384492583e-07,
"loss": 0.23831237852573395,
"step": 773,
"token_acc": 0.9291455790413814
},
{
"epoch": 0.16329113924050634,
"grad_norm": 0.6796875,
"learning_rate": 9.902809904561414e-07,
"loss": 0.23870491981506348,
"step": 774,
"token_acc": 0.9348308374930671
},
{
"epoch": 0.16350210970464135,
"grad_norm": 0.75,
"learning_rate": 9.902477863952431e-07,
"loss": 0.27838945388793945,
"step": 775,
"token_acc": 0.9261146496815287
},
{
"epoch": 0.16371308016877636,
"grad_norm": 0.68359375,
"learning_rate": 9.902145262703613e-07,
"loss": 0.2492181956768036,
"step": 776,
"token_acc": 0.9293759512937595
},
{
"epoch": 0.1639240506329114,
"grad_norm": 0.58984375,
"learning_rate": 9.901812100852993e-07,
"loss": 0.2085292637348175,
"step": 777,
"token_acc": 0.9389517569982132
},
{
"epoch": 0.16413502109704642,
"grad_norm": 0.953125,
"learning_rate": 9.90147837843867e-07,
"loss": 0.2694481909275055,
"step": 778,
"token_acc": 0.9280432309442548
},
{
"epoch": 0.16434599156118143,
"grad_norm": 0.80859375,
"learning_rate": 9.901144095498808e-07,
"loss": 0.25209715962409973,
"step": 779,
"token_acc": 0.9302030456852792
},
{
"epoch": 0.16455696202531644,
"grad_norm": 0.88671875,
"learning_rate": 9.900809252071635e-07,
"loss": 0.31358861923217773,
"step": 780,
"token_acc": 0.9151069518716578
},
{
"epoch": 0.16476793248945149,
"grad_norm": 0.734375,
"learning_rate": 9.900473848195446e-07,
"loss": 0.23959940671920776,
"step": 781,
"token_acc": 0.9326456310679612
},
{
"epoch": 0.1649789029535865,
"grad_norm": 0.80078125,
"learning_rate": 9.900137883908592e-07,
"loss": 0.29789382219314575,
"step": 782,
"token_acc": 0.9166666666666666
},
{
"epoch": 0.1651898734177215,
"grad_norm": 0.796875,
"learning_rate": 9.8998013592495e-07,
"loss": 0.22469905018806458,
"step": 783,
"token_acc": 0.933932193567082
},
{
"epoch": 0.16540084388185655,
"grad_norm": 0.6875,
"learning_rate": 9.89946427425665e-07,
"loss": 0.28561171889305115,
"step": 784,
"token_acc": 0.9251565167899829
},
{
"epoch": 0.16561181434599156,
"grad_norm": 0.90625,
"learning_rate": 9.89912662896859e-07,
"loss": 0.28935301303863525,
"step": 785,
"token_acc": 0.9215164615896242
},
{
"epoch": 0.16582278481012658,
"grad_norm": 0.73046875,
"learning_rate": 9.898788423423935e-07,
"loss": 0.2708919048309326,
"step": 786,
"token_acc": 0.927591706539075
},
{
"epoch": 0.1660337552742616,
"grad_norm": 0.66015625,
"learning_rate": 9.898449657661362e-07,
"loss": 0.2672666311264038,
"step": 787,
"token_acc": 0.9263598326359833
},
{
"epoch": 0.16624472573839663,
"grad_norm": 0.8046875,
"learning_rate": 9.89811033171961e-07,
"loss": 0.2862321734428406,
"step": 788,
"token_acc": 0.9180470793374019
},
{
"epoch": 0.16645569620253164,
"grad_norm": 0.8046875,
"learning_rate": 9.897770445637483e-07,
"loss": 0.2871711850166321,
"step": 789,
"token_acc": 0.9249183895538629
},
{
"epoch": 0.16666666666666666,
"grad_norm": 0.80859375,
"learning_rate": 9.897429999453852e-07,
"loss": 0.2397966980934143,
"step": 790,
"token_acc": 0.9377081945369754
},
{
"epoch": 0.1668776371308017,
"grad_norm": 0.80859375,
"learning_rate": 9.89708899320765e-07,
"loss": 0.2998412847518921,
"step": 791,
"token_acc": 0.9202363367799113
},
{
"epoch": 0.1670886075949367,
"grad_norm": 0.6640625,
"learning_rate": 9.89674742693787e-07,
"loss": 0.27035748958587646,
"step": 792,
"token_acc": 0.9229891614375356
},
{
"epoch": 0.16729957805907172,
"grad_norm": 0.76953125,
"learning_rate": 9.89640530068358e-07,
"loss": 0.17933598160743713,
"step": 793,
"token_acc": 0.9483188044831881
},
{
"epoch": 0.16751054852320676,
"grad_norm": 0.91796875,
"learning_rate": 9.896062614483898e-07,
"loss": 0.2540227472782135,
"step": 794,
"token_acc": 0.929927414852038
},
{
"epoch": 0.16772151898734178,
"grad_norm": 0.68359375,
"learning_rate": 9.895719368378016e-07,
"loss": 0.24861711263656616,
"step": 795,
"token_acc": 0.9333129397369226
},
{
"epoch": 0.1679324894514768,
"grad_norm": 0.69140625,
"learning_rate": 9.89537556240519e-07,
"loss": 0.2633477449417114,
"step": 796,
"token_acc": 0.9196560924992588
},
{
"epoch": 0.1681434599156118,
"grad_norm": 0.8671875,
"learning_rate": 9.89503119660473e-07,
"loss": 0.2524856925010681,
"step": 797,
"token_acc": 0.9313725490196079
},
{
"epoch": 0.16835443037974684,
"grad_norm": 0.91796875,
"learning_rate": 9.894686271016027e-07,
"loss": 0.30388563871383667,
"step": 798,
"token_acc": 0.9198352344740177
},
{
"epoch": 0.16856540084388186,
"grad_norm": 0.6328125,
"learning_rate": 9.894340785678517e-07,
"loss": 0.2910333573818207,
"step": 799,
"token_acc": 0.9234449760765551
},
{
"epoch": 0.16877637130801687,
"grad_norm": 0.859375,
"learning_rate": 9.893994740631713e-07,
"loss": 0.25983309745788574,
"step": 800,
"token_acc": 0.9273416982783775
},
{
"epoch": 0.16877637130801687,
"eval_loss": 0.43369975686073303,
"eval_runtime": 245.7245,
"eval_samples_per_second": 137.166,
"eval_steps_per_second": 2.145,
"eval_token_acc": 0.8991631517544647,
"step": 800
},
{
"epoch": 0.1689873417721519,
"grad_norm": 0.640625,
"learning_rate": 9.893648135915188e-07,
"loss": 0.26705414056777954,
"step": 801,
"token_acc": 0.9297355062783863
},
{
"epoch": 0.16919831223628692,
"grad_norm": 0.68359375,
"learning_rate": 9.893300971568578e-07,
"loss": 0.2386769950389862,
"step": 802,
"token_acc": 0.935367545076283
},
{
"epoch": 0.16940928270042194,
"grad_norm": 0.71875,
"learning_rate": 9.892953247631589e-07,
"loss": 0.2654857337474823,
"step": 803,
"token_acc": 0.9260304912478825
},
{
"epoch": 0.16962025316455695,
"grad_norm": 0.71484375,
"learning_rate": 9.89260496414398e-07,
"loss": 0.2585882842540741,
"step": 804,
"token_acc": 0.9277403551745255
},
{
"epoch": 0.169831223628692,
"grad_norm": 0.76953125,
"learning_rate": 9.892256121145584e-07,
"loss": 0.25679513812065125,
"step": 805,
"token_acc": 0.928305133352452
},
{
"epoch": 0.170042194092827,
"grad_norm": 0.671875,
"learning_rate": 9.891906718676291e-07,
"loss": 0.269248902797699,
"step": 806,
"token_acc": 0.9328318108543794
},
{
"epoch": 0.17025316455696202,
"grad_norm": 0.69140625,
"learning_rate": 9.89155675677606e-07,
"loss": 0.2557290196418762,
"step": 807,
"token_acc": 0.9275456919060052
},
{
"epoch": 0.17046413502109706,
"grad_norm": 0.63671875,
"learning_rate": 9.891206235484913e-07,
"loss": 0.23980513215065002,
"step": 808,
"token_acc": 0.9353355807539074
},
{
"epoch": 0.17067510548523207,
"grad_norm": 0.65625,
"learning_rate": 9.890855154842935e-07,
"loss": 0.2392064481973648,
"step": 809,
"token_acc": 0.9303870595031773
},
{
"epoch": 0.17088607594936708,
"grad_norm": 0.7890625,
"learning_rate": 9.890503514890275e-07,
"loss": 0.23739401996135712,
"step": 810,
"token_acc": 0.9328014728444308
},
{
"epoch": 0.1710970464135021,
"grad_norm": 0.7890625,
"learning_rate": 9.89015131566714e-07,
"loss": 0.2900955080986023,
"step": 811,
"token_acc": 0.9233797698364627
},
{
"epoch": 0.17130801687763714,
"grad_norm": 0.69921875,
"learning_rate": 9.889798557213818e-07,
"loss": 0.27924585342407227,
"step": 812,
"token_acc": 0.9214012363850457
},
{
"epoch": 0.17151898734177215,
"grad_norm": 0.7421875,
"learning_rate": 9.88944523957064e-07,
"loss": 0.2851244807243347,
"step": 813,
"token_acc": 0.9264660254421346
},
{
"epoch": 0.17172995780590716,
"grad_norm": 1.1875,
"learning_rate": 9.889091362778017e-07,
"loss": 0.23967793583869934,
"step": 814,
"token_acc": 0.9278113316077078
},
{
"epoch": 0.1719409282700422,
"grad_norm": 0.6796875,
"learning_rate": 9.888736926876415e-07,
"loss": 0.23629070818424225,
"step": 815,
"token_acc": 0.9284259528658714
},
{
"epoch": 0.17215189873417722,
"grad_norm": 0.93359375,
"learning_rate": 9.88838193190637e-07,
"loss": 0.3324328064918518,
"step": 816,
"token_acc": 0.9166411277965063
},
{
"epoch": 0.17236286919831223,
"grad_norm": 0.73046875,
"learning_rate": 9.888026377908472e-07,
"loss": 0.2603840231895447,
"step": 817,
"token_acc": 0.9286516853932584
},
{
"epoch": 0.17257383966244727,
"grad_norm": 0.7421875,
"learning_rate": 9.887670264923387e-07,
"loss": 0.25500792264938354,
"step": 818,
"token_acc": 0.9334559950935296
},
{
"epoch": 0.17278481012658228,
"grad_norm": 0.69921875,
"learning_rate": 9.88731359299184e-07,
"loss": 0.25971531867980957,
"step": 819,
"token_acc": 0.928698752228164
},
{
"epoch": 0.1729957805907173,
"grad_norm": 0.65625,
"learning_rate": 9.886956362154617e-07,
"loss": 0.2521659731864929,
"step": 820,
"token_acc": 0.9252336448598131
},
{
"epoch": 0.1732067510548523,
"grad_norm": 1.0078125,
"learning_rate": 9.88659857245257e-07,
"loss": 0.23598764836788177,
"step": 821,
"token_acc": 0.9364791288566243
},
{
"epoch": 0.17341772151898735,
"grad_norm": 2.28125,
"learning_rate": 9.886240223926617e-07,
"loss": 0.2466164529323578,
"step": 822,
"token_acc": 0.9289433384379786
},
{
"epoch": 0.17362869198312236,
"grad_norm": 0.609375,
"learning_rate": 9.88588131661774e-07,
"loss": 0.2287391722202301,
"step": 823,
"token_acc": 0.9339788732394366
},
{
"epoch": 0.17383966244725738,
"grad_norm": 0.9921875,
"learning_rate": 9.885521850566977e-07,
"loss": 0.3011782765388489,
"step": 824,
"token_acc": 0.9158669225847729
},
{
"epoch": 0.17405063291139242,
"grad_norm": 0.88671875,
"learning_rate": 9.88516182581544e-07,
"loss": 0.24732713401317596,
"step": 825,
"token_acc": 0.9297990096125838
},
{
"epoch": 0.17426160337552743,
"grad_norm": 0.8046875,
"learning_rate": 9.884801242404303e-07,
"loss": 0.2557525038719177,
"step": 826,
"token_acc": 0.9272430668841762
},
{
"epoch": 0.17447257383966244,
"grad_norm": 0.58203125,
"learning_rate": 9.884440100374798e-07,
"loss": 0.2030971348285675,
"step": 827,
"token_acc": 0.9397905759162304
},
{
"epoch": 0.17468354430379746,
"grad_norm": 0.72265625,
"learning_rate": 9.884078399768226e-07,
"loss": 0.23457200825214386,
"step": 828,
"token_acc": 0.9320594479830149
},
{
"epoch": 0.1748945147679325,
"grad_norm": 0.75,
"learning_rate": 9.88371614062595e-07,
"loss": 0.27027198672294617,
"step": 829,
"token_acc": 0.9281524926686217
},
{
"epoch": 0.1751054852320675,
"grad_norm": 0.578125,
"learning_rate": 9.8833533229894e-07,
"loss": 0.2117438018321991,
"step": 830,
"token_acc": 0.9373803664205633
},
{
"epoch": 0.17531645569620252,
"grad_norm": 0.71484375,
"learning_rate": 9.882989946900063e-07,
"loss": 0.21879255771636963,
"step": 831,
"token_acc": 0.9381270903010034
},
{
"epoch": 0.17552742616033756,
"grad_norm": 0.75,
"learning_rate": 9.882626012399495e-07,
"loss": 0.27527743577957153,
"step": 832,
"token_acc": 0.9288235294117647
},
{
"epoch": 0.17573839662447258,
"grad_norm": 0.74609375,
"learning_rate": 9.882261519529318e-07,
"loss": 0.2788648307323456,
"step": 833,
"token_acc": 0.9233587786259542
},
{
"epoch": 0.1759493670886076,
"grad_norm": 0.75,
"learning_rate": 9.881896468331215e-07,
"loss": 0.26945775747299194,
"step": 834,
"token_acc": 0.9267376330619912
},
{
"epoch": 0.17616033755274263,
"grad_norm": 0.89453125,
"learning_rate": 9.881530858846928e-07,
"loss": 0.30408790707588196,
"step": 835,
"token_acc": 0.9146719234018587
},
{
"epoch": 0.17637130801687764,
"grad_norm": 0.8125,
"learning_rate": 9.88116469111827e-07,
"loss": 0.2584350109100342,
"step": 836,
"token_acc": 0.9246458923512748
},
{
"epoch": 0.17658227848101266,
"grad_norm": 0.8125,
"learning_rate": 9.880797965187119e-07,
"loss": 0.3014131784439087,
"step": 837,
"token_acc": 0.9151281344723065
},
{
"epoch": 0.17679324894514767,
"grad_norm": 0.59375,
"learning_rate": 9.880430681095407e-07,
"loss": 0.21773123741149902,
"step": 838,
"token_acc": 0.9367166004280036
},
{
"epoch": 0.1770042194092827,
"grad_norm": 1.9609375,
"learning_rate": 9.88006283888514e-07,
"loss": 0.26315808296203613,
"step": 839,
"token_acc": 0.9259569712210115
},
{
"epoch": 0.17721518987341772,
"grad_norm": 0.93359375,
"learning_rate": 9.879694438598383e-07,
"loss": 0.2646620571613312,
"step": 840,
"token_acc": 0.9318522966076254
},
{
"epoch": 0.17742616033755274,
"grad_norm": 0.78125,
"learning_rate": 9.879325480277266e-07,
"loss": 0.2713755965232849,
"step": 841,
"token_acc": 0.9206214689265537
},
{
"epoch": 0.17763713080168778,
"grad_norm": 0.859375,
"learning_rate": 9.878955963963979e-07,
"loss": 0.26667535305023193,
"step": 842,
"token_acc": 0.9262686567164179
},
{
"epoch": 0.1778481012658228,
"grad_norm": 0.734375,
"learning_rate": 9.878585889700785e-07,
"loss": 0.24986431002616882,
"step": 843,
"token_acc": 0.9370695053224797
},
{
"epoch": 0.1780590717299578,
"grad_norm": 1.4140625,
"learning_rate": 9.878215257530004e-07,
"loss": 0.2651556730270386,
"step": 844,
"token_acc": 0.9286946520989074
},
{
"epoch": 0.17827004219409281,
"grad_norm": 0.828125,
"learning_rate": 9.877844067494017e-07,
"loss": 0.2608075737953186,
"step": 845,
"token_acc": 0.9263001485884101
},
{
"epoch": 0.17848101265822786,
"grad_norm": 0.75390625,
"learning_rate": 9.877472319635275e-07,
"loss": 0.28958860039711,
"step": 846,
"token_acc": 0.9233128834355828
},
{
"epoch": 0.17869198312236287,
"grad_norm": 1.0234375,
"learning_rate": 9.877100013996291e-07,
"loss": 0.2941049039363861,
"step": 847,
"token_acc": 0.9213372664700098
},
{
"epoch": 0.17890295358649788,
"grad_norm": 0.84375,
"learning_rate": 9.876727150619642e-07,
"loss": 0.2620714604854584,
"step": 848,
"token_acc": 0.9288433382137629
},
{
"epoch": 0.17911392405063292,
"grad_norm": 0.60546875,
"learning_rate": 9.876353729547968e-07,
"loss": 0.2020449936389923,
"step": 849,
"token_acc": 0.9392366412213741
},
{
"epoch": 0.17932489451476794,
"grad_norm": 0.953125,
"learning_rate": 9.875979750823969e-07,
"loss": 0.2892880439758301,
"step": 850,
"token_acc": 0.9246448424953675
},
{
"epoch": 0.17953586497890295,
"grad_norm": 0.70703125,
"learning_rate": 9.875605214490417e-07,
"loss": 0.2778629660606384,
"step": 851,
"token_acc": 0.923582580115037
},
{
"epoch": 0.17974683544303796,
"grad_norm": 0.5859375,
"learning_rate": 9.875230120590142e-07,
"loss": 0.23370903730392456,
"step": 852,
"token_acc": 0.9249401117913228
},
{
"epoch": 0.179957805907173,
"grad_norm": 0.75390625,
"learning_rate": 9.874854469166038e-07,
"loss": 0.28334856033325195,
"step": 853,
"token_acc": 0.9264705882352942
},
{
"epoch": 0.18016877637130801,
"grad_norm": 0.55078125,
"learning_rate": 9.874478260261067e-07,
"loss": 0.2282511293888092,
"step": 854,
"token_acc": 0.933620159803319
},
{
"epoch": 0.18037974683544303,
"grad_norm": 1.1015625,
"learning_rate": 9.874101493918249e-07,
"loss": 0.27366286516189575,
"step": 855,
"token_acc": 0.9260089686098655
},
{
"epoch": 0.18059071729957807,
"grad_norm": 0.6875,
"learning_rate": 9.87372417018067e-07,
"loss": 0.25619056820869446,
"step": 856,
"token_acc": 0.9255610290093049
},
{
"epoch": 0.18080168776371308,
"grad_norm": 0.828125,
"learning_rate": 9.873346289091483e-07,
"loss": 0.270757257938385,
"step": 857,
"token_acc": 0.923998738568275
},
{
"epoch": 0.1810126582278481,
"grad_norm": 0.66796875,
"learning_rate": 9.8729678506939e-07,
"loss": 0.28628918528556824,
"step": 858,
"token_acc": 0.9234957020057306
},
{
"epoch": 0.18122362869198314,
"grad_norm": 0.7265625,
"learning_rate": 9.872588855031197e-07,
"loss": 0.2525092661380768,
"step": 859,
"token_acc": 0.9317073170731708
},
{
"epoch": 0.18143459915611815,
"grad_norm": 0.83984375,
"learning_rate": 9.872209302146718e-07,
"loss": 0.28244319558143616,
"step": 860,
"token_acc": 0.9267202859696158
},
{
"epoch": 0.18164556962025316,
"grad_norm": 0.828125,
"learning_rate": 9.871829192083867e-07,
"loss": 0.254133403301239,
"step": 861,
"token_acc": 0.928436911487759
},
{
"epoch": 0.18185654008438817,
"grad_norm": 0.75,
"learning_rate": 9.871448524886113e-07,
"loss": 0.2619815468788147,
"step": 862,
"token_acc": 0.9243888573052871
},
{
"epoch": 0.18206751054852321,
"grad_norm": 0.91796875,
"learning_rate": 9.87106730059699e-07,
"loss": 0.2682092487812042,
"step": 863,
"token_acc": 0.9261158021712907
},
{
"epoch": 0.18227848101265823,
"grad_norm": 0.6015625,
"learning_rate": 9.870685519260092e-07,
"loss": 0.245108962059021,
"step": 864,
"token_acc": 0.9272880404267265
},
{
"epoch": 0.18248945147679324,
"grad_norm": 0.9609375,
"learning_rate": 9.870303180919078e-07,
"loss": 0.2907876670360565,
"step": 865,
"token_acc": 0.9245337159253946
},
{
"epoch": 0.18270042194092828,
"grad_norm": 0.60546875,
"learning_rate": 9.869920285617676e-07,
"loss": 0.24249601364135742,
"step": 866,
"token_acc": 0.931304347826087
},
{
"epoch": 0.1829113924050633,
"grad_norm": 0.83203125,
"learning_rate": 9.869536833399669e-07,
"loss": 0.2370653748512268,
"step": 867,
"token_acc": 0.9323520200438459
},
{
"epoch": 0.1831223628691983,
"grad_norm": 0.859375,
"learning_rate": 9.869152824308912e-07,
"loss": 0.3008883595466614,
"step": 868,
"token_acc": 0.9213197969543148
},
{
"epoch": 0.18333333333333332,
"grad_norm": 0.8125,
"learning_rate": 9.868768258389314e-07,
"loss": 0.2317754030227661,
"step": 869,
"token_acc": 0.9351635514018691
},
{
"epoch": 0.18354430379746836,
"grad_norm": 0.71484375,
"learning_rate": 9.868383135684857e-07,
"loss": 0.2313736081123352,
"step": 870,
"token_acc": 0.9373202990224266
},
{
"epoch": 0.18375527426160337,
"grad_norm": 0.9296875,
"learning_rate": 9.867997456239586e-07,
"loss": 0.28026607632637024,
"step": 871,
"token_acc": 0.9176308539944904
},
{
"epoch": 0.1839662447257384,
"grad_norm": 0.6875,
"learning_rate": 9.8676112200976e-07,
"loss": 0.254774272441864,
"step": 872,
"token_acc": 0.9306763962952568
},
{
"epoch": 0.18417721518987343,
"grad_norm": 0.75,
"learning_rate": 9.867224427303073e-07,
"loss": 0.24183842539787292,
"step": 873,
"token_acc": 0.9341463414634147
},
{
"epoch": 0.18438818565400844,
"grad_norm": 0.86328125,
"learning_rate": 9.86683707790024e-07,
"loss": 0.23453059792518616,
"step": 874,
"token_acc": 0.9323812299621101
},
{
"epoch": 0.18459915611814345,
"grad_norm": 0.734375,
"learning_rate": 9.86644917193339e-07,
"loss": 0.24839141964912415,
"step": 875,
"token_acc": 0.9287037037037037
},
{
"epoch": 0.1848101265822785,
"grad_norm": 0.671875,
"learning_rate": 9.86606070944689e-07,
"loss": 0.2521136403083801,
"step": 876,
"token_acc": 0.9300189993666877
},
{
"epoch": 0.1850210970464135,
"grad_norm": 1.046875,
"learning_rate": 9.865671690485162e-07,
"loss": 0.3050832748413086,
"step": 877,
"token_acc": 0.9174647887323943
},
{
"epoch": 0.18523206751054852,
"grad_norm": 0.74609375,
"learning_rate": 9.865282115092692e-07,
"loss": 0.2835577726364136,
"step": 878,
"token_acc": 0.9225286643941741
},
{
"epoch": 0.18544303797468353,
"grad_norm": 0.7890625,
"learning_rate": 9.864891983314033e-07,
"loss": 0.29184651374816895,
"step": 879,
"token_acc": 0.9218231210383339
},
{
"epoch": 0.18565400843881857,
"grad_norm": 2.859375,
"learning_rate": 9.8645012951938e-07,
"loss": 0.2807004451751709,
"step": 880,
"token_acc": 0.9238008500303583
},
{
"epoch": 0.1858649789029536,
"grad_norm": 0.62109375,
"learning_rate": 9.864110050776672e-07,
"loss": 0.25495046377182007,
"step": 881,
"token_acc": 0.9281183932346723
},
{
"epoch": 0.1860759493670886,
"grad_norm": 0.62890625,
"learning_rate": 9.86371825010739e-07,
"loss": 0.26357853412628174,
"step": 882,
"token_acc": 0.9274457329765091
},
{
"epoch": 0.18628691983122364,
"grad_norm": 0.703125,
"learning_rate": 9.86332589323076e-07,
"loss": 0.2856602966785431,
"step": 883,
"token_acc": 0.9247496423462088
},
{
"epoch": 0.18649789029535865,
"grad_norm": 0.63671875,
"learning_rate": 9.862932980191652e-07,
"loss": 0.26217591762542725,
"step": 884,
"token_acc": 0.9308156140907649
},
{
"epoch": 0.18670886075949367,
"grad_norm": 0.859375,
"learning_rate": 9.862539511034997e-07,
"loss": 0.2957126498222351,
"step": 885,
"token_acc": 0.9175007582650895
},
{
"epoch": 0.18691983122362868,
"grad_norm": 0.71875,
"learning_rate": 9.862145485805793e-07,
"loss": 0.2381889373064041,
"step": 886,
"token_acc": 0.9338040600176523
},
{
"epoch": 0.18713080168776372,
"grad_norm": 0.61328125,
"learning_rate": 9.861750904549099e-07,
"loss": 0.23038305342197418,
"step": 887,
"token_acc": 0.933588010578901
},
{
"epoch": 0.18734177215189873,
"grad_norm": 0.7578125,
"learning_rate": 9.86135576731004e-07,
"loss": 0.2670343518257141,
"step": 888,
"token_acc": 0.9209346991037132
},
{
"epoch": 0.18755274261603375,
"grad_norm": 0.875,
"learning_rate": 9.860960074133802e-07,
"loss": 0.3037135899066925,
"step": 889,
"token_acc": 0.9173450219160927
},
{
"epoch": 0.1877637130801688,
"grad_norm": 0.6953125,
"learning_rate": 9.860563825065637e-07,
"loss": 0.23587052524089813,
"step": 890,
"token_acc": 0.9326950971859588
},
{
"epoch": 0.1879746835443038,
"grad_norm": 1.046875,
"learning_rate": 9.86016702015086e-07,
"loss": 0.2837037444114685,
"step": 891,
"token_acc": 0.9186681222707423
},
{
"epoch": 0.1881856540084388,
"grad_norm": 0.80078125,
"learning_rate": 9.85976965943485e-07,
"loss": 0.273685485124588,
"step": 892,
"token_acc": 0.9198871650211565
},
{
"epoch": 0.18839662447257383,
"grad_norm": 0.74609375,
"learning_rate": 9.859371742963043e-07,
"loss": 0.24621078372001648,
"step": 893,
"token_acc": 0.9370564640543042
},
{
"epoch": 0.18860759493670887,
"grad_norm": 0.63671875,
"learning_rate": 9.85897327078095e-07,
"loss": 0.2355155646800995,
"step": 894,
"token_acc": 0.933295647258338
},
{
"epoch": 0.18881856540084388,
"grad_norm": 0.6875,
"learning_rate": 9.858574242934136e-07,
"loss": 0.29725679755210876,
"step": 895,
"token_acc": 0.9205632306057385
},
{
"epoch": 0.1890295358649789,
"grad_norm": 0.71484375,
"learning_rate": 9.858174659468237e-07,
"loss": 0.23919257521629333,
"step": 896,
"token_acc": 0.9367167919799498
},
{
"epoch": 0.18924050632911393,
"grad_norm": 0.6640625,
"learning_rate": 9.857774520428945e-07,
"loss": 0.2421645075082779,
"step": 897,
"token_acc": 0.928311057108141
},
{
"epoch": 0.18945147679324895,
"grad_norm": 0.90625,
"learning_rate": 9.85737382586202e-07,
"loss": 0.22805655002593994,
"step": 898,
"token_acc": 0.9353342428376534
},
{
"epoch": 0.18966244725738396,
"grad_norm": 1.125,
"learning_rate": 9.856972575813285e-07,
"loss": 0.2736568748950958,
"step": 899,
"token_acc": 0.9199507389162561
},
{
"epoch": 0.189873417721519,
"grad_norm": 0.71875,
"learning_rate": 9.85657077032863e-07,
"loss": 0.251034677028656,
"step": 900,
"token_acc": 0.933903806432576
},
{
"epoch": 0.190084388185654,
"grad_norm": 0.671875,
"learning_rate": 9.856168409454e-07,
"loss": 0.2377174347639084,
"step": 901,
"token_acc": 0.9313658201784488
},
{
"epoch": 0.19029535864978903,
"grad_norm": 0.625,
"learning_rate": 9.855765493235408e-07,
"loss": 0.27164188027381897,
"step": 902,
"token_acc": 0.9263128176171654
},
{
"epoch": 0.19050632911392404,
"grad_norm": 0.84765625,
"learning_rate": 9.855362021718936e-07,
"loss": 0.250331312417984,
"step": 903,
"token_acc": 0.9330877839165131
},
{
"epoch": 0.19071729957805908,
"grad_norm": 0.60546875,
"learning_rate": 9.85495799495072e-07,
"loss": 0.20965948700904846,
"step": 904,
"token_acc": 0.936726272352132
},
{
"epoch": 0.1909282700421941,
"grad_norm": 0.6640625,
"learning_rate": 9.854553412976965e-07,
"loss": 0.24447084963321686,
"step": 905,
"token_acc": 0.9254159495123351
},
{
"epoch": 0.1911392405063291,
"grad_norm": 0.6875,
"learning_rate": 9.854148275843939e-07,
"loss": 0.2490314543247223,
"step": 906,
"token_acc": 0.9285930408472012
},
{
"epoch": 0.19135021097046415,
"grad_norm": 0.61328125,
"learning_rate": 9.853742583597973e-07,
"loss": 0.21816563606262207,
"step": 907,
"token_acc": 0.9387067116150781
},
{
"epoch": 0.19156118143459916,
"grad_norm": 0.66015625,
"learning_rate": 9.853336336285461e-07,
"loss": 0.24077807366847992,
"step": 908,
"token_acc": 0.9278679026651216
},
{
"epoch": 0.19177215189873417,
"grad_norm": 0.82421875,
"learning_rate": 9.852929533952858e-07,
"loss": 0.2617112696170807,
"step": 909,
"token_acc": 0.9296264118158123
},
{
"epoch": 0.19198312236286919,
"grad_norm": 0.75,
"learning_rate": 9.852522176646692e-07,
"loss": 0.22484534978866577,
"step": 910,
"token_acc": 0.9368040926873308
},
{
"epoch": 0.19219409282700423,
"grad_norm": 0.671875,
"learning_rate": 9.85211426441354e-07,
"loss": 0.2532415986061096,
"step": 911,
"token_acc": 0.9334488734835356
},
{
"epoch": 0.19240506329113924,
"grad_norm": 0.87890625,
"learning_rate": 9.851705797300056e-07,
"loss": 0.31424853205680847,
"step": 912,
"token_acc": 0.9197608558842039
},
{
"epoch": 0.19261603375527425,
"grad_norm": 0.70703125,
"learning_rate": 9.851296775352948e-07,
"loss": 0.29285135865211487,
"step": 913,
"token_acc": 0.9182986536107711
},
{
"epoch": 0.1928270042194093,
"grad_norm": 0.59765625,
"learning_rate": 9.850887198618996e-07,
"loss": 0.21450576186180115,
"step": 914,
"token_acc": 0.9373626373626374
},
{
"epoch": 0.1930379746835443,
"grad_norm": 0.78515625,
"learning_rate": 9.850477067145031e-07,
"loss": 0.2844701111316681,
"step": 915,
"token_acc": 0.919885094158953
},
{
"epoch": 0.19324894514767932,
"grad_norm": 0.68359375,
"learning_rate": 9.850066380977961e-07,
"loss": 0.26549211144447327,
"step": 916,
"token_acc": 0.9245901639344263
},
{
"epoch": 0.19345991561181436,
"grad_norm": 0.765625,
"learning_rate": 9.849655140164752e-07,
"loss": 0.258350133895874,
"step": 917,
"token_acc": 0.918646080760095
},
{
"epoch": 0.19367088607594937,
"grad_norm": 0.7421875,
"learning_rate": 9.849243344752427e-07,
"loss": 0.2719504237174988,
"step": 918,
"token_acc": 0.9267654751525719
},
{
"epoch": 0.19388185654008439,
"grad_norm": 0.79296875,
"learning_rate": 9.848830994788083e-07,
"loss": 0.27195435762405396,
"step": 919,
"token_acc": 0.9220445459737293
},
{
"epoch": 0.1940928270042194,
"grad_norm": 0.71484375,
"learning_rate": 9.848418090318876e-07,
"loss": 0.24952857196331024,
"step": 920,
"token_acc": 0.9340033500837521
},
{
"epoch": 0.19430379746835444,
"grad_norm": 0.7421875,
"learning_rate": 9.848004631392022e-07,
"loss": 0.22502082586288452,
"step": 921,
"token_acc": 0.935454267360049
},
{
"epoch": 0.19451476793248945,
"grad_norm": 0.76171875,
"learning_rate": 9.847590618054806e-07,
"loss": 0.30236607789993286,
"step": 922,
"token_acc": 0.9160954208938854
},
{
"epoch": 0.19472573839662446,
"grad_norm": 0.7578125,
"learning_rate": 9.847176050354573e-07,
"loss": 0.26875466108322144,
"step": 923,
"token_acc": 0.9231622746185852
},
{
"epoch": 0.1949367088607595,
"grad_norm": 0.79296875,
"learning_rate": 9.846760928338734e-07,
"loss": 0.21099932491779327,
"step": 924,
"token_acc": 0.9381474710542352
},
{
"epoch": 0.19514767932489452,
"grad_norm": 2.921875,
"learning_rate": 9.846345252054758e-07,
"loss": 0.24902689456939697,
"step": 925,
"token_acc": 0.9338211899459116
},
{
"epoch": 0.19535864978902953,
"grad_norm": 0.80859375,
"learning_rate": 9.845929021550184e-07,
"loss": 0.22670647501945496,
"step": 926,
"token_acc": 0.9382314694408322
},
{
"epoch": 0.19556962025316454,
"grad_norm": 0.70703125,
"learning_rate": 9.84551223687261e-07,
"loss": 0.24977506697177887,
"step": 927,
"token_acc": 0.927700089259149
},
{
"epoch": 0.19578059071729959,
"grad_norm": 0.7109375,
"learning_rate": 9.8450948980697e-07,
"loss": 0.2701714038848877,
"step": 928,
"token_acc": 0.9202069716775599
},
{
"epoch": 0.1959915611814346,
"grad_norm": 0.82421875,
"learning_rate": 9.844677005189182e-07,
"loss": 0.2738378643989563,
"step": 929,
"token_acc": 0.9230093676814989
},
{
"epoch": 0.1962025316455696,
"grad_norm": 0.7890625,
"learning_rate": 9.844258558278842e-07,
"loss": 0.2802038788795471,
"step": 930,
"token_acc": 0.9218846869187849
},
{
"epoch": 0.19641350210970465,
"grad_norm": 1.125,
"learning_rate": 9.843839557386534e-07,
"loss": 0.28460338711738586,
"step": 931,
"token_acc": 0.9176701204144497
},
{
"epoch": 0.19662447257383966,
"grad_norm": 0.70703125,
"learning_rate": 9.843420002560173e-07,
"loss": 0.2364983856678009,
"step": 932,
"token_acc": 0.9339788732394366
},
{
"epoch": 0.19683544303797468,
"grad_norm": 0.69921875,
"learning_rate": 9.842999893847744e-07,
"loss": 0.24972565472126007,
"step": 933,
"token_acc": 0.9282414536495226
},
{
"epoch": 0.19704641350210972,
"grad_norm": 0.62890625,
"learning_rate": 9.842579231297284e-07,
"loss": 0.23772844672203064,
"step": 934,
"token_acc": 0.9301578024547048
},
{
"epoch": 0.19725738396624473,
"grad_norm": 0.86328125,
"learning_rate": 9.842158014956901e-07,
"loss": 0.2724204659461975,
"step": 935,
"token_acc": 0.920952380952381
},
{
"epoch": 0.19746835443037974,
"grad_norm": 0.67578125,
"learning_rate": 9.841736244874769e-07,
"loss": 0.20951035618782043,
"step": 936,
"token_acc": 0.9367752622860298
},
{
"epoch": 0.19767932489451476,
"grad_norm": 1.84375,
"learning_rate": 9.841313921099112e-07,
"loss": 0.2654408812522888,
"step": 937,
"token_acc": 0.9252709640616087
},
{
"epoch": 0.1978902953586498,
"grad_norm": 0.609375,
"learning_rate": 9.840891043678235e-07,
"loss": 0.20615626871585846,
"step": 938,
"token_acc": 0.9382829208677055
},
{
"epoch": 0.1981012658227848,
"grad_norm": 0.78515625,
"learning_rate": 9.840467612660494e-07,
"loss": 0.24997380375862122,
"step": 939,
"token_acc": 0.9305555555555556
},
{
"epoch": 0.19831223628691982,
"grad_norm": 0.62890625,
"learning_rate": 9.84004362809431e-07,
"loss": 0.2462070733308792,
"step": 940,
"token_acc": 0.9301426872770512
},
{
"epoch": 0.19852320675105486,
"grad_norm": 0.81640625,
"learning_rate": 9.839619090028173e-07,
"loss": 0.28827589750289917,
"step": 941,
"token_acc": 0.9256148770245951
},
{
"epoch": 0.19873417721518988,
"grad_norm": 0.7265625,
"learning_rate": 9.83919399851063e-07,
"loss": 0.27797433733940125,
"step": 942,
"token_acc": 0.9241207421766824
},
{
"epoch": 0.1989451476793249,
"grad_norm": 0.84765625,
"learning_rate": 9.838768353590297e-07,
"loss": 0.3198699951171875,
"step": 943,
"token_acc": 0.909556313993174
},
{
"epoch": 0.1991561181434599,
"grad_norm": 0.625,
"learning_rate": 9.838342155315847e-07,
"loss": 0.23603345453739166,
"step": 944,
"token_acc": 0.9322977725674091
},
{
"epoch": 0.19936708860759494,
"grad_norm": 0.63671875,
"learning_rate": 9.837915403736017e-07,
"loss": 0.1939564049243927,
"step": 945,
"token_acc": 0.9405116002379535
},
{
"epoch": 0.19957805907172996,
"grad_norm": 0.6953125,
"learning_rate": 9.837488098899616e-07,
"loss": 0.2682676911354065,
"step": 946,
"token_acc": 0.9287606711804534
},
{
"epoch": 0.19978902953586497,
"grad_norm": 0.94921875,
"learning_rate": 9.837060240855506e-07,
"loss": 0.264107882976532,
"step": 947,
"token_acc": 0.9270292429625581
},
{
"epoch": 0.2,
"grad_norm": 0.78515625,
"learning_rate": 9.836631829652617e-07,
"loss": 0.31936952471733093,
"step": 948,
"token_acc": 0.9171994884910486
},
{
"epoch": 0.20021097046413502,
"grad_norm": 0.76953125,
"learning_rate": 9.83620286533994e-07,
"loss": 0.2610703706741333,
"step": 949,
"token_acc": 0.9247813411078717
},
{
"epoch": 0.20042194092827004,
"grad_norm": 0.76953125,
"learning_rate": 9.835773347966535e-07,
"loss": 0.27383172512054443,
"step": 950,
"token_acc": 0.9266730707652898
},
{
"epoch": 0.20063291139240505,
"grad_norm": 0.7578125,
"learning_rate": 9.835343277581513e-07,
"loss": 0.253266841173172,
"step": 951,
"token_acc": 0.9326241134751773
},
{
"epoch": 0.2008438818565401,
"grad_norm": 0.6875,
"learning_rate": 9.834912654234065e-07,
"loss": 0.24679061770439148,
"step": 952,
"token_acc": 0.9246597024374802
},
{
"epoch": 0.2010548523206751,
"grad_norm": 0.92578125,
"learning_rate": 9.834481477973433e-07,
"loss": 0.25128480792045593,
"step": 953,
"token_acc": 0.9299694189602447
},
{
"epoch": 0.20126582278481012,
"grad_norm": 0.7578125,
"learning_rate": 9.834049748848924e-07,
"loss": 0.26062366366386414,
"step": 954,
"token_acc": 0.9283121597096189
},
{
"epoch": 0.20147679324894516,
"grad_norm": 0.765625,
"learning_rate": 9.833617466909912e-07,
"loss": 0.2557450234889984,
"step": 955,
"token_acc": 0.92808867261422
},
{
"epoch": 0.20168776371308017,
"grad_norm": 0.7265625,
"learning_rate": 9.83318463220583e-07,
"loss": 0.28644537925720215,
"step": 956,
"token_acc": 0.9155513065646909
},
{
"epoch": 0.20189873417721518,
"grad_norm": 0.9375,
"learning_rate": 9.832751244786178e-07,
"loss": 0.27308011054992676,
"step": 957,
"token_acc": 0.9252837326607818
},
{
"epoch": 0.20210970464135022,
"grad_norm": 0.6640625,
"learning_rate": 9.832317304700517e-07,
"loss": 0.2365753948688507,
"step": 958,
"token_acc": 0.9307948860478044
},
{
"epoch": 0.20232067510548524,
"grad_norm": 1.2265625,
"learning_rate": 9.831882811998472e-07,
"loss": 0.25882843136787415,
"step": 959,
"token_acc": 0.9280114041339986
},
{
"epoch": 0.20253164556962025,
"grad_norm": 0.6171875,
"learning_rate": 9.83144776672973e-07,
"loss": 0.24906384944915771,
"step": 960,
"token_acc": 0.9268585131894485
},
{
"epoch": 0.20274261603375526,
"grad_norm": 1.421875,
"learning_rate": 9.831012168944045e-07,
"loss": 0.25317683815956116,
"step": 961,
"token_acc": 0.9272997032640949
},
{
"epoch": 0.2029535864978903,
"grad_norm": 1.40625,
"learning_rate": 9.830576018691227e-07,
"loss": 0.2348695993423462,
"step": 962,
"token_acc": 0.9317293233082706
},
{
"epoch": 0.20316455696202532,
"grad_norm": 0.65625,
"learning_rate": 9.830139316021155e-07,
"loss": 0.22149190306663513,
"step": 963,
"token_acc": 0.9351633986928105
},
{
"epoch": 0.20337552742616033,
"grad_norm": 0.734375,
"learning_rate": 9.829702060983772e-07,
"loss": 0.2570660710334778,
"step": 964,
"token_acc": 0.9295127183573398
},
{
"epoch": 0.20358649789029537,
"grad_norm": 1.0234375,
"learning_rate": 9.829264253629079e-07,
"loss": 0.2847985625267029,
"step": 965,
"token_acc": 0.9199036434808793
},
{
"epoch": 0.20379746835443038,
"grad_norm": 0.72265625,
"learning_rate": 9.828825894007146e-07,
"loss": 0.267423540353775,
"step": 966,
"token_acc": 0.9270031365839749
},
{
"epoch": 0.2040084388185654,
"grad_norm": 6.0625,
"learning_rate": 9.8283869821681e-07,
"loss": 0.2526509165763855,
"step": 967,
"token_acc": 0.9267187106522287
},
{
"epoch": 0.2042194092827004,
"grad_norm": 0.66796875,
"learning_rate": 9.827947518162135e-07,
"loss": 0.22644475102424622,
"step": 968,
"token_acc": 0.9335453100158982
},
{
"epoch": 0.20443037974683545,
"grad_norm": 0.859375,
"learning_rate": 9.827507502039507e-07,
"loss": 0.313146710395813,
"step": 969,
"token_acc": 0.9218163195629908
},
{
"epoch": 0.20464135021097046,
"grad_norm": 0.72265625,
"learning_rate": 9.82706693385054e-07,
"loss": 0.2515157163143158,
"step": 970,
"token_acc": 0.9318849089841457
},
{
"epoch": 0.20485232067510548,
"grad_norm": 0.7890625,
"learning_rate": 9.82662581364561e-07,
"loss": 0.33044931292533875,
"step": 971,
"token_acc": 0.911062906724512
},
{
"epoch": 0.20506329113924052,
"grad_norm": 1.09375,
"learning_rate": 9.826184141475165e-07,
"loss": 0.3272978961467743,
"step": 972,
"token_acc": 0.9140117537890504
},
{
"epoch": 0.20527426160337553,
"grad_norm": 0.59375,
"learning_rate": 9.825741917389717e-07,
"loss": 0.21767356991767883,
"step": 973,
"token_acc": 0.9358974358974359
},
{
"epoch": 0.20548523206751054,
"grad_norm": 0.734375,
"learning_rate": 9.825299141439835e-07,
"loss": 0.28333914279937744,
"step": 974,
"token_acc": 0.9232728430436167
},
{
"epoch": 0.20569620253164558,
"grad_norm": 0.66796875,
"learning_rate": 9.824855813676157e-07,
"loss": 0.23762467503547668,
"step": 975,
"token_acc": 0.9316065192083819
},
{
"epoch": 0.2059071729957806,
"grad_norm": 0.80859375,
"learning_rate": 9.824411934149377e-07,
"loss": 0.2822648882865906,
"step": 976,
"token_acc": 0.9237147595356551
},
{
"epoch": 0.2061181434599156,
"grad_norm": 0.63671875,
"learning_rate": 9.823967502910259e-07,
"loss": 0.2508828043937683,
"step": 977,
"token_acc": 0.9297841726618705
},
{
"epoch": 0.20632911392405062,
"grad_norm": 0.625,
"learning_rate": 9.82352252000963e-07,
"loss": 0.2554951608181,
"step": 978,
"token_acc": 0.9255730872283418
},
{
"epoch": 0.20654008438818566,
"grad_norm": 0.74609375,
"learning_rate": 9.823076985498373e-07,
"loss": 0.2603085935115814,
"step": 979,
"token_acc": 0.927246790299572
},
{
"epoch": 0.20675105485232068,
"grad_norm": 0.75390625,
"learning_rate": 9.82263089942744e-07,
"loss": 0.27707576751708984,
"step": 980,
"token_acc": 0.9242250287026407
},
{
"epoch": 0.2069620253164557,
"grad_norm": 0.74609375,
"learning_rate": 9.822184261847847e-07,
"loss": 0.23693615198135376,
"step": 981,
"token_acc": 0.9334923948702655
},
{
"epoch": 0.20717299578059073,
"grad_norm": 0.69921875,
"learning_rate": 9.821737072810668e-07,
"loss": 0.2479907125234604,
"step": 982,
"token_acc": 0.9293939393939394
},
{
"epoch": 0.20738396624472574,
"grad_norm": 0.72265625,
"learning_rate": 9.821289332367043e-07,
"loss": 0.25571757555007935,
"step": 983,
"token_acc": 0.9304549405969285
},
{
"epoch": 0.20759493670886076,
"grad_norm": 0.7421875,
"learning_rate": 9.820841040568177e-07,
"loss": 0.2608758807182312,
"step": 984,
"token_acc": 0.9285266457680251
},
{
"epoch": 0.20780590717299577,
"grad_norm": 0.7578125,
"learning_rate": 9.820392197465335e-07,
"loss": 0.28490036725997925,
"step": 985,
"token_acc": 0.920461445051609
},
{
"epoch": 0.2080168776371308,
"grad_norm": 0.70703125,
"learning_rate": 9.819942803109844e-07,
"loss": 0.2503746449947357,
"step": 986,
"token_acc": 0.9277822689302075
},
{
"epoch": 0.20822784810126582,
"grad_norm": 0.6953125,
"learning_rate": 9.8194928575531e-07,
"loss": 0.2538071870803833,
"step": 987,
"token_acc": 0.9264251614714968
},
{
"epoch": 0.20843881856540084,
"grad_norm": 0.6875,
"learning_rate": 9.819042360846554e-07,
"loss": 0.2641909718513489,
"step": 988,
"token_acc": 0.9284467713787086
},
{
"epoch": 0.20864978902953588,
"grad_norm": 0.75,
"learning_rate": 9.818591313041727e-07,
"loss": 0.2759447395801544,
"step": 989,
"token_acc": 0.9222654081066074
},
{
"epoch": 0.2088607594936709,
"grad_norm": 0.8125,
"learning_rate": 9.818139714190198e-07,
"loss": 0.23161228001117706,
"step": 990,
"token_acc": 0.9333521604066648
},
{
"epoch": 0.2090717299578059,
"grad_norm": 0.765625,
"learning_rate": 9.817687564343615e-07,
"loss": 0.2939218580722809,
"step": 991,
"token_acc": 0.9156313204276221
},
{
"epoch": 0.20928270042194091,
"grad_norm": 0.7265625,
"learning_rate": 9.817234863553681e-07,
"loss": 0.259197473526001,
"step": 992,
"token_acc": 0.9242243436754176
},
{
"epoch": 0.20949367088607596,
"grad_norm": 0.76171875,
"learning_rate": 9.816781611872167e-07,
"loss": 0.27298709750175476,
"step": 993,
"token_acc": 0.9176136363636364
},
{
"epoch": 0.20970464135021097,
"grad_norm": 0.6328125,
"learning_rate": 9.816327809350907e-07,
"loss": 0.2868914008140564,
"step": 994,
"token_acc": 0.9294675216057987
},
{
"epoch": 0.20991561181434598,
"grad_norm": 0.6171875,
"learning_rate": 9.815873456041797e-07,
"loss": 0.26026803255081177,
"step": 995,
"token_acc": 0.9263676432460461
},
{
"epoch": 0.21012658227848102,
"grad_norm": 0.8359375,
"learning_rate": 9.815418551996795e-07,
"loss": 0.2792215049266815,
"step": 996,
"token_acc": 0.9241547365214743
},
{
"epoch": 0.21033755274261604,
"grad_norm": 0.6953125,
"learning_rate": 9.814963097267925e-07,
"loss": 0.23070243000984192,
"step": 997,
"token_acc": 0.9354066985645934
},
{
"epoch": 0.21054852320675105,
"grad_norm": 0.640625,
"learning_rate": 9.814507091907271e-07,
"loss": 0.2509482502937317,
"step": 998,
"token_acc": 0.9299403078856425
},
{
"epoch": 0.2107594936708861,
"grad_norm": 0.73828125,
"learning_rate": 9.814050535966981e-07,
"loss": 0.24006497859954834,
"step": 999,
"token_acc": 0.9315068493150684
},
{
"epoch": 0.2109704641350211,
"grad_norm": 0.83984375,
"learning_rate": 9.813593429499268e-07,
"loss": 0.28949546813964844,
"step": 1000,
"token_acc": 0.9210992907801419
},
{
"epoch": 0.2109704641350211,
"eval_loss": 0.4336377680301666,
"eval_runtime": 245.5659,
"eval_samples_per_second": 137.254,
"eval_steps_per_second": 2.146,
"eval_token_acc": 0.8990386341200753,
"step": 1000
},
{
"epoch": 0.21118143459915611,
"grad_norm": 0.78125,
"learning_rate": 9.813135772556405e-07,
"loss": 0.28296542167663574,
"step": 1001,
"token_acc": 0.9212338198843294
},
{
"epoch": 0.21139240506329113,
"grad_norm": 0.74609375,
"learning_rate": 9.812677565190728e-07,
"loss": 0.2738898694515228,
"step": 1002,
"token_acc": 0.926865671641791
},
{
"epoch": 0.21160337552742617,
"grad_norm": 0.73828125,
"learning_rate": 9.812218807454635e-07,
"loss": 0.24410274624824524,
"step": 1003,
"token_acc": 0.9221090473337328
},
{
"epoch": 0.21181434599156118,
"grad_norm": 0.734375,
"learning_rate": 9.811759499400593e-07,
"loss": 0.2736046314239502,
"step": 1004,
"token_acc": 0.9195751138088012
},
{
"epoch": 0.2120253164556962,
"grad_norm": 0.6171875,
"learning_rate": 9.811299641081126e-07,
"loss": 0.24416279792785645,
"step": 1005,
"token_acc": 0.9294429708222812
},
{
"epoch": 0.21223628691983124,
"grad_norm": 0.734375,
"learning_rate": 9.81083923254882e-07,
"loss": 0.26535558700561523,
"step": 1006,
"token_acc": 0.9219015280135824
},
{
"epoch": 0.21244725738396625,
"grad_norm": 0.7734375,
"learning_rate": 9.81037827385633e-07,
"loss": 0.25298815965652466,
"step": 1007,
"token_acc": 0.9291294642857143
},
{
"epoch": 0.21265822784810126,
"grad_norm": 0.60546875,
"learning_rate": 9.809916765056373e-07,
"loss": 0.18925216794013977,
"step": 1008,
"token_acc": 0.9450386215092097
},
{
"epoch": 0.21286919831223627,
"grad_norm": 0.6171875,
"learning_rate": 9.809454706201719e-07,
"loss": 0.262004017829895,
"step": 1009,
"token_acc": 0.9288862768145754
},
{
"epoch": 0.21308016877637131,
"grad_norm": 0.71484375,
"learning_rate": 9.808992097345216e-07,
"loss": 0.289574533700943,
"step": 1010,
"token_acc": 0.9239130434782609
},
{
"epoch": 0.21329113924050633,
"grad_norm": 0.67578125,
"learning_rate": 9.80852893853976e-07,
"loss": 0.19858276844024658,
"step": 1011,
"token_acc": 0.9515328467153285
},
{
"epoch": 0.21350210970464134,
"grad_norm": 0.58984375,
"learning_rate": 9.808065229838323e-07,
"loss": 0.24258631467819214,
"step": 1012,
"token_acc": 0.9321187187775493
},
{
"epoch": 0.21371308016877638,
"grad_norm": 0.7578125,
"learning_rate": 9.807600971293932e-07,
"loss": 0.27060335874557495,
"step": 1013,
"token_acc": 0.9243027888446215
},
{
"epoch": 0.2139240506329114,
"grad_norm": 0.625,
"learning_rate": 9.807136162959678e-07,
"loss": 0.27900418639183044,
"step": 1014,
"token_acc": 0.9236888626988804
},
{
"epoch": 0.2141350210970464,
"grad_norm": 0.6875,
"learning_rate": 9.806670804888716e-07,
"loss": 0.2552199959754944,
"step": 1015,
"token_acc": 0.9293139293139293
},
{
"epoch": 0.21434599156118145,
"grad_norm": 0.76171875,
"learning_rate": 9.806204897134265e-07,
"loss": 0.2198866307735443,
"step": 1016,
"token_acc": 0.933115823817292
},
{
"epoch": 0.21455696202531646,
"grad_norm": 0.94140625,
"learning_rate": 9.805738439749604e-07,
"loss": 0.2905495762825012,
"step": 1017,
"token_acc": 0.9233921815889029
},
{
"epoch": 0.21476793248945147,
"grad_norm": 0.7421875,
"learning_rate": 9.805271432788077e-07,
"loss": 0.23594287037849426,
"step": 1018,
"token_acc": 0.9323369565217391
},
{
"epoch": 0.2149789029535865,
"grad_norm": 0.73828125,
"learning_rate": 9.80480387630309e-07,
"loss": 0.2431170493364334,
"step": 1019,
"token_acc": 0.9300763358778626
},
{
"epoch": 0.21518987341772153,
"grad_norm": 0.78515625,
"learning_rate": 9.804335770348115e-07,
"loss": 0.32254183292388916,
"step": 1020,
"token_acc": 0.9199491740787802
},
{
"epoch": 0.21540084388185654,
"grad_norm": 0.65234375,
"learning_rate": 9.803867114976678e-07,
"loss": 0.2706320285797119,
"step": 1021,
"token_acc": 0.9207768975210836
},
{
"epoch": 0.21561181434599155,
"grad_norm": 0.671875,
"learning_rate": 9.803397910242378e-07,
"loss": 0.26340216398239136,
"step": 1022,
"token_acc": 0.9282065834279228
},
{
"epoch": 0.2158227848101266,
"grad_norm": 0.859375,
"learning_rate": 9.802928156198871e-07,
"loss": 0.27076318860054016,
"step": 1023,
"token_acc": 0.9234196602617655
},
{
"epoch": 0.2160337552742616,
"grad_norm": 0.68359375,
"learning_rate": 9.802457852899878e-07,
"loss": 0.24231645464897156,
"step": 1024,
"token_acc": 0.9345052452509215
},
{
"epoch": 0.21624472573839662,
"grad_norm": 0.6484375,
"learning_rate": 9.80198700039918e-07,
"loss": 0.26031437516212463,
"step": 1025,
"token_acc": 0.9236091092533871
},
{
"epoch": 0.21645569620253163,
"grad_norm": 0.69921875,
"learning_rate": 9.801515598750626e-07,
"loss": 0.29187077283859253,
"step": 1026,
"token_acc": 0.9215414471860272
},
{
"epoch": 0.21666666666666667,
"grad_norm": 0.578125,
"learning_rate": 9.801043648008126e-07,
"loss": 0.21546629071235657,
"step": 1027,
"token_acc": 0.9334701055099648
},
{
"epoch": 0.2168776371308017,
"grad_norm": 0.69921875,
"learning_rate": 9.800571148225647e-07,
"loss": 0.2522656321525574,
"step": 1028,
"token_acc": 0.930021868166198
},
{
"epoch": 0.2170886075949367,
"grad_norm": 0.74609375,
"learning_rate": 9.800098099457225e-07,
"loss": 0.293454647064209,
"step": 1029,
"token_acc": 0.9204295154185022
},
{
"epoch": 0.21729957805907174,
"grad_norm": 0.53125,
"learning_rate": 9.799624501756957e-07,
"loss": 0.22426486015319824,
"step": 1030,
"token_acc": 0.943021582733813
},
{
"epoch": 0.21751054852320675,
"grad_norm": 0.7109375,
"learning_rate": 9.799150355179007e-07,
"loss": 0.25977182388305664,
"step": 1031,
"token_acc": 0.9289996844430419
},
{
"epoch": 0.21772151898734177,
"grad_norm": 0.734375,
"learning_rate": 9.79867565977759e-07,
"loss": 0.23819683492183685,
"step": 1032,
"token_acc": 0.932983323038913
},
{
"epoch": 0.21793248945147678,
"grad_norm": 0.65234375,
"learning_rate": 9.798200415607e-07,
"loss": 0.22482289373874664,
"step": 1033,
"token_acc": 0.9358479358479358
},
{
"epoch": 0.21814345991561182,
"grad_norm": 1.015625,
"learning_rate": 9.797724622721578e-07,
"loss": 0.2759067416191101,
"step": 1034,
"token_acc": 0.9251207729468599
},
{
"epoch": 0.21835443037974683,
"grad_norm": 0.671875,
"learning_rate": 9.797248281175737e-07,
"loss": 0.28211140632629395,
"step": 1035,
"token_acc": 0.9303056269637247
},
{
"epoch": 0.21856540084388185,
"grad_norm": 0.7421875,
"learning_rate": 9.796771391023952e-07,
"loss": 0.26458674669265747,
"step": 1036,
"token_acc": 0.9243227326266196
},
{
"epoch": 0.2187763713080169,
"grad_norm": 0.85546875,
"learning_rate": 9.79629395232076e-07,
"loss": 0.24054330587387085,
"step": 1037,
"token_acc": 0.9339233038348083
},
{
"epoch": 0.2189873417721519,
"grad_norm": 0.6171875,
"learning_rate": 9.795815965120757e-07,
"loss": 0.21638503670692444,
"step": 1038,
"token_acc": 0.9390750484630296
},
{
"epoch": 0.2191983122362869,
"grad_norm": 0.8515625,
"learning_rate": 9.79533742947861e-07,
"loss": 0.247116357088089,
"step": 1039,
"token_acc": 0.9303322615219721
},
{
"epoch": 0.21940928270042195,
"grad_norm": 0.71484375,
"learning_rate": 9.794858345449039e-07,
"loss": 0.24935264885425568,
"step": 1040,
"token_acc": 0.9318113547611896
},
{
"epoch": 0.21962025316455697,
"grad_norm": 0.5546875,
"learning_rate": 9.794378713086833e-07,
"loss": 0.2215247005224228,
"step": 1041,
"token_acc": 0.9337332969730024
},
{
"epoch": 0.21983122362869198,
"grad_norm": 0.86328125,
"learning_rate": 9.793898532446841e-07,
"loss": 0.2293887734413147,
"step": 1042,
"token_acc": 0.9368061485909479
},
{
"epoch": 0.220042194092827,
"grad_norm": 0.75390625,
"learning_rate": 9.793417803583979e-07,
"loss": 0.24102169275283813,
"step": 1043,
"token_acc": 0.9339817270851754
},
{
"epoch": 0.22025316455696203,
"grad_norm": 0.6484375,
"learning_rate": 9.792936526553218e-07,
"loss": 0.23523153364658356,
"step": 1044,
"token_acc": 0.9332579185520362
},
{
"epoch": 0.22046413502109705,
"grad_norm": 2.34375,
"learning_rate": 9.7924547014096e-07,
"loss": 0.2483244389295578,
"step": 1045,
"token_acc": 0.9263482280431433
},
{
"epoch": 0.22067510548523206,
"grad_norm": 0.7265625,
"learning_rate": 9.79197232820822e-07,
"loss": 0.2850404679775238,
"step": 1046,
"token_acc": 0.9232827832292596
},
{
"epoch": 0.2208860759493671,
"grad_norm": 0.56640625,
"learning_rate": 9.791489407004248e-07,
"loss": 0.22615236043930054,
"step": 1047,
"token_acc": 0.9381818181818182
},
{
"epoch": 0.2210970464135021,
"grad_norm": 1.203125,
"learning_rate": 9.791005937852906e-07,
"loss": 0.2923763692378998,
"step": 1048,
"token_acc": 0.9200850805226375
},
{
"epoch": 0.22130801687763713,
"grad_norm": 0.67578125,
"learning_rate": 9.790521920809485e-07,
"loss": 0.24023482203483582,
"step": 1049,
"token_acc": 0.9306590257879657
},
{
"epoch": 0.22151898734177214,
"grad_norm": 0.7265625,
"learning_rate": 9.790037355929336e-07,
"loss": 0.24101434648036957,
"step": 1050,
"token_acc": 0.9310859188544153
},
{
"epoch": 0.22172995780590718,
"grad_norm": 0.8359375,
"learning_rate": 9.789552243267873e-07,
"loss": 0.24817001819610596,
"step": 1051,
"token_acc": 0.9296261388627082
},
{
"epoch": 0.2219409282700422,
"grad_norm": 0.65625,
"learning_rate": 9.789066582880573e-07,
"loss": 0.24397243559360504,
"step": 1052,
"token_acc": 0.9321930360415394
},
{
"epoch": 0.2221518987341772,
"grad_norm": 0.62890625,
"learning_rate": 9.788580374822974e-07,
"loss": 0.2505425810813904,
"step": 1053,
"token_acc": 0.9343989343989344
},
{
"epoch": 0.22236286919831225,
"grad_norm": 0.859375,
"learning_rate": 9.78809361915068e-07,
"loss": 0.2832062244415283,
"step": 1054,
"token_acc": 0.9229067930489732
},
{
"epoch": 0.22257383966244726,
"grad_norm": 0.6875,
"learning_rate": 9.787606315919353e-07,
"loss": 0.2667236328125,
"step": 1055,
"token_acc": 0.9272619751626257
},
{
"epoch": 0.22278481012658227,
"grad_norm": 0.59765625,
"learning_rate": 9.787118465184723e-07,
"loss": 0.2523917853832245,
"step": 1056,
"token_acc": 0.930327868852459
},
{
"epoch": 0.2229957805907173,
"grad_norm": 0.9921875,
"learning_rate": 9.78663006700258e-07,
"loss": 0.2004603147506714,
"step": 1057,
"token_acc": 0.9434229137199435
},
{
"epoch": 0.22320675105485233,
"grad_norm": 0.7890625,
"learning_rate": 9.786141121428773e-07,
"loss": 0.2887023687362671,
"step": 1058,
"token_acc": 0.9227068633739577
},
{
"epoch": 0.22341772151898734,
"grad_norm": 0.66796875,
"learning_rate": 9.78565162851922e-07,
"loss": 0.2264334261417389,
"step": 1059,
"token_acc": 0.9340407226842558
},
{
"epoch": 0.22362869198312235,
"grad_norm": 0.66015625,
"learning_rate": 9.785161588329896e-07,
"loss": 0.25807666778564453,
"step": 1060,
"token_acc": 0.9267456359102244
},
{
"epoch": 0.2238396624472574,
"grad_norm": 0.73828125,
"learning_rate": 9.784671000916844e-07,
"loss": 0.2947354018688202,
"step": 1061,
"token_acc": 0.919414969888156
},
{
"epoch": 0.2240506329113924,
"grad_norm": 0.70703125,
"learning_rate": 9.784179866336167e-07,
"loss": 0.25580474734306335,
"step": 1062,
"token_acc": 0.9270348837209302
},
{
"epoch": 0.22426160337552742,
"grad_norm": 0.5390625,
"learning_rate": 9.783688184644027e-07,
"loss": 0.23662078380584717,
"step": 1063,
"token_acc": 0.9339968569931901
},
{
"epoch": 0.22447257383966246,
"grad_norm": 0.765625,
"learning_rate": 9.783195955896656e-07,
"loss": 0.24781087040901184,
"step": 1064,
"token_acc": 0.9331036462819409
},
{
"epoch": 0.22468354430379747,
"grad_norm": 0.63671875,
"learning_rate": 9.782703180150345e-07,
"loss": 0.25100642442703247,
"step": 1065,
"token_acc": 0.9296745725317154
},
{
"epoch": 0.22489451476793249,
"grad_norm": 0.61328125,
"learning_rate": 9.782209857461441e-07,
"loss": 0.25573456287384033,
"step": 1066,
"token_acc": 0.9306306306306307
},
{
"epoch": 0.2251054852320675,
"grad_norm": 0.75,
"learning_rate": 9.781715987886365e-07,
"loss": 0.28791964054107666,
"step": 1067,
"token_acc": 0.9302730970366067
},
{
"epoch": 0.22531645569620254,
"grad_norm": 0.67578125,
"learning_rate": 9.781221571481594e-07,
"loss": 0.2471371442079544,
"step": 1068,
"token_acc": 0.9301613800779076
},
{
"epoch": 0.22552742616033755,
"grad_norm": 0.6796875,
"learning_rate": 9.780726608303669e-07,
"loss": 0.27067384123802185,
"step": 1069,
"token_acc": 0.9273190045248869
},
{
"epoch": 0.22573839662447256,
"grad_norm": 0.734375,
"learning_rate": 9.780231098409191e-07,
"loss": 0.28875893354415894,
"step": 1070,
"token_acc": 0.927710843373494
},
{
"epoch": 0.2259493670886076,
"grad_norm": 0.609375,
"learning_rate": 9.779735041854829e-07,
"loss": 0.2351369857788086,
"step": 1071,
"token_acc": 0.9268510258697591
},
{
"epoch": 0.22616033755274262,
"grad_norm": 0.69921875,
"learning_rate": 9.779238438697309e-07,
"loss": 0.2744210958480835,
"step": 1072,
"token_acc": 0.9206251915415262
},
{
"epoch": 0.22637130801687763,
"grad_norm": 0.84375,
"learning_rate": 9.778741288993423e-07,
"loss": 0.3181830942630768,
"step": 1073,
"token_acc": 0.9197530864197531
},
{
"epoch": 0.22658227848101264,
"grad_norm": 0.79296875,
"learning_rate": 9.778243592800021e-07,
"loss": 0.26819008588790894,
"step": 1074,
"token_acc": 0.9223744292237442
},
{
"epoch": 0.22679324894514769,
"grad_norm": 0.765625,
"learning_rate": 9.777745350174023e-07,
"loss": 0.2756550908088684,
"step": 1075,
"token_acc": 0.9224232456140351
},
{
"epoch": 0.2270042194092827,
"grad_norm": 0.75390625,
"learning_rate": 9.777246561172408e-07,
"loss": 0.23100243508815765,
"step": 1076,
"token_acc": 0.9374826340650181
},
{
"epoch": 0.2272151898734177,
"grad_norm": 0.8984375,
"learning_rate": 9.776747225852212e-07,
"loss": 0.2505825161933899,
"step": 1077,
"token_acc": 0.9271615234917987
},
{
"epoch": 0.22742616033755275,
"grad_norm": 1.828125,
"learning_rate": 9.77624734427054e-07,
"loss": 0.2634645998477936,
"step": 1078,
"token_acc": 0.9258319232938522
},
{
"epoch": 0.22763713080168776,
"grad_norm": 0.71484375,
"learning_rate": 9.77574691648456e-07,
"loss": 0.2142142951488495,
"step": 1079,
"token_acc": 0.9364801864801865
},
{
"epoch": 0.22784810126582278,
"grad_norm": 1.34375,
"learning_rate": 9.775245942551499e-07,
"loss": 0.3275116980075836,
"step": 1080,
"token_acc": 0.9079022171688459
},
{
"epoch": 0.22805907172995782,
"grad_norm": 0.82421875,
"learning_rate": 9.774744422528645e-07,
"loss": 0.2572461664676666,
"step": 1081,
"token_acc": 0.9303560274828232
},
{
"epoch": 0.22827004219409283,
"grad_norm": 0.921875,
"learning_rate": 9.774242356473355e-07,
"loss": 0.26090332865715027,
"step": 1082,
"token_acc": 0.9227313566936208
},
{
"epoch": 0.22848101265822784,
"grad_norm": 0.6953125,
"learning_rate": 9.773739744443041e-07,
"loss": 0.25752192735671997,
"step": 1083,
"token_acc": 0.9295694325634445
},
{
"epoch": 0.22869198312236286,
"grad_norm": 0.77734375,
"learning_rate": 9.773236586495184e-07,
"loss": 0.26559850573539734,
"step": 1084,
"token_acc": 0.9298945568538045
},
{
"epoch": 0.2289029535864979,
"grad_norm": 0.92578125,
"learning_rate": 9.772732882687322e-07,
"loss": 0.27687737345695496,
"step": 1085,
"token_acc": 0.928654970760234
},
{
"epoch": 0.2291139240506329,
"grad_norm": 0.8125,
"learning_rate": 9.772228633077059e-07,
"loss": 0.23229250311851501,
"step": 1086,
"token_acc": 0.9331412103746398
},
{
"epoch": 0.22932489451476792,
"grad_norm": 0.76171875,
"learning_rate": 9.77172383772206e-07,
"loss": 0.2462151050567627,
"step": 1087,
"token_acc": 0.9313640312771503
},
{
"epoch": 0.22953586497890296,
"grad_norm": 0.80859375,
"learning_rate": 9.771218496680052e-07,
"loss": 0.2657579183578491,
"step": 1088,
"token_acc": 0.9255605381165919
},
{
"epoch": 0.22974683544303798,
"grad_norm": 0.703125,
"learning_rate": 9.770712610008826e-07,
"loss": 0.22832798957824707,
"step": 1089,
"token_acc": 0.9297372060857538
},
{
"epoch": 0.229957805907173,
"grad_norm": 0.71484375,
"learning_rate": 9.770206177766236e-07,
"loss": 0.23893365263938904,
"step": 1090,
"token_acc": 0.9337880079568059
},
{
"epoch": 0.230168776371308,
"grad_norm": 0.734375,
"learning_rate": 9.769699200010193e-07,
"loss": 0.25824564695358276,
"step": 1091,
"token_acc": 0.9248587570621469
},
{
"epoch": 0.23037974683544304,
"grad_norm": 0.58203125,
"learning_rate": 9.769191676798677e-07,
"loss": 0.24014432728290558,
"step": 1092,
"token_acc": 0.9332155477031802
},
{
"epoch": 0.23059071729957806,
"grad_norm": 0.75390625,
"learning_rate": 9.768683608189726e-07,
"loss": 0.22704805433750153,
"step": 1093,
"token_acc": 0.9340954942837929
},
{
"epoch": 0.23080168776371307,
"grad_norm": 0.796875,
"learning_rate": 9.768174994241443e-07,
"loss": 0.32337048649787903,
"step": 1094,
"token_acc": 0.9150032195750161
},
{
"epoch": 0.2310126582278481,
"grad_norm": 0.59765625,
"learning_rate": 9.76766583501199e-07,
"loss": 0.23895105719566345,
"step": 1095,
"token_acc": 0.9323377019201463
},
{
"epoch": 0.23122362869198312,
"grad_norm": 0.8125,
"learning_rate": 9.767156130559598e-07,
"loss": 0.3005239963531494,
"step": 1096,
"token_acc": 0.9185533666568656
},
{
"epoch": 0.23143459915611814,
"grad_norm": 0.765625,
"learning_rate": 9.766645880942553e-07,
"loss": 0.2561134099960327,
"step": 1097,
"token_acc": 0.9301909307875895
},
{
"epoch": 0.23164556962025318,
"grad_norm": 1.0703125,
"learning_rate": 9.766135086219208e-07,
"loss": 0.273573100566864,
"step": 1098,
"token_acc": 0.9235312402859808
},
{
"epoch": 0.2318565400843882,
"grad_norm": 0.73828125,
"learning_rate": 9.765623746447973e-07,
"loss": 0.23628413677215576,
"step": 1099,
"token_acc": 0.9250535331905781
},
{
"epoch": 0.2320675105485232,
"grad_norm": 1.0703125,
"learning_rate": 9.765111861687328e-07,
"loss": 0.3046630620956421,
"step": 1100,
"token_acc": 0.9216944801026957
},
{
"epoch": 0.23227848101265822,
"grad_norm": 0.79296875,
"learning_rate": 9.76459943199581e-07,
"loss": 0.24554237723350525,
"step": 1101,
"token_acc": 0.9288537549407114
},
{
"epoch": 0.23248945147679326,
"grad_norm": 0.796875,
"learning_rate": 9.76408645743202e-07,
"loss": 0.2889293432235718,
"step": 1102,
"token_acc": 0.9233661075766338
},
{
"epoch": 0.23270042194092827,
"grad_norm": 0.70703125,
"learning_rate": 9.763572938054621e-07,
"loss": 0.25873494148254395,
"step": 1103,
"token_acc": 0.9205917874396136
},
{
"epoch": 0.23291139240506328,
"grad_norm": 0.61328125,
"learning_rate": 9.763058873922336e-07,
"loss": 0.23966625332832336,
"step": 1104,
"token_acc": 0.9342347879532883
},
{
"epoch": 0.23312236286919832,
"grad_norm": 0.703125,
"learning_rate": 9.762544265093958e-07,
"loss": 0.25258344411849976,
"step": 1105,
"token_acc": 0.9244996967859309
},
{
"epoch": 0.23333333333333334,
"grad_norm": 0.80078125,
"learning_rate": 9.76202911162833e-07,
"loss": 0.2642197608947754,
"step": 1106,
"token_acc": 0.9265809217577706
},
{
"epoch": 0.23354430379746835,
"grad_norm": 0.77734375,
"learning_rate": 9.761513413584369e-07,
"loss": 0.27011817693710327,
"step": 1107,
"token_acc": 0.927093282394142
},
{
"epoch": 0.23375527426160336,
"grad_norm": 0.83203125,
"learning_rate": 9.760997171021047e-07,
"loss": 0.2859058380126953,
"step": 1108,
"token_acc": 0.9265121537591859
},
{
"epoch": 0.2339662447257384,
"grad_norm": 0.7265625,
"learning_rate": 9.760480383997403e-07,
"loss": 0.2449111044406891,
"step": 1109,
"token_acc": 0.9347949886104784
},
{
"epoch": 0.23417721518987342,
"grad_norm": 0.78515625,
"learning_rate": 9.759963052572535e-07,
"loss": 0.2845684289932251,
"step": 1110,
"token_acc": 0.9204577169969299
},
{
"epoch": 0.23438818565400843,
"grad_norm": 0.72265625,
"learning_rate": 9.759445176805603e-07,
"loss": 0.25201043486595154,
"step": 1111,
"token_acc": 0.9282962962962963
},
{
"epoch": 0.23459915611814347,
"grad_norm": 0.55859375,
"learning_rate": 9.758926756755832e-07,
"loss": 0.22519253194332123,
"step": 1112,
"token_acc": 0.9384904646790223
},
{
"epoch": 0.23481012658227848,
"grad_norm": 0.65234375,
"learning_rate": 9.758407792482508e-07,
"loss": 0.21577298641204834,
"step": 1113,
"token_acc": 0.9366028708133971
},
{
"epoch": 0.2350210970464135,
"grad_norm": 1.0234375,
"learning_rate": 9.757888284044978e-07,
"loss": 0.2565447688102722,
"step": 1114,
"token_acc": 0.9269433080070134
},
{
"epoch": 0.23523206751054854,
"grad_norm": 0.60546875,
"learning_rate": 9.75736823150265e-07,
"loss": 0.2519682049751282,
"step": 1115,
"token_acc": 0.9242471282210494
},
{
"epoch": 0.23544303797468355,
"grad_norm": 0.76171875,
"learning_rate": 9.756847634915e-07,
"loss": 0.2238023430109024,
"step": 1116,
"token_acc": 0.9373587342589603
},
{
"epoch": 0.23565400843881856,
"grad_norm": 0.68359375,
"learning_rate": 9.75632649434156e-07,
"loss": 0.23814505338668823,
"step": 1117,
"token_acc": 0.9302662037037037
},
{
"epoch": 0.23586497890295358,
"grad_norm": 0.65234375,
"learning_rate": 9.755804809841932e-07,
"loss": 0.2618996500968933,
"step": 1118,
"token_acc": 0.9262518968133535
},
{
"epoch": 0.23607594936708862,
"grad_norm": 0.77734375,
"learning_rate": 9.755282581475767e-07,
"loss": 0.2755908966064453,
"step": 1119,
"token_acc": 0.9285714285714286
},
{
"epoch": 0.23628691983122363,
"grad_norm": 0.58984375,
"learning_rate": 9.754759809302793e-07,
"loss": 0.22718246281147003,
"step": 1120,
"token_acc": 0.936778449697636
},
{
"epoch": 0.23649789029535864,
"grad_norm": 1.109375,
"learning_rate": 9.75423649338279e-07,
"loss": 0.25046756863594055,
"step": 1121,
"token_acc": 0.9300994275384152
},
{
"epoch": 0.23670886075949368,
"grad_norm": 3.6875,
"learning_rate": 9.753712633775603e-07,
"loss": 0.2738564610481262,
"step": 1122,
"token_acc": 0.9289681379713534
},
{
"epoch": 0.2369198312236287,
"grad_norm": 0.59765625,
"learning_rate": 9.753188230541144e-07,
"loss": 0.2159212827682495,
"step": 1123,
"token_acc": 0.9403917116094238
},
{
"epoch": 0.2371308016877637,
"grad_norm": 0.7890625,
"learning_rate": 9.752663283739378e-07,
"loss": 0.27135610580444336,
"step": 1124,
"token_acc": 0.9201945080091534
},
{
"epoch": 0.23734177215189872,
"grad_norm": 0.63671875,
"learning_rate": 9.752137793430338e-07,
"loss": 0.20492365956306458,
"step": 1125,
"token_acc": 0.9434542102028273
},
{
"epoch": 0.23755274261603376,
"grad_norm": 0.75,
"learning_rate": 9.751611759674123e-07,
"loss": 0.24978362023830414,
"step": 1126,
"token_acc": 0.9274985557481225
},
{
"epoch": 0.23776371308016878,
"grad_norm": 0.640625,
"learning_rate": 9.751085182530885e-07,
"loss": 0.2612837553024292,
"step": 1127,
"token_acc": 0.9279176201372997
},
{
"epoch": 0.2379746835443038,
"grad_norm": 0.6328125,
"learning_rate": 9.750558062060844e-07,
"loss": 0.22179251909255981,
"step": 1128,
"token_acc": 0.9389685688129387
},
{
"epoch": 0.23818565400843883,
"grad_norm": 0.890625,
"learning_rate": 9.750030398324279e-07,
"loss": 0.2751111388206482,
"step": 1129,
"token_acc": 0.9258064516129032
},
{
"epoch": 0.23839662447257384,
"grad_norm": 0.7265625,
"learning_rate": 9.749502191381533e-07,
"loss": 0.259870707988739,
"step": 1130,
"token_acc": 0.9296420923829918
},
{
"epoch": 0.23860759493670886,
"grad_norm": 0.69921875,
"learning_rate": 9.748973441293014e-07,
"loss": 0.24654024839401245,
"step": 1131,
"token_acc": 0.933527696793003
},
{
"epoch": 0.23881856540084387,
"grad_norm": 0.734375,
"learning_rate": 9.748444148119185e-07,
"loss": 0.2358575463294983,
"step": 1132,
"token_acc": 0.9305291723202171
},
{
"epoch": 0.2390295358649789,
"grad_norm": 1.0,
"learning_rate": 9.74791431192058e-07,
"loss": 0.2531391978263855,
"step": 1133,
"token_acc": 0.9241610738255034
},
{
"epoch": 0.23924050632911392,
"grad_norm": 0.890625,
"learning_rate": 9.747383932757787e-07,
"loss": 0.26321518421173096,
"step": 1134,
"token_acc": 0.9286318758815233
},
{
"epoch": 0.23945147679324894,
"grad_norm": 0.5625,
"learning_rate": 9.746853010691457e-07,
"loss": 0.2001633495092392,
"step": 1135,
"token_acc": 0.9387040280210157
},
{
"epoch": 0.23966244725738398,
"grad_norm": 1.0703125,
"learning_rate": 9.74632154578231e-07,
"loss": 0.2789958715438843,
"step": 1136,
"token_acc": 0.9239491150442478
},
{
"epoch": 0.239873417721519,
"grad_norm": 0.7734375,
"learning_rate": 9.745789538091123e-07,
"loss": 0.2799132168292999,
"step": 1137,
"token_acc": 0.919815668202765
},
{
"epoch": 0.240084388185654,
"grad_norm": 0.703125,
"learning_rate": 9.745256987678733e-07,
"loss": 0.3027660846710205,
"step": 1138,
"token_acc": 0.9214015151515151
},
{
"epoch": 0.24029535864978904,
"grad_norm": 1.453125,
"learning_rate": 9.74472389460604e-07,
"loss": 0.2762015461921692,
"step": 1139,
"token_acc": 0.9253100338218715
},
{
"epoch": 0.24050632911392406,
"grad_norm": 0.6640625,
"learning_rate": 9.744190258934015e-07,
"loss": 0.2636283040046692,
"step": 1140,
"token_acc": 0.9310656231186033
},
{
"epoch": 0.24071729957805907,
"grad_norm": 0.69921875,
"learning_rate": 9.743656080723676e-07,
"loss": 0.24154981970787048,
"step": 1141,
"token_acc": 0.9304322084073416
},
{
"epoch": 0.24092827004219408,
"grad_norm": 0.7890625,
"learning_rate": 9.743121360036117e-07,
"loss": 0.26985490322113037,
"step": 1142,
"token_acc": 0.9234859675036927
},
{
"epoch": 0.24113924050632912,
"grad_norm": 0.7109375,
"learning_rate": 9.742586096932484e-07,
"loss": 0.26405149698257446,
"step": 1143,
"token_acc": 0.925842060571752
},
{
"epoch": 0.24135021097046414,
"grad_norm": 0.75390625,
"learning_rate": 9.74205029147399e-07,
"loss": 0.2645023465156555,
"step": 1144,
"token_acc": 0.9224408326204734
},
{
"epoch": 0.24156118143459915,
"grad_norm": 0.59375,
"learning_rate": 9.74151394372191e-07,
"loss": 0.24868255853652954,
"step": 1145,
"token_acc": 0.9265844565529108
},
{
"epoch": 0.2417721518987342,
"grad_norm": 0.78125,
"learning_rate": 9.740977053737575e-07,
"loss": 0.2382085621356964,
"step": 1146,
"token_acc": 0.9304396215915415
},
{
"epoch": 0.2419831223628692,
"grad_norm": 0.70703125,
"learning_rate": 9.74043962158239e-07,
"loss": 0.27882516384124756,
"step": 1147,
"token_acc": 0.9233477789815818
},
{
"epoch": 0.24219409282700421,
"grad_norm": 0.640625,
"learning_rate": 9.73990164731781e-07,
"loss": 0.2696080207824707,
"step": 1148,
"token_acc": 0.9303629389200354
},
{
"epoch": 0.24240506329113923,
"grad_norm": 0.84375,
"learning_rate": 9.739363131005358e-07,
"loss": 0.2302226722240448,
"step": 1149,
"token_acc": 0.9319037960011591
},
{
"epoch": 0.24261603375527427,
"grad_norm": 0.78125,
"learning_rate": 9.738824072706619e-07,
"loss": 0.300628125667572,
"step": 1150,
"token_acc": 0.9245404708158659
},
{
"epoch": 0.24282700421940928,
"grad_norm": 0.74609375,
"learning_rate": 9.738284472483239e-07,
"loss": 0.28427547216415405,
"step": 1151,
"token_acc": 0.9175200664267922
},
{
"epoch": 0.2430379746835443,
"grad_norm": 0.69140625,
"learning_rate": 9.737744330396924e-07,
"loss": 0.25487983226776123,
"step": 1152,
"token_acc": 0.9297150610583447
},
{
"epoch": 0.24324894514767934,
"grad_norm": 0.76953125,
"learning_rate": 9.737203646509445e-07,
"loss": 0.23934927582740784,
"step": 1153,
"token_acc": 0.9319465081723626
},
{
"epoch": 0.24345991561181435,
"grad_norm": 0.72265625,
"learning_rate": 9.736662420882636e-07,
"loss": 0.24117323756217957,
"step": 1154,
"token_acc": 0.9246076233183856
},
{
"epoch": 0.24367088607594936,
"grad_norm": 0.64453125,
"learning_rate": 9.736120653578385e-07,
"loss": 0.21268248558044434,
"step": 1155,
"token_acc": 0.9357798165137615
},
{
"epoch": 0.2438818565400844,
"grad_norm": 0.65234375,
"learning_rate": 9.735578344658652e-07,
"loss": 0.20961320400238037,
"step": 1156,
"token_acc": 0.936529933481153
},
{
"epoch": 0.24409282700421941,
"grad_norm": 0.75,
"learning_rate": 9.735035494185454e-07,
"loss": 0.23409229516983032,
"step": 1157,
"token_acc": 0.9367359413202934
},
{
"epoch": 0.24430379746835443,
"grad_norm": 0.68359375,
"learning_rate": 9.73449210222087e-07,
"loss": 0.2215413749217987,
"step": 1158,
"token_acc": 0.9337132573485303
},
{
"epoch": 0.24451476793248944,
"grad_norm": 0.5078125,
"learning_rate": 9.733948168827042e-07,
"loss": 0.20670649409294128,
"step": 1159,
"token_acc": 0.9404427814156533
},
{
"epoch": 0.24472573839662448,
"grad_norm": 0.65625,
"learning_rate": 9.733403694066174e-07,
"loss": 0.22479060292243958,
"step": 1160,
"token_acc": 0.9324430479183032
},
{
"epoch": 0.2449367088607595,
"grad_norm": 0.77734375,
"learning_rate": 9.732858678000528e-07,
"loss": 0.26787155866622925,
"step": 1161,
"token_acc": 0.9296438033559022
},
{
"epoch": 0.2451476793248945,
"grad_norm": 0.65234375,
"learning_rate": 9.732313120692436e-07,
"loss": 0.23902547359466553,
"step": 1162,
"token_acc": 0.9335614485315085
},
{
"epoch": 0.24535864978902955,
"grad_norm": 0.6796875,
"learning_rate": 9.731767022204283e-07,
"loss": 0.25326600670814514,
"step": 1163,
"token_acc": 0.9322571346209282
},
{
"epoch": 0.24556962025316456,
"grad_norm": 0.7421875,
"learning_rate": 9.73122038259852e-07,
"loss": 0.25212883949279785,
"step": 1164,
"token_acc": 0.928149300155521
},
{
"epoch": 0.24578059071729957,
"grad_norm": 0.61328125,
"learning_rate": 9.730673201937667e-07,
"loss": 0.22139108180999756,
"step": 1165,
"token_acc": 0.9381711682395054
},
{
"epoch": 0.2459915611814346,
"grad_norm": 0.71875,
"learning_rate": 9.73012548028429e-07,
"loss": 0.2780754566192627,
"step": 1166,
"token_acc": 0.9247701309556979
},
{
"epoch": 0.24620253164556963,
"grad_norm": 0.72265625,
"learning_rate": 9.729577217701028e-07,
"loss": 0.25920483469963074,
"step": 1167,
"token_acc": 0.9288939051918735
},
{
"epoch": 0.24641350210970464,
"grad_norm": 0.91796875,
"learning_rate": 9.729028414250581e-07,
"loss": 0.2644927501678467,
"step": 1168,
"token_acc": 0.9281803542673108
},
{
"epoch": 0.24662447257383965,
"grad_norm": 0.6796875,
"learning_rate": 9.72847906999571e-07,
"loss": 0.2646119296550751,
"step": 1169,
"token_acc": 0.9231905465288035
},
{
"epoch": 0.2468354430379747,
"grad_norm": 0.73828125,
"learning_rate": 9.727929184999235e-07,
"loss": 0.28313902020454407,
"step": 1170,
"token_acc": 0.9207523897625656
},
{
"epoch": 0.2470464135021097,
"grad_norm": 0.5625,
"learning_rate": 9.72737875932404e-07,
"loss": 0.2552646994590759,
"step": 1171,
"token_acc": 0.9269677419354839
},
{
"epoch": 0.24725738396624472,
"grad_norm": 0.8046875,
"learning_rate": 9.726827793033072e-07,
"loss": 0.31298452615737915,
"step": 1172,
"token_acc": 0.9097605893186004
},
{
"epoch": 0.24746835443037973,
"grad_norm": 0.6796875,
"learning_rate": 9.726276286189338e-07,
"loss": 0.262067973613739,
"step": 1173,
"token_acc": 0.927277716794731
},
{
"epoch": 0.24767932489451477,
"grad_norm": 0.78125,
"learning_rate": 9.72572423885591e-07,
"loss": 0.2698643207550049,
"step": 1174,
"token_acc": 0.9250425894378195
},
{
"epoch": 0.2478902953586498,
"grad_norm": 0.5390625,
"learning_rate": 9.725171651095914e-07,
"loss": 0.2563115954399109,
"step": 1175,
"token_acc": 0.9273853081902618
},
{
"epoch": 0.2481012658227848,
"grad_norm": 0.76953125,
"learning_rate": 9.724618522972547e-07,
"loss": 0.2558833658695221,
"step": 1176,
"token_acc": 0.9279216235129462
},
{
"epoch": 0.24831223628691984,
"grad_norm": 0.92578125,
"learning_rate": 9.724064854549066e-07,
"loss": 0.2242472767829895,
"step": 1177,
"token_acc": 0.9351190476190476
},
{
"epoch": 0.24852320675105485,
"grad_norm": 0.66015625,
"learning_rate": 9.723510645888782e-07,
"loss": 0.24171394109725952,
"step": 1178,
"token_acc": 0.9324016899577511
},
{
"epoch": 0.24873417721518987,
"grad_norm": 0.98046875,
"learning_rate": 9.722955897055077e-07,
"loss": 0.2775258719921112,
"step": 1179,
"token_acc": 0.9166889900884008
},
{
"epoch": 0.2489451476793249,
"grad_norm": 0.6484375,
"learning_rate": 9.72240060811139e-07,
"loss": 0.2454729527235031,
"step": 1180,
"token_acc": 0.9307559145989613
},
{
"epoch": 0.24915611814345992,
"grad_norm": 0.91796875,
"learning_rate": 9.721844779121222e-07,
"loss": 0.3090250790119171,
"step": 1181,
"token_acc": 0.9210233592880979
},
{
"epoch": 0.24936708860759493,
"grad_norm": 0.77734375,
"learning_rate": 9.721288410148139e-07,
"loss": 0.2278854250907898,
"step": 1182,
"token_acc": 0.9349336702463676
},
{
"epoch": 0.24957805907172995,
"grad_norm": 0.82421875,
"learning_rate": 9.720731501255766e-07,
"loss": 0.28732359409332275,
"step": 1183,
"token_acc": 0.9273584905660377
},
{
"epoch": 0.249789029535865,
"grad_norm": 0.79296875,
"learning_rate": 9.72017405250779e-07,
"loss": 0.25907230377197266,
"step": 1184,
"token_acc": 0.9236153377967133
},
{
"epoch": 0.25,
"grad_norm": 0.78125,
"learning_rate": 9.71961606396796e-07,
"loss": 0.25944262742996216,
"step": 1185,
"token_acc": 0.9268510258697591
},
{
"epoch": 0.25021097046413504,
"grad_norm": 0.703125,
"learning_rate": 9.719057535700087e-07,
"loss": 0.22414150834083557,
"step": 1186,
"token_acc": 0.9399524375743162
},
{
"epoch": 0.25042194092827,
"grad_norm": 0.7109375,
"learning_rate": 9.71849846776804e-07,
"loss": 0.22336113452911377,
"step": 1187,
"token_acc": 0.9373601789709173
},
{
"epoch": 0.25063291139240507,
"grad_norm": 0.91015625,
"learning_rate": 9.717938860235761e-07,
"loss": 0.266832172870636,
"step": 1188,
"token_acc": 0.9330294530154277
},
{
"epoch": 0.2508438818565401,
"grad_norm": 0.68359375,
"learning_rate": 9.717378713167238e-07,
"loss": 0.2418878674507141,
"step": 1189,
"token_acc": 0.9303391384051329
},
{
"epoch": 0.2510548523206751,
"grad_norm": 1.359375,
"learning_rate": 9.716818026626531e-07,
"loss": 0.2523130774497986,
"step": 1190,
"token_acc": 0.9264534883720931
},
{
"epoch": 0.25126582278481013,
"grad_norm": 0.765625,
"learning_rate": 9.716256800677763e-07,
"loss": 0.24182310700416565,
"step": 1191,
"token_acc": 0.9301447451227187
},
{
"epoch": 0.2514767932489452,
"grad_norm": 0.6875,
"learning_rate": 9.715695035385109e-07,
"loss": 0.2770576477050781,
"step": 1192,
"token_acc": 0.9195145320983711
},
{
"epoch": 0.25168776371308016,
"grad_norm": 0.6953125,
"learning_rate": 9.715132730812816e-07,
"loss": 0.23406967520713806,
"step": 1193,
"token_acc": 0.9375539568345324
},
{
"epoch": 0.2518987341772152,
"grad_norm": 0.734375,
"learning_rate": 9.714569887025185e-07,
"loss": 0.24692702293395996,
"step": 1194,
"token_acc": 0.9298245614035088
},
{
"epoch": 0.2521097046413502,
"grad_norm": 0.7578125,
"learning_rate": 9.714006504086584e-07,
"loss": 0.30922287702560425,
"step": 1195,
"token_acc": 0.9170774137431139
},
{
"epoch": 0.2523206751054852,
"grad_norm": 1.078125,
"learning_rate": 9.71344258206144e-07,
"loss": 0.25276121497154236,
"step": 1196,
"token_acc": 0.9233000322268772
},
{
"epoch": 0.25253164556962027,
"grad_norm": 0.72265625,
"learning_rate": 9.712878121014243e-07,
"loss": 0.24556684494018555,
"step": 1197,
"token_acc": 0.9253149370125975
},
{
"epoch": 0.25274261603375525,
"grad_norm": 0.89453125,
"learning_rate": 9.712313121009545e-07,
"loss": 0.24084654450416565,
"step": 1198,
"token_acc": 0.9290652239254583
},
{
"epoch": 0.2529535864978903,
"grad_norm": 0.86328125,
"learning_rate": 9.711747582111956e-07,
"loss": 0.28878283500671387,
"step": 1199,
"token_acc": 0.9240226986128626
},
{
"epoch": 0.25316455696202533,
"grad_norm": 0.81640625,
"learning_rate": 9.71118150438615e-07,
"loss": 0.2748972177505493,
"step": 1200,
"token_acc": 0.9292035398230089
},
{
"epoch": 0.25316455696202533,
"eval_loss": 0.43360278010368347,
"eval_runtime": 245.4913,
"eval_samples_per_second": 137.296,
"eval_steps_per_second": 2.147,
"eval_token_acc": 0.8991687606569146,
"step": 1200
},
{
"epoch": 0.2533755274261603,
"grad_norm": 0.69921875,
"learning_rate": 9.710614887896864e-07,
"loss": 0.2842106521129608,
"step": 1201,
"token_acc": 0.9268008165645961
},
{
"epoch": 0.25358649789029536,
"grad_norm": 0.6640625,
"learning_rate": 9.710047732708896e-07,
"loss": 0.2714993357658386,
"step": 1202,
"token_acc": 0.9253355704697986
},
{
"epoch": 0.2537974683544304,
"grad_norm": 0.7265625,
"learning_rate": 9.709480038887104e-07,
"loss": 0.2943306863307953,
"step": 1203,
"token_acc": 0.9169639961076873
},
{
"epoch": 0.2540084388185654,
"grad_norm": 0.6796875,
"learning_rate": 9.708911806496409e-07,
"loss": 0.2546621859073639,
"step": 1204,
"token_acc": 0.925904145839459
},
{
"epoch": 0.2542194092827004,
"grad_norm": 0.8828125,
"learning_rate": 9.708343035601792e-07,
"loss": 0.28815045952796936,
"step": 1205,
"token_acc": 0.9163055254604551
},
{
"epoch": 0.25443037974683547,
"grad_norm": 0.78125,
"learning_rate": 9.707773726268297e-07,
"loss": 0.2761947214603424,
"step": 1206,
"token_acc": 0.9241419094317854
},
{
"epoch": 0.25464135021097045,
"grad_norm": 0.671875,
"learning_rate": 9.707203878561032e-07,
"loss": 0.2541855573654175,
"step": 1207,
"token_acc": 0.9287749287749287
},
{
"epoch": 0.2548523206751055,
"grad_norm": 0.73828125,
"learning_rate": 9.706633492545163e-07,
"loss": 0.24647629261016846,
"step": 1208,
"token_acc": 0.9260204081632653
},
{
"epoch": 0.25506329113924053,
"grad_norm": 0.77734375,
"learning_rate": 9.706062568285915e-07,
"loss": 0.26219797134399414,
"step": 1209,
"token_acc": 0.9297507283910651
},
{
"epoch": 0.2552742616033755,
"grad_norm": 0.8671875,
"learning_rate": 9.705491105848582e-07,
"loss": 0.33000296354293823,
"step": 1210,
"token_acc": 0.9137880129908473
},
{
"epoch": 0.25548523206751056,
"grad_norm": 0.94140625,
"learning_rate": 9.704919105298515e-07,
"loss": 0.2628134489059448,
"step": 1211,
"token_acc": 0.9242511520737328
},
{
"epoch": 0.25569620253164554,
"grad_norm": 0.74609375,
"learning_rate": 9.704346566701123e-07,
"loss": 0.2608003616333008,
"step": 1212,
"token_acc": 0.9264790350373349
},
{
"epoch": 0.2559071729957806,
"grad_norm": 0.65234375,
"learning_rate": 9.703773490121888e-07,
"loss": 0.2128331959247589,
"step": 1213,
"token_acc": 0.9373394966615306
},
{
"epoch": 0.2561181434599156,
"grad_norm": 0.7421875,
"learning_rate": 9.703199875626338e-07,
"loss": 0.2637559771537781,
"step": 1214,
"token_acc": 0.9277810133954572
},
{
"epoch": 0.2563291139240506,
"grad_norm": 0.65625,
"learning_rate": 9.702625723280076e-07,
"loss": 0.24556344747543335,
"step": 1215,
"token_acc": 0.9323801012809055
},
{
"epoch": 0.25654008438818565,
"grad_norm": 0.61328125,
"learning_rate": 9.70205103314876e-07,
"loss": 0.25457581877708435,
"step": 1216,
"token_acc": 0.9285921625544267
},
{
"epoch": 0.2567510548523207,
"grad_norm": 0.78515625,
"learning_rate": 9.701475805298111e-07,
"loss": 0.24435514211654663,
"step": 1217,
"token_acc": 0.9288
},
{
"epoch": 0.2569620253164557,
"grad_norm": 0.734375,
"learning_rate": 9.70090003979391e-07,
"loss": 0.27876341342926025,
"step": 1218,
"token_acc": 0.9308375634517766
},
{
"epoch": 0.2571729957805907,
"grad_norm": 0.890625,
"learning_rate": 9.700323736702003e-07,
"loss": 0.2775731682777405,
"step": 1219,
"token_acc": 0.9191780821917809
},
{
"epoch": 0.25738396624472576,
"grad_norm": 0.77734375,
"learning_rate": 9.699746896088293e-07,
"loss": 0.28725969791412354,
"step": 1220,
"token_acc": 0.9266795865633075
},
{
"epoch": 0.25759493670886074,
"grad_norm": 0.62109375,
"learning_rate": 9.699169518018747e-07,
"loss": 0.2132992148399353,
"step": 1221,
"token_acc": 0.9392470051340559
},
{
"epoch": 0.2578059071729958,
"grad_norm": 0.69140625,
"learning_rate": 9.698591602559392e-07,
"loss": 0.22622618079185486,
"step": 1222,
"token_acc": 0.9333721268548153
},
{
"epoch": 0.2580168776371308,
"grad_norm": 0.71484375,
"learning_rate": 9.698013149776318e-07,
"loss": 0.260597288608551,
"step": 1223,
"token_acc": 0.9280405405405405
},
{
"epoch": 0.2582278481012658,
"grad_norm": 0.57421875,
"learning_rate": 9.697434159735679e-07,
"loss": 0.24745848774909973,
"step": 1224,
"token_acc": 0.9323770491803278
},
{
"epoch": 0.25843881856540085,
"grad_norm": 0.66015625,
"learning_rate": 9.696854632503684e-07,
"loss": 0.2582542896270752,
"step": 1225,
"token_acc": 0.9207642596234897
},
{
"epoch": 0.2586497890295359,
"grad_norm": 0.765625,
"learning_rate": 9.696274568146607e-07,
"loss": 0.23025202751159668,
"step": 1226,
"token_acc": 0.9355661881977672
},
{
"epoch": 0.2588607594936709,
"grad_norm": 0.70703125,
"learning_rate": 9.695693966730786e-07,
"loss": 0.2466697096824646,
"step": 1227,
"token_acc": 0.9332688588007737
},
{
"epoch": 0.2590717299578059,
"grad_norm": 0.5859375,
"learning_rate": 9.695112828322614e-07,
"loss": 0.24845033884048462,
"step": 1228,
"token_acc": 0.9291573452647278
},
{
"epoch": 0.2592827004219409,
"grad_norm": 0.87890625,
"learning_rate": 9.694531152988553e-07,
"loss": 0.27860498428344727,
"step": 1229,
"token_acc": 0.921844342707652
},
{
"epoch": 0.25949367088607594,
"grad_norm": 0.62109375,
"learning_rate": 9.69394894079512e-07,
"loss": 0.2347266525030136,
"step": 1230,
"token_acc": 0.9367695534911598
},
{
"epoch": 0.259704641350211,
"grad_norm": 1.0234375,
"learning_rate": 9.693366191808895e-07,
"loss": 0.2848036289215088,
"step": 1231,
"token_acc": 0.9186012342051131
},
{
"epoch": 0.25991561181434597,
"grad_norm": 0.71875,
"learning_rate": 9.692782906096522e-07,
"loss": 0.271634578704834,
"step": 1232,
"token_acc": 0.9254937163375224
},
{
"epoch": 0.260126582278481,
"grad_norm": 0.703125,
"learning_rate": 9.692199083724704e-07,
"loss": 0.3552762567996979,
"step": 1233,
"token_acc": 0.9169062679700978
},
{
"epoch": 0.26033755274261605,
"grad_norm": 0.69140625,
"learning_rate": 9.691614724760208e-07,
"loss": 0.2829202115535736,
"step": 1234,
"token_acc": 0.92018779342723
},
{
"epoch": 0.26054852320675104,
"grad_norm": 0.91015625,
"learning_rate": 9.691029829269856e-07,
"loss": 0.24786871671676636,
"step": 1235,
"token_acc": 0.9295311214697011
},
{
"epoch": 0.2607594936708861,
"grad_norm": 0.69921875,
"learning_rate": 9.690444397320543e-07,
"loss": 0.23441259562969208,
"step": 1236,
"token_acc": 0.9349985807550383
},
{
"epoch": 0.2609704641350211,
"grad_norm": 0.67578125,
"learning_rate": 9.68985842897921e-07,
"loss": 0.21858146786689758,
"step": 1237,
"token_acc": 0.9393670511682934
},
{
"epoch": 0.2611814345991561,
"grad_norm": 0.7890625,
"learning_rate": 9.689271924312873e-07,
"loss": 0.28490114212036133,
"step": 1238,
"token_acc": 0.9234604105571848
},
{
"epoch": 0.26139240506329114,
"grad_norm": 0.7421875,
"learning_rate": 9.688684883388598e-07,
"loss": 0.26761138439178467,
"step": 1239,
"token_acc": 0.9277440706012134
},
{
"epoch": 0.2616033755274262,
"grad_norm": 0.70703125,
"learning_rate": 9.688097306273525e-07,
"loss": 0.2610659599304199,
"step": 1240,
"token_acc": 0.9297409943435546
},
{
"epoch": 0.26181434599156117,
"grad_norm": 0.59765625,
"learning_rate": 9.687509193034844e-07,
"loss": 0.22601675987243652,
"step": 1241,
"token_acc": 0.9416980118216013
},
{
"epoch": 0.2620253164556962,
"grad_norm": 0.78515625,
"learning_rate": 9.686920543739815e-07,
"loss": 0.248170405626297,
"step": 1242,
"token_acc": 0.9330609679446888
},
{
"epoch": 0.2622362869198312,
"grad_norm": 0.68359375,
"learning_rate": 9.686331358455747e-07,
"loss": 0.23889891803264618,
"step": 1243,
"token_acc": 0.9359560841720037
},
{
"epoch": 0.26244725738396624,
"grad_norm": 0.640625,
"learning_rate": 9.685741637250027e-07,
"loss": 0.2646980285644531,
"step": 1244,
"token_acc": 0.9301587301587302
},
{
"epoch": 0.2626582278481013,
"grad_norm": 0.70703125,
"learning_rate": 9.68515138019009e-07,
"loss": 0.23318523168563843,
"step": 1245,
"token_acc": 0.9326036866359447
},
{
"epoch": 0.26286919831223626,
"grad_norm": 0.7265625,
"learning_rate": 9.684560587343439e-07,
"loss": 0.25974488258361816,
"step": 1246,
"token_acc": 0.9271044258027191
},
{
"epoch": 0.2630801687763713,
"grad_norm": 0.78125,
"learning_rate": 9.683969258777634e-07,
"loss": 0.2580402195453644,
"step": 1247,
"token_acc": 0.9221480775341595
},
{
"epoch": 0.26329113924050634,
"grad_norm": 0.65234375,
"learning_rate": 9.6833773945603e-07,
"loss": 0.26659655570983887,
"step": 1248,
"token_acc": 0.9241719930273097
},
{
"epoch": 0.26350210970464133,
"grad_norm": 0.6875,
"learning_rate": 9.68278499475912e-07,
"loss": 0.24891090393066406,
"step": 1249,
"token_acc": 0.9277293695540748
},
{
"epoch": 0.26371308016877637,
"grad_norm": 0.80078125,
"learning_rate": 9.68219205944184e-07,
"loss": 0.23200133442878723,
"step": 1250,
"token_acc": 0.9329558998808105
},
{
"epoch": 0.2639240506329114,
"grad_norm": 0.8359375,
"learning_rate": 9.68159858867627e-07,
"loss": 0.2909882664680481,
"step": 1251,
"token_acc": 0.9168533034714446
},
{
"epoch": 0.2641350210970464,
"grad_norm": 0.703125,
"learning_rate": 9.681004582530279e-07,
"loss": 0.2549628019332886,
"step": 1252,
"token_acc": 0.924122926298613
},
{
"epoch": 0.26434599156118144,
"grad_norm": 0.8984375,
"learning_rate": 9.68041004107179e-07,
"loss": 0.24394650757312775,
"step": 1253,
"token_acc": 0.9308855291576674
},
{
"epoch": 0.2645569620253165,
"grad_norm": 0.609375,
"learning_rate": 9.6798149643688e-07,
"loss": 0.2381609082221985,
"step": 1254,
"token_acc": 0.9395761741122566
},
{
"epoch": 0.26476793248945146,
"grad_norm": 0.8515625,
"learning_rate": 9.67921935248936e-07,
"loss": 0.2466859370470047,
"step": 1255,
"token_acc": 0.9292594822396147
},
{
"epoch": 0.2649789029535865,
"grad_norm": 0.73828125,
"learning_rate": 9.67862320550158e-07,
"loss": 0.2523610591888428,
"step": 1256,
"token_acc": 0.9285103958986044
},
{
"epoch": 0.26518987341772154,
"grad_norm": 0.75,
"learning_rate": 9.67802652347364e-07,
"loss": 0.25371021032333374,
"step": 1257,
"token_acc": 0.9257921067259589
},
{
"epoch": 0.26540084388185653,
"grad_norm": 0.84765625,
"learning_rate": 9.67742930647377e-07,
"loss": 0.26243138313293457,
"step": 1258,
"token_acc": 0.9317498496692724
},
{
"epoch": 0.26561181434599157,
"grad_norm": 0.72265625,
"learning_rate": 9.67683155457027e-07,
"loss": 0.26387542486190796,
"step": 1259,
"token_acc": 0.9278320023661638
},
{
"epoch": 0.26582278481012656,
"grad_norm": 0.71484375,
"learning_rate": 9.6762332678315e-07,
"loss": 0.2729257047176361,
"step": 1260,
"token_acc": 0.9249352890422778
},
{
"epoch": 0.2660337552742616,
"grad_norm": 0.79296875,
"learning_rate": 9.675634446325873e-07,
"loss": 0.3026050329208374,
"step": 1261,
"token_acc": 0.9185979142526072
},
{
"epoch": 0.26624472573839664,
"grad_norm": 0.71484375,
"learning_rate": 9.675035090121875e-07,
"loss": 0.2565396726131439,
"step": 1262,
"token_acc": 0.9267410310521556
},
{
"epoch": 0.2664556962025316,
"grad_norm": 0.67578125,
"learning_rate": 9.674435199288045e-07,
"loss": 0.25251418352127075,
"step": 1263,
"token_acc": 0.9283837056504599
},
{
"epoch": 0.26666666666666666,
"grad_norm": 0.63671875,
"learning_rate": 9.673834773892984e-07,
"loss": 0.24896693229675293,
"step": 1264,
"token_acc": 0.930945558739255
},
{
"epoch": 0.2668776371308017,
"grad_norm": 1.203125,
"learning_rate": 9.673233814005359e-07,
"loss": 0.23103883862495422,
"step": 1265,
"token_acc": 0.9342560553633218
},
{
"epoch": 0.2670886075949367,
"grad_norm": 1.0703125,
"learning_rate": 9.672632319693894e-07,
"loss": 0.24049502611160278,
"step": 1266,
"token_acc": 0.9309718437783833
},
{
"epoch": 0.26729957805907173,
"grad_norm": 0.72265625,
"learning_rate": 9.672030291027374e-07,
"loss": 0.26001042127609253,
"step": 1267,
"token_acc": 0.9262520638414969
},
{
"epoch": 0.26751054852320677,
"grad_norm": 0.72265625,
"learning_rate": 9.671427728074644e-07,
"loss": 0.28741368651390076,
"step": 1268,
"token_acc": 0.9232505643340858
},
{
"epoch": 0.26772151898734176,
"grad_norm": 0.84765625,
"learning_rate": 9.670824630904617e-07,
"loss": 0.27266865968704224,
"step": 1269,
"token_acc": 0.9303104077906269
},
{
"epoch": 0.2679324894514768,
"grad_norm": 0.6953125,
"learning_rate": 9.67022099958626e-07,
"loss": 0.23278406262397766,
"step": 1270,
"token_acc": 0.9318857822724569
},
{
"epoch": 0.26814345991561184,
"grad_norm": 0.6640625,
"learning_rate": 9.669616834188604e-07,
"loss": 0.24085257947444916,
"step": 1271,
"token_acc": 0.9365031597953656
},
{
"epoch": 0.2683544303797468,
"grad_norm": 0.7265625,
"learning_rate": 9.66901213478074e-07,
"loss": 0.23926323652267456,
"step": 1272,
"token_acc": 0.9300341296928327
},
{
"epoch": 0.26856540084388186,
"grad_norm": 0.69921875,
"learning_rate": 9.668406901431816e-07,
"loss": 0.2380378544330597,
"step": 1273,
"token_acc": 0.9354371742906775
},
{
"epoch": 0.2687763713080169,
"grad_norm": 0.8046875,
"learning_rate": 9.667801134211054e-07,
"loss": 0.2831774353981018,
"step": 1274,
"token_acc": 0.9198927933293627
},
{
"epoch": 0.2689873417721519,
"grad_norm": 0.7109375,
"learning_rate": 9.667194833187722e-07,
"loss": 0.25443994998931885,
"step": 1275,
"token_acc": 0.927960927960928
},
{
"epoch": 0.26919831223628693,
"grad_norm": 0.68359375,
"learning_rate": 9.66658799843116e-07,
"loss": 0.2516897916793823,
"step": 1276,
"token_acc": 0.9287622439893143
},
{
"epoch": 0.2694092827004219,
"grad_norm": 0.890625,
"learning_rate": 9.665980630010762e-07,
"loss": 0.2877979874610901,
"step": 1277,
"token_acc": 0.9202143495087823
},
{
"epoch": 0.26962025316455696,
"grad_norm": 0.74609375,
"learning_rate": 9.665372727995985e-07,
"loss": 0.2647358775138855,
"step": 1278,
"token_acc": 0.9289904291447978
},
{
"epoch": 0.269831223628692,
"grad_norm": 0.66015625,
"learning_rate": 9.66476429245635e-07,
"loss": 0.22759586572647095,
"step": 1279,
"token_acc": 0.9376068376068376
},
{
"epoch": 0.270042194092827,
"grad_norm": 0.65234375,
"learning_rate": 9.664155323461436e-07,
"loss": 0.25757282972335815,
"step": 1280,
"token_acc": 0.927784222737819
},
{
"epoch": 0.270253164556962,
"grad_norm": 0.6796875,
"learning_rate": 9.663545821080884e-07,
"loss": 0.26837342977523804,
"step": 1281,
"token_acc": 0.9224065223502952
},
{
"epoch": 0.27046413502109706,
"grad_norm": 0.6484375,
"learning_rate": 9.662935785384395e-07,
"loss": 0.28320473432540894,
"step": 1282,
"token_acc": 0.9173576561636263
},
{
"epoch": 0.27067510548523205,
"grad_norm": 0.859375,
"learning_rate": 9.662325216441733e-07,
"loss": 0.23279771208763123,
"step": 1283,
"token_acc": 0.9394752534287418
},
{
"epoch": 0.2708860759493671,
"grad_norm": 0.671875,
"learning_rate": 9.66171411432272e-07,
"loss": 0.2523917555809021,
"step": 1284,
"token_acc": 0.9287851585876721
},
{
"epoch": 0.27109704641350213,
"grad_norm": 0.69921875,
"learning_rate": 9.661102479097241e-07,
"loss": 0.2521704435348511,
"step": 1285,
"token_acc": 0.9344652963955913
},
{
"epoch": 0.2713080168776371,
"grad_norm": 1.0,
"learning_rate": 9.660490310835243e-07,
"loss": 0.22997595369815826,
"step": 1286,
"token_acc": 0.9331612162937464
},
{
"epoch": 0.27151898734177216,
"grad_norm": 0.734375,
"learning_rate": 9.659877609606732e-07,
"loss": 0.2617974579334259,
"step": 1287,
"token_acc": 0.9331476323119777
},
{
"epoch": 0.2717299578059072,
"grad_norm": 0.796875,
"learning_rate": 9.659264375481777e-07,
"loss": 0.2871522605419159,
"step": 1288,
"token_acc": 0.9222536984576645
},
{
"epoch": 0.2719409282700422,
"grad_norm": 0.828125,
"learning_rate": 9.658650608530503e-07,
"loss": 0.2694163918495178,
"step": 1289,
"token_acc": 0.9271523178807947
},
{
"epoch": 0.2721518987341772,
"grad_norm": 1.015625,
"learning_rate": 9.658036308823101e-07,
"loss": 0.2750623822212219,
"step": 1290,
"token_acc": 0.9309335674568335
},
{
"epoch": 0.27236286919831226,
"grad_norm": 1.0703125,
"learning_rate": 9.657421476429823e-07,
"loss": 0.24293242394924164,
"step": 1291,
"token_acc": 0.9321070234113712
},
{
"epoch": 0.27257383966244725,
"grad_norm": 0.734375,
"learning_rate": 9.656806111420978e-07,
"loss": 0.26937633752822876,
"step": 1292,
"token_acc": 0.9326610279765778
},
{
"epoch": 0.2727848101265823,
"grad_norm": 0.75,
"learning_rate": 9.656190213866942e-07,
"loss": 0.258544385433197,
"step": 1293,
"token_acc": 0.9324324324324325
},
{
"epoch": 0.2729957805907173,
"grad_norm": 0.80078125,
"learning_rate": 9.655573783838142e-07,
"loss": 0.30913591384887695,
"step": 1294,
"token_acc": 0.9182915506035283
},
{
"epoch": 0.2732067510548523,
"grad_norm": 0.70703125,
"learning_rate": 9.654956821405076e-07,
"loss": 0.2871444821357727,
"step": 1295,
"token_acc": 0.9226289517470881
},
{
"epoch": 0.27341772151898736,
"grad_norm": 0.71484375,
"learning_rate": 9.6543393266383e-07,
"loss": 0.2559030055999756,
"step": 1296,
"token_acc": 0.9310548025928108
},
{
"epoch": 0.27362869198312234,
"grad_norm": 0.875,
"learning_rate": 9.653721299608425e-07,
"loss": 0.2956152558326721,
"step": 1297,
"token_acc": 0.9208739297313256
},
{
"epoch": 0.2738396624472574,
"grad_norm": 0.71484375,
"learning_rate": 9.653102740386134e-07,
"loss": 0.21772214770317078,
"step": 1298,
"token_acc": 0.9364857302742026
},
{
"epoch": 0.2740506329113924,
"grad_norm": 0.734375,
"learning_rate": 9.65248364904216e-07,
"loss": 0.30199459195137024,
"step": 1299,
"token_acc": 0.9205334815226451
},
{
"epoch": 0.2742616033755274,
"grad_norm": 0.87109375,
"learning_rate": 9.651864025647303e-07,
"loss": 0.24779856204986572,
"step": 1300,
"token_acc": 0.9276583381754794
},
{
"epoch": 0.27447257383966245,
"grad_norm": 0.68359375,
"learning_rate": 9.65124387027242e-07,
"loss": 0.2366098165512085,
"step": 1301,
"token_acc": 0.931989247311828
},
{
"epoch": 0.2746835443037975,
"grad_norm": 0.609375,
"learning_rate": 9.650623182988434e-07,
"loss": 0.24503357708454132,
"step": 1302,
"token_acc": 0.9304397815464214
},
{
"epoch": 0.2748945147679325,
"grad_norm": 0.765625,
"learning_rate": 9.650001963866324e-07,
"loss": 0.24037089943885803,
"step": 1303,
"token_acc": 0.9337579617834395
},
{
"epoch": 0.2751054852320675,
"grad_norm": 1.21875,
"learning_rate": 9.64938021297713e-07,
"loss": 0.24094949662685394,
"step": 1304,
"token_acc": 0.9320071258907363
},
{
"epoch": 0.27531645569620256,
"grad_norm": 0.8125,
"learning_rate": 9.64875793039196e-07,
"loss": 0.26656514406204224,
"step": 1305,
"token_acc": 0.9331143624962676
},
{
"epoch": 0.27552742616033754,
"grad_norm": 0.71484375,
"learning_rate": 9.64813511618197e-07,
"loss": 0.27888917922973633,
"step": 1306,
"token_acc": 0.9265919811320755
},
{
"epoch": 0.2757383966244726,
"grad_norm": 0.89453125,
"learning_rate": 9.64751177041839e-07,
"loss": 0.2879449725151062,
"step": 1307,
"token_acc": 0.9250924784217016
},
{
"epoch": 0.2759493670886076,
"grad_norm": 0.71875,
"learning_rate": 9.646887893172504e-07,
"loss": 0.2646368443965912,
"step": 1308,
"token_acc": 0.9223654708520179
},
{
"epoch": 0.2761603375527426,
"grad_norm": 0.82421875,
"learning_rate": 9.646263484515657e-07,
"loss": 0.23699407279491425,
"step": 1309,
"token_acc": 0.9358803986710963
},
{
"epoch": 0.27637130801687765,
"grad_norm": 0.66015625,
"learning_rate": 9.645638544519253e-07,
"loss": 0.20495551824569702,
"step": 1310,
"token_acc": 0.9403642773207991
},
{
"epoch": 0.27658227848101263,
"grad_norm": 0.73828125,
"learning_rate": 9.645013073254762e-07,
"loss": 0.24341881275177002,
"step": 1311,
"token_acc": 0.9322884012539185
},
{
"epoch": 0.2767932489451477,
"grad_norm": 0.640625,
"learning_rate": 9.64438707079371e-07,
"loss": 0.23052801191806793,
"step": 1312,
"token_acc": 0.9312657166806371
},
{
"epoch": 0.2770042194092827,
"grad_norm": 0.7109375,
"learning_rate": 9.643760537207688e-07,
"loss": 0.3025025725364685,
"step": 1313,
"token_acc": 0.9218884120171674
},
{
"epoch": 0.2772151898734177,
"grad_norm": 0.640625,
"learning_rate": 9.643133472568344e-07,
"loss": 0.20893415808677673,
"step": 1314,
"token_acc": 0.9359681372549019
},
{
"epoch": 0.27742616033755274,
"grad_norm": 0.8046875,
"learning_rate": 9.642505876947386e-07,
"loss": 0.2443920075893402,
"step": 1315,
"token_acc": 0.928676254769592
},
{
"epoch": 0.2776371308016878,
"grad_norm": 1.234375,
"learning_rate": 9.64187775041659e-07,
"loss": 0.25716376304626465,
"step": 1316,
"token_acc": 0.9291851851851852
},
{
"epoch": 0.27784810126582277,
"grad_norm": 0.68359375,
"learning_rate": 9.641249093047784e-07,
"loss": 0.23076121509075165,
"step": 1317,
"token_acc": 0.9285714285714286
},
{
"epoch": 0.2780590717299578,
"grad_norm": 0.9375,
"learning_rate": 9.640619904912859e-07,
"loss": 0.2742810845375061,
"step": 1318,
"token_acc": 0.9297956493078444
},
{
"epoch": 0.27827004219409285,
"grad_norm": 0.609375,
"learning_rate": 9.63999018608377e-07,
"loss": 0.22001489996910095,
"step": 1319,
"token_acc": 0.936816192560175
},
{
"epoch": 0.27848101265822783,
"grad_norm": 0.65625,
"learning_rate": 9.639359936632535e-07,
"loss": 0.2529153525829315,
"step": 1320,
"token_acc": 0.9257350712337071
},
{
"epoch": 0.2786919831223629,
"grad_norm": 0.65625,
"learning_rate": 9.63872915663122e-07,
"loss": 0.23905983567237854,
"step": 1321,
"token_acc": 0.9326950971859588
},
{
"epoch": 0.2789029535864979,
"grad_norm": 0.63671875,
"learning_rate": 9.638097846151965e-07,
"loss": 0.2406499683856964,
"step": 1322,
"token_acc": 0.930352798053528
},
{
"epoch": 0.2791139240506329,
"grad_norm": 0.98046875,
"learning_rate": 9.637466005266963e-07,
"loss": 0.26858270168304443,
"step": 1323,
"token_acc": 0.9200705260064649
},
{
"epoch": 0.27932489451476794,
"grad_norm": 0.859375,
"learning_rate": 9.636833634048475e-07,
"loss": 0.2660675644874573,
"step": 1324,
"token_acc": 0.9298245614035088
},
{
"epoch": 0.2795358649789029,
"grad_norm": 0.77734375,
"learning_rate": 9.636200732568814e-07,
"loss": 0.26166456937789917,
"step": 1325,
"token_acc": 0.9308510638297872
},
{
"epoch": 0.27974683544303797,
"grad_norm": 0.8671875,
"learning_rate": 9.63556730090036e-07,
"loss": 0.2833145260810852,
"step": 1326,
"token_acc": 0.9278801123946301
},
{
"epoch": 0.279957805907173,
"grad_norm": 0.81640625,
"learning_rate": 9.634933339115547e-07,
"loss": 0.28151270747184753,
"step": 1327,
"token_acc": 0.9252669039145908
},
{
"epoch": 0.280168776371308,
"grad_norm": 0.76953125,
"learning_rate": 9.63429884728688e-07,
"loss": 0.2141590267419815,
"step": 1328,
"token_acc": 0.9334285714285714
},
{
"epoch": 0.28037974683544303,
"grad_norm": 0.79296875,
"learning_rate": 9.633663825486914e-07,
"loss": 0.2234039306640625,
"step": 1329,
"token_acc": 0.9323050556983719
},
{
"epoch": 0.2805907172995781,
"grad_norm": 0.7734375,
"learning_rate": 9.633028273788272e-07,
"loss": 0.2672632336616516,
"step": 1330,
"token_acc": 0.9254259080681453
},
{
"epoch": 0.28080168776371306,
"grad_norm": 0.69140625,
"learning_rate": 9.63239219226363e-07,
"loss": 0.2287733405828476,
"step": 1331,
"token_acc": 0.9398969662278192
},
{
"epoch": 0.2810126582278481,
"grad_norm": 1.34375,
"learning_rate": 9.631755580985735e-07,
"loss": 0.26213157176971436,
"step": 1332,
"token_acc": 0.9299184505606524
},
{
"epoch": 0.28122362869198314,
"grad_norm": 0.79296875,
"learning_rate": 9.631118440027386e-07,
"loss": 0.28404700756073,
"step": 1333,
"token_acc": 0.9141087017741482
},
{
"epoch": 0.2814345991561181,
"grad_norm": 0.7109375,
"learning_rate": 9.630480769461447e-07,
"loss": 0.239895761013031,
"step": 1334,
"token_acc": 0.9308807134894092
},
{
"epoch": 0.28164556962025317,
"grad_norm": 0.78125,
"learning_rate": 9.629842569360838e-07,
"loss": 0.26395344734191895,
"step": 1335,
"token_acc": 0.9277073732718893
},
{
"epoch": 0.2818565400843882,
"grad_norm": 0.609375,
"learning_rate": 9.629203839798546e-07,
"loss": 0.21334236860275269,
"step": 1336,
"token_acc": 0.936906584992343
},
{
"epoch": 0.2820675105485232,
"grad_norm": 0.6796875,
"learning_rate": 9.628564580847613e-07,
"loss": 0.23438285291194916,
"step": 1337,
"token_acc": 0.9287510477787091
},
{
"epoch": 0.28227848101265823,
"grad_norm": 0.74609375,
"learning_rate": 9.627924792581143e-07,
"loss": 0.26787251234054565,
"step": 1338,
"token_acc": 0.9322139303482587
},
{
"epoch": 0.2824894514767933,
"grad_norm": 0.62890625,
"learning_rate": 9.627284475072303e-07,
"loss": 0.25793296098709106,
"step": 1339,
"token_acc": 0.927893175074184
},
{
"epoch": 0.28270042194092826,
"grad_norm": 0.71484375,
"learning_rate": 9.626643628394321e-07,
"loss": 0.2458716779947281,
"step": 1340,
"token_acc": 0.9302045728038508
},
{
"epoch": 0.2829113924050633,
"grad_norm": 0.73828125,
"learning_rate": 9.626002252620478e-07,
"loss": 0.2576259970664978,
"step": 1341,
"token_acc": 0.9324796274738067
},
{
"epoch": 0.2831223628691983,
"grad_norm": 0.7421875,
"learning_rate": 9.625360347824123e-07,
"loss": 0.27822190523147583,
"step": 1342,
"token_acc": 0.9268867924528302
},
{
"epoch": 0.2833333333333333,
"grad_norm": 0.71875,
"learning_rate": 9.624717914078666e-07,
"loss": 0.2430208921432495,
"step": 1343,
"token_acc": 0.9362327358213341
},
{
"epoch": 0.28354430379746837,
"grad_norm": 0.9453125,
"learning_rate": 9.62407495145757e-07,
"loss": 0.2376978099346161,
"step": 1344,
"token_acc": 0.9352154531946508
},
{
"epoch": 0.28375527426160335,
"grad_norm": 0.69921875,
"learning_rate": 9.623431460034365e-07,
"loss": 0.28660112619400024,
"step": 1345,
"token_acc": 0.9231664726426076
},
{
"epoch": 0.2839662447257384,
"grad_norm": 1.078125,
"learning_rate": 9.622787439882642e-07,
"loss": 0.22470048069953918,
"step": 1346,
"token_acc": 0.9390454284071306
},
{
"epoch": 0.28417721518987343,
"grad_norm": 0.78125,
"learning_rate": 9.622142891076049e-07,
"loss": 0.3022032380104065,
"step": 1347,
"token_acc": 0.9156626506024096
},
{
"epoch": 0.2843881856540084,
"grad_norm": 0.890625,
"learning_rate": 9.621497813688292e-07,
"loss": 0.24632784724235535,
"step": 1348,
"token_acc": 0.9298975672215108
},
{
"epoch": 0.28459915611814346,
"grad_norm": 0.6328125,
"learning_rate": 9.620852207793146e-07,
"loss": 0.26282811164855957,
"step": 1349,
"token_acc": 0.9305875576036866
},
{
"epoch": 0.2848101265822785,
"grad_norm": 0.7109375,
"learning_rate": 9.620206073464438e-07,
"loss": 0.2368626892566681,
"step": 1350,
"token_acc": 0.9320417287630403
},
{
"epoch": 0.2850210970464135,
"grad_norm": 0.7421875,
"learning_rate": 9.619559410776062e-07,
"loss": 0.27300721406936646,
"step": 1351,
"token_acc": 0.9214989059080962
},
{
"epoch": 0.2852320675105485,
"grad_norm": 0.73046875,
"learning_rate": 9.618912219801968e-07,
"loss": 0.3053458333015442,
"step": 1352,
"token_acc": 0.9201213346814965
},
{
"epoch": 0.28544303797468357,
"grad_norm": 0.796875,
"learning_rate": 9.618264500616164e-07,
"loss": 0.2645854353904724,
"step": 1353,
"token_acc": 0.9302251823660006
},
{
"epoch": 0.28565400843881855,
"grad_norm": 0.6640625,
"learning_rate": 9.61761625329273e-07,
"loss": 0.27379700541496277,
"step": 1354,
"token_acc": 0.9288623404833016
},
{
"epoch": 0.2858649789029536,
"grad_norm": 0.6640625,
"learning_rate": 9.61696747790579e-07,
"loss": 0.2626631557941437,
"step": 1355,
"token_acc": 0.9282009282009283
},
{
"epoch": 0.28607594936708863,
"grad_norm": 0.69140625,
"learning_rate": 9.616318174529544e-07,
"loss": 0.2584306001663208,
"step": 1356,
"token_acc": 0.9292520935604967
},
{
"epoch": 0.2862869198312236,
"grad_norm": 0.55078125,
"learning_rate": 9.615668343238243e-07,
"loss": 0.25304126739501953,
"step": 1357,
"token_acc": 0.9346613545816733
},
{
"epoch": 0.28649789029535866,
"grad_norm": 1.171875,
"learning_rate": 9.6150179841062e-07,
"loss": 0.3054851293563843,
"step": 1358,
"token_acc": 0.9169445286450439
},
{
"epoch": 0.28670886075949364,
"grad_norm": 0.86328125,
"learning_rate": 9.614367097207788e-07,
"loss": 0.33616381883621216,
"step": 1359,
"token_acc": 0.9140366696723775
},
{
"epoch": 0.2869198312236287,
"grad_norm": 0.703125,
"learning_rate": 9.613715682617442e-07,
"loss": 0.2768439054489136,
"step": 1360,
"token_acc": 0.9238784370477569
},
{
"epoch": 0.2871308016877637,
"grad_norm": 0.6875,
"learning_rate": 9.61306374040966e-07,
"loss": 0.2657453417778015,
"step": 1361,
"token_acc": 0.9254338394793926
},
{
"epoch": 0.2873417721518987,
"grad_norm": 0.765625,
"learning_rate": 9.612411270658994e-07,
"loss": 0.30385810136795044,
"step": 1362,
"token_acc": 0.9217443249701314
},
{
"epoch": 0.28755274261603375,
"grad_norm": 0.83203125,
"learning_rate": 9.611758273440058e-07,
"loss": 0.2552698254585266,
"step": 1363,
"token_acc": 0.9243302954984811
},
{
"epoch": 0.2877637130801688,
"grad_norm": 0.68359375,
"learning_rate": 9.611104748827533e-07,
"loss": 0.290875107049942,
"step": 1364,
"token_acc": 0.9198113207547169
},
{
"epoch": 0.2879746835443038,
"grad_norm": 0.71484375,
"learning_rate": 9.61045069689615e-07,
"loss": 0.24223566055297852,
"step": 1365,
"token_acc": 0.931023419955085
},
{
"epoch": 0.2881856540084388,
"grad_norm": 0.66015625,
"learning_rate": 9.609796117720708e-07,
"loss": 0.2444673329591751,
"step": 1366,
"token_acc": 0.933668936926266
},
{
"epoch": 0.28839662447257386,
"grad_norm": 0.875,
"learning_rate": 9.609141011376062e-07,
"loss": 0.28656303882598877,
"step": 1367,
"token_acc": 0.9205431956082057
},
{
"epoch": 0.28860759493670884,
"grad_norm": 0.71484375,
"learning_rate": 9.60848537793713e-07,
"loss": 0.22629567980766296,
"step": 1368,
"token_acc": 0.9353916523727844
},
{
"epoch": 0.2888185654008439,
"grad_norm": 0.7265625,
"learning_rate": 9.60782921747889e-07,
"loss": 0.21632830798625946,
"step": 1369,
"token_acc": 0.9382108822625269
},
{
"epoch": 0.2890295358649789,
"grad_norm": 0.95703125,
"learning_rate": 9.607172530076377e-07,
"loss": 0.27289730310440063,
"step": 1370,
"token_acc": 0.9276901987662782
},
{
"epoch": 0.2892405063291139,
"grad_norm": 0.671875,
"learning_rate": 9.606515315804691e-07,
"loss": 0.23308590054512024,
"step": 1371,
"token_acc": 0.933515731874145
},
{
"epoch": 0.28945147679324895,
"grad_norm": 0.7578125,
"learning_rate": 9.605857574738991e-07,
"loss": 0.23849669098854065,
"step": 1372,
"token_acc": 0.9353348729792148
},
{
"epoch": 0.289662447257384,
"grad_norm": 1.3828125,
"learning_rate": 9.605199306954492e-07,
"loss": 0.24058642983436584,
"step": 1373,
"token_acc": 0.9371130661453242
},
{
"epoch": 0.289873417721519,
"grad_norm": 0.67578125,
"learning_rate": 9.604540512526475e-07,
"loss": 0.27291715145111084,
"step": 1374,
"token_acc": 0.92439293598234
},
{
"epoch": 0.290084388185654,
"grad_norm": 0.75390625,
"learning_rate": 9.603881191530279e-07,
"loss": 0.26681673526763916,
"step": 1375,
"token_acc": 0.9251798561151079
},
{
"epoch": 0.290295358649789,
"grad_norm": 0.6796875,
"learning_rate": 9.6032213440413e-07,
"loss": 0.3197612166404724,
"step": 1376,
"token_acc": 0.9159369527145359
},
{
"epoch": 0.29050632911392404,
"grad_norm": 0.70703125,
"learning_rate": 9.602560970134998e-07,
"loss": 0.2690644860267639,
"step": 1377,
"token_acc": 0.9309221244705116
},
{
"epoch": 0.2907172995780591,
"grad_norm": 0.65234375,
"learning_rate": 9.601900069886896e-07,
"loss": 0.2277158945798874,
"step": 1378,
"token_acc": 0.9329285920786354
},
{
"epoch": 0.29092827004219407,
"grad_norm": 0.75390625,
"learning_rate": 9.601238643372568e-07,
"loss": 0.2508387565612793,
"step": 1379,
"token_acc": 0.9271716101694916
},
{
"epoch": 0.2911392405063291,
"grad_norm": 0.78125,
"learning_rate": 9.600576690667659e-07,
"loss": 0.30543452501296997,
"step": 1380,
"token_acc": 0.9198903441973805
},
{
"epoch": 0.29135021097046415,
"grad_norm": 0.6796875,
"learning_rate": 9.599914211847866e-07,
"loss": 0.2164490967988968,
"step": 1381,
"token_acc": 0.9413218035824583
},
{
"epoch": 0.29156118143459914,
"grad_norm": 0.6953125,
"learning_rate": 9.59925120698895e-07,
"loss": 0.2733488082885742,
"step": 1382,
"token_acc": 0.9247685185185185
},
{
"epoch": 0.2917721518987342,
"grad_norm": 0.59375,
"learning_rate": 9.59858767616673e-07,
"loss": 0.24653790891170502,
"step": 1383,
"token_acc": 0.9360535931790499
},
{
"epoch": 0.2919831223628692,
"grad_norm": 0.77734375,
"learning_rate": 9.597923619457085e-07,
"loss": 0.297510027885437,
"step": 1384,
"token_acc": 0.9154051647373108
},
{
"epoch": 0.2921940928270042,
"grad_norm": 0.70703125,
"learning_rate": 9.59725903693596e-07,
"loss": 0.2266286015510559,
"step": 1385,
"token_acc": 0.9409148665819568
},
{
"epoch": 0.29240506329113924,
"grad_norm": 0.65234375,
"learning_rate": 9.596593928679354e-07,
"loss": 0.2754872441291809,
"step": 1386,
"token_acc": 0.927317523868186
},
{
"epoch": 0.2926160337552743,
"grad_norm": 0.703125,
"learning_rate": 9.595928294763324e-07,
"loss": 0.2459161877632141,
"step": 1387,
"token_acc": 0.9298745724059293
},
{
"epoch": 0.29282700421940927,
"grad_norm": 0.74609375,
"learning_rate": 9.595262135263996e-07,
"loss": 0.23659512400627136,
"step": 1388,
"token_acc": 0.9361702127659575
},
{
"epoch": 0.2930379746835443,
"grad_norm": 0.8125,
"learning_rate": 9.594595450257549e-07,
"loss": 0.3071826100349426,
"step": 1389,
"token_acc": 0.9145141451414515
},
{
"epoch": 0.29324894514767935,
"grad_norm": 0.73046875,
"learning_rate": 9.593928239820225e-07,
"loss": 0.23544950783252716,
"step": 1390,
"token_acc": 0.9367909238249594
},
{
"epoch": 0.29345991561181434,
"grad_norm": 0.828125,
"learning_rate": 9.59326050402832e-07,
"loss": 0.25213587284088135,
"step": 1391,
"token_acc": 0.931865516215412
},
{
"epoch": 0.2936708860759494,
"grad_norm": 0.80078125,
"learning_rate": 9.5925922429582e-07,
"loss": 0.304402619600296,
"step": 1392,
"token_acc": 0.9191438763376932
},
{
"epoch": 0.29388185654008436,
"grad_norm": 0.7109375,
"learning_rate": 9.59192345668629e-07,
"loss": 0.24235710501670837,
"step": 1393,
"token_acc": 0.931580519868539
},
{
"epoch": 0.2940928270042194,
"grad_norm": 0.56640625,
"learning_rate": 9.591254145289066e-07,
"loss": 0.26642921566963196,
"step": 1394,
"token_acc": 0.9295583852614467
},
{
"epoch": 0.29430379746835444,
"grad_norm": 0.6875,
"learning_rate": 9.590584308843067e-07,
"loss": 0.25638318061828613,
"step": 1395,
"token_acc": 0.9266789895255699
},
{
"epoch": 0.29451476793248943,
"grad_norm": 0.72265625,
"learning_rate": 9.5899139474249e-07,
"loss": 0.27632462978363037,
"step": 1396,
"token_acc": 0.9211767971135165
},
{
"epoch": 0.29472573839662447,
"grad_norm": 0.70703125,
"learning_rate": 9.589243061111224e-07,
"loss": 0.2598225772380829,
"step": 1397,
"token_acc": 0.9285505978419364
},
{
"epoch": 0.2949367088607595,
"grad_norm": 0.77734375,
"learning_rate": 9.58857164997876e-07,
"loss": 0.2595940828323364,
"step": 1398,
"token_acc": 0.9247278382581648
},
{
"epoch": 0.2951476793248945,
"grad_norm": 0.73828125,
"learning_rate": 9.587899714104294e-07,
"loss": 0.25946277379989624,
"step": 1399,
"token_acc": 0.9253688989784336
},
{
"epoch": 0.29535864978902954,
"grad_norm": 0.625,
"learning_rate": 9.58722725356466e-07,
"loss": 0.20068538188934326,
"step": 1400,
"token_acc": 0.93887460725507
},
{
"epoch": 0.29535864978902954,
"eval_loss": 0.43371880054473877,
"eval_runtime": 246.0163,
"eval_samples_per_second": 137.003,
"eval_steps_per_second": 2.142,
"eval_token_acc": 0.8990397559005654,
"step": 1400
},
{
"epoch": 0.2955696202531646,
"grad_norm": 0.609375,
"learning_rate": 9.586554268436765e-07,
"loss": 0.1949131339788437,
"step": 1401,
"token_acc": 0.9440918706557873
},
{
"epoch": 0.29578059071729956,
"grad_norm": 0.671875,
"learning_rate": 9.585880758797569e-07,
"loss": 0.2542431950569153,
"step": 1402,
"token_acc": 0.9296116504854369
},
{
"epoch": 0.2959915611814346,
"grad_norm": 0.84375,
"learning_rate": 9.585206724724094e-07,
"loss": 0.2671182155609131,
"step": 1403,
"token_acc": 0.9345066358915176
},
{
"epoch": 0.29620253164556964,
"grad_norm": 1.46875,
"learning_rate": 9.584532166293422e-07,
"loss": 0.23155038058757782,
"step": 1404,
"token_acc": 0.9271665642286416
},
{
"epoch": 0.29641350210970463,
"grad_norm": 0.80859375,
"learning_rate": 9.583857083582691e-07,
"loss": 0.3051491379737854,
"step": 1405,
"token_acc": 0.9181929181929182
},
{
"epoch": 0.29662447257383967,
"grad_norm": 0.69921875,
"learning_rate": 9.583181476669108e-07,
"loss": 0.2135426104068756,
"step": 1406,
"token_acc": 0.9333539987600744
},
{
"epoch": 0.2968354430379747,
"grad_norm": 0.71875,
"learning_rate": 9.58250534562993e-07,
"loss": 0.24095328152179718,
"step": 1407,
"token_acc": 0.9283995186522263
},
{
"epoch": 0.2970464135021097,
"grad_norm": 1.125,
"learning_rate": 9.58182869054248e-07,
"loss": 0.2426932454109192,
"step": 1408,
"token_acc": 0.928467998841587
},
{
"epoch": 0.29725738396624474,
"grad_norm": 0.93359375,
"learning_rate": 9.581151511484137e-07,
"loss": 0.2751479148864746,
"step": 1409,
"token_acc": 0.9293640054127199
},
{
"epoch": 0.2974683544303797,
"grad_norm": 1.921875,
"learning_rate": 9.580473808532348e-07,
"loss": 0.2397601157426834,
"step": 1410,
"token_acc": 0.9367764915405165
},
{
"epoch": 0.29767932489451476,
"grad_norm": 0.90625,
"learning_rate": 9.579795581764606e-07,
"loss": 0.24342301487922668,
"step": 1411,
"token_acc": 0.9319277108433734
},
{
"epoch": 0.2978902953586498,
"grad_norm": 0.6640625,
"learning_rate": 9.579116831258478e-07,
"loss": 0.24738198518753052,
"step": 1412,
"token_acc": 0.9260881467287161
},
{
"epoch": 0.2981012658227848,
"grad_norm": 0.7890625,
"learning_rate": 9.578437557091586e-07,
"loss": 0.30160653591156006,
"step": 1413,
"token_acc": 0.9232409381663113
},
{
"epoch": 0.29831223628691983,
"grad_norm": 0.66015625,
"learning_rate": 9.577757759341603e-07,
"loss": 0.25977736711502075,
"step": 1414,
"token_acc": 0.9217270194986072
},
{
"epoch": 0.29852320675105487,
"grad_norm": 0.67578125,
"learning_rate": 9.577077438086276e-07,
"loss": 0.2611219882965088,
"step": 1415,
"token_acc": 0.9298686784017882
},
{
"epoch": 0.29873417721518986,
"grad_norm": 0.7890625,
"learning_rate": 9.576396593403405e-07,
"loss": 0.3144347369670868,
"step": 1416,
"token_acc": 0.9093191088798243
},
{
"epoch": 0.2989451476793249,
"grad_norm": 0.68359375,
"learning_rate": 9.57571522537085e-07,
"loss": 0.24973896145820618,
"step": 1417,
"token_acc": 0.9304597701149425
},
{
"epoch": 0.29915611814345994,
"grad_norm": 0.9296875,
"learning_rate": 9.575033334066527e-07,
"loss": 0.2714025378227234,
"step": 1418,
"token_acc": 0.9243498817966903
},
{
"epoch": 0.2993670886075949,
"grad_norm": 0.71875,
"learning_rate": 9.574350919568421e-07,
"loss": 0.24550315737724304,
"step": 1419,
"token_acc": 0.9313218390804597
},
{
"epoch": 0.29957805907172996,
"grad_norm": 0.71484375,
"learning_rate": 9.573667981954573e-07,
"loss": 0.24378708004951477,
"step": 1420,
"token_acc": 0.9386270167668459
},
{
"epoch": 0.299789029535865,
"grad_norm": 0.734375,
"learning_rate": 9.572984521303076e-07,
"loss": 0.25155583024024963,
"step": 1421,
"token_acc": 0.9310128566948886
},
{
"epoch": 0.3,
"grad_norm": 0.82421875,
"learning_rate": 9.572300537692094e-07,
"loss": 0.27014413475990295,
"step": 1422,
"token_acc": 0.9274395329441201
},
{
"epoch": 0.30021097046413503,
"grad_norm": 0.66015625,
"learning_rate": 9.57161603119985e-07,
"loss": 0.259027898311615,
"step": 1423,
"token_acc": 0.9269093895428415
},
{
"epoch": 0.30042194092827,
"grad_norm": 0.96484375,
"learning_rate": 9.570931001904616e-07,
"loss": 0.30663132667541504,
"step": 1424,
"token_acc": 0.921604938271605
},
{
"epoch": 0.30063291139240506,
"grad_norm": 0.7578125,
"learning_rate": 9.570245449884733e-07,
"loss": 0.2871406376361847,
"step": 1425,
"token_acc": 0.9261591299370349
},
{
"epoch": 0.3008438818565401,
"grad_norm": 0.6640625,
"learning_rate": 9.5695593752186e-07,
"loss": 0.26376545429229736,
"step": 1426,
"token_acc": 0.9314728682170542
},
{
"epoch": 0.3010548523206751,
"grad_norm": 0.734375,
"learning_rate": 9.568872777984681e-07,
"loss": 0.24171243607997894,
"step": 1427,
"token_acc": 0.9354838709677419
},
{
"epoch": 0.3012658227848101,
"grad_norm": 0.90234375,
"learning_rate": 9.568185658261485e-07,
"loss": 0.29318904876708984,
"step": 1428,
"token_acc": 0.9312581063553826
},
{
"epoch": 0.30147679324894516,
"grad_norm": 0.703125,
"learning_rate": 9.567498016127595e-07,
"loss": 0.21925503015518188,
"step": 1429,
"token_acc": 0.9372488408037094
},
{
"epoch": 0.30168776371308015,
"grad_norm": 0.796875,
"learning_rate": 9.566809851661648e-07,
"loss": 0.28585126996040344,
"step": 1430,
"token_acc": 0.9222972972972973
},
{
"epoch": 0.3018987341772152,
"grad_norm": 0.65625,
"learning_rate": 9.56612116494234e-07,
"loss": 0.22612613439559937,
"step": 1431,
"token_acc": 0.93677303906949
},
{
"epoch": 0.30210970464135023,
"grad_norm": 0.73828125,
"learning_rate": 9.56543195604843e-07,
"loss": 0.26892632246017456,
"step": 1432,
"token_acc": 0.9255504055619931
},
{
"epoch": 0.3023206751054852,
"grad_norm": 0.796875,
"learning_rate": 9.564742225058734e-07,
"loss": 0.27015364170074463,
"step": 1433,
"token_acc": 0.9248231132075472
},
{
"epoch": 0.30253164556962026,
"grad_norm": 0.7421875,
"learning_rate": 9.564051972052132e-07,
"loss": 0.24233023822307587,
"step": 1434,
"token_acc": 0.9338118022328549
},
{
"epoch": 0.3027426160337553,
"grad_norm": 0.76953125,
"learning_rate": 9.563361197107554e-07,
"loss": 0.2551451623439789,
"step": 1435,
"token_acc": 0.9307293256026854
},
{
"epoch": 0.3029535864978903,
"grad_norm": 0.71484375,
"learning_rate": 9.562669900304002e-07,
"loss": 0.2726331949234009,
"step": 1436,
"token_acc": 0.9276739197057922
},
{
"epoch": 0.3031645569620253,
"grad_norm": 0.703125,
"learning_rate": 9.561978081720524e-07,
"loss": 0.24130704998970032,
"step": 1437,
"token_acc": 0.9269317814419658
},
{
"epoch": 0.30337552742616036,
"grad_norm": 2.0,
"learning_rate": 9.561285741436245e-07,
"loss": 0.2952617108821869,
"step": 1438,
"token_acc": 0.9171339563862928
},
{
"epoch": 0.30358649789029535,
"grad_norm": 0.71484375,
"learning_rate": 9.560592879530333e-07,
"loss": 0.2580583691596985,
"step": 1439,
"token_acc": 0.9276932084309133
},
{
"epoch": 0.3037974683544304,
"grad_norm": 1.1640625,
"learning_rate": 9.559899496082024e-07,
"loss": 0.2909284234046936,
"step": 1440,
"token_acc": 0.9199145755472504
},
{
"epoch": 0.3040084388185654,
"grad_norm": 0.66015625,
"learning_rate": 9.559205591170614e-07,
"loss": 0.25040456652641296,
"step": 1441,
"token_acc": 0.9311377245508982
},
{
"epoch": 0.3042194092827004,
"grad_norm": 0.70703125,
"learning_rate": 9.558511164875455e-07,
"loss": 0.23942893743515015,
"step": 1442,
"token_acc": 0.93700356066831
},
{
"epoch": 0.30443037974683546,
"grad_norm": 0.69921875,
"learning_rate": 9.557816217275962e-07,
"loss": 0.24013203382492065,
"step": 1443,
"token_acc": 0.9334166927164739
},
{
"epoch": 0.30464135021097044,
"grad_norm": 0.7109375,
"learning_rate": 9.557120748451608e-07,
"loss": 0.2695329189300537,
"step": 1444,
"token_acc": 0.9281867145421903
},
{
"epoch": 0.3048523206751055,
"grad_norm": 0.73828125,
"learning_rate": 9.556424758481926e-07,
"loss": 0.28713494539260864,
"step": 1445,
"token_acc": 0.9243258915627718
},
{
"epoch": 0.3050632911392405,
"grad_norm": 0.65625,
"learning_rate": 9.555728247446505e-07,
"loss": 0.2532760500907898,
"step": 1446,
"token_acc": 0.9253039953676896
},
{
"epoch": 0.3052742616033755,
"grad_norm": 0.94140625,
"learning_rate": 9.555031215425e-07,
"loss": 0.25795722007751465,
"step": 1447,
"token_acc": 0.924613987284287
},
{
"epoch": 0.30548523206751055,
"grad_norm": 1.0390625,
"learning_rate": 9.554333662497122e-07,
"loss": 0.24081915616989136,
"step": 1448,
"token_acc": 0.925282098200671
},
{
"epoch": 0.3056962025316456,
"grad_norm": 0.6015625,
"learning_rate": 9.553635588742644e-07,
"loss": 0.2439633309841156,
"step": 1449,
"token_acc": 0.929632999696694
},
{
"epoch": 0.3059071729957806,
"grad_norm": 0.8515625,
"learning_rate": 9.552936994241394e-07,
"loss": 0.2797650694847107,
"step": 1450,
"token_acc": 0.9248277927523211
},
{
"epoch": 0.3061181434599156,
"grad_norm": 1.03125,
"learning_rate": 9.552237879073262e-07,
"loss": 0.23650625348091125,
"step": 1451,
"token_acc": 0.927787406123628
},
{
"epoch": 0.30632911392405066,
"grad_norm": 0.9921875,
"learning_rate": 9.5515382433182e-07,
"loss": 0.23584780097007751,
"step": 1452,
"token_acc": 0.9325115562403699
},
{
"epoch": 0.30654008438818564,
"grad_norm": 0.66796875,
"learning_rate": 9.550838087056215e-07,
"loss": 0.25513389706611633,
"step": 1453,
"token_acc": 0.9239160839160839
},
{
"epoch": 0.3067510548523207,
"grad_norm": 0.6953125,
"learning_rate": 9.550137410367379e-07,
"loss": 0.2748698592185974,
"step": 1454,
"token_acc": 0.9194828092859242
},
{
"epoch": 0.3069620253164557,
"grad_norm": 0.6640625,
"learning_rate": 9.549436213331814e-07,
"loss": 0.2806450426578522,
"step": 1455,
"token_acc": 0.9268160950580611
},
{
"epoch": 0.3071729957805907,
"grad_norm": 0.6171875,
"learning_rate": 9.548734496029715e-07,
"loss": 0.24432675540447235,
"step": 1456,
"token_acc": 0.9289112790372236
},
{
"epoch": 0.30738396624472575,
"grad_norm": 0.62109375,
"learning_rate": 9.548032258541325e-07,
"loss": 0.2699507772922516,
"step": 1457,
"token_acc": 0.9230322393261691
},
{
"epoch": 0.30759493670886073,
"grad_norm": 0.66015625,
"learning_rate": 9.547329500946951e-07,
"loss": 0.266490638256073,
"step": 1458,
"token_acc": 0.926647564469914
},
{
"epoch": 0.3078059071729958,
"grad_norm": 0.6875,
"learning_rate": 9.546626223326964e-07,
"loss": 0.2475104182958603,
"step": 1459,
"token_acc": 0.9266259032795998
},
{
"epoch": 0.3080168776371308,
"grad_norm": 0.87109375,
"learning_rate": 9.545922425761782e-07,
"loss": 0.27554529905319214,
"step": 1460,
"token_acc": 0.9233644859813084
},
{
"epoch": 0.3082278481012658,
"grad_norm": 0.7421875,
"learning_rate": 9.545218108331895e-07,
"loss": 0.20847618579864502,
"step": 1461,
"token_acc": 0.9386682242990654
},
{
"epoch": 0.30843881856540084,
"grad_norm": 0.72265625,
"learning_rate": 9.54451327111785e-07,
"loss": 0.2351740300655365,
"step": 1462,
"token_acc": 0.9326732673267327
},
{
"epoch": 0.3086497890295359,
"grad_norm": 0.65234375,
"learning_rate": 9.543807914200244e-07,
"loss": 0.26930472254753113,
"step": 1463,
"token_acc": 0.9294723294723295
},
{
"epoch": 0.30886075949367087,
"grad_norm": 3.5,
"learning_rate": 9.543102037659746e-07,
"loss": 0.26640209555625916,
"step": 1464,
"token_acc": 0.9225462527438069
},
{
"epoch": 0.3090717299578059,
"grad_norm": 0.69921875,
"learning_rate": 9.542395641577079e-07,
"loss": 0.2432868480682373,
"step": 1465,
"token_acc": 0.930205618302925
},
{
"epoch": 0.30928270042194095,
"grad_norm": 0.69140625,
"learning_rate": 9.541688726033022e-07,
"loss": 0.21304035186767578,
"step": 1466,
"token_acc": 0.9346386409444285
},
{
"epoch": 0.30949367088607593,
"grad_norm": 0.55859375,
"learning_rate": 9.540981291108419e-07,
"loss": 0.23767802119255066,
"step": 1467,
"token_acc": 0.9310522253666066
},
{
"epoch": 0.309704641350211,
"grad_norm": 0.6484375,
"learning_rate": 9.54027333688417e-07,
"loss": 0.24264025688171387,
"step": 1468,
"token_acc": 0.9302891110510673
},
{
"epoch": 0.309915611814346,
"grad_norm": 0.734375,
"learning_rate": 9.539564863441239e-07,
"loss": 0.24002739787101746,
"step": 1469,
"token_acc": 0.9333333333333333
},
{
"epoch": 0.310126582278481,
"grad_norm": 0.8125,
"learning_rate": 9.53885587086064e-07,
"loss": 0.2675400376319885,
"step": 1470,
"token_acc": 0.9192083062946139
},
{
"epoch": 0.31033755274261604,
"grad_norm": 0.62109375,
"learning_rate": 9.538146359223457e-07,
"loss": 0.23460690677165985,
"step": 1471,
"token_acc": 0.9331210191082803
},
{
"epoch": 0.3105485232067511,
"grad_norm": 0.72265625,
"learning_rate": 9.537436328610829e-07,
"loss": 0.26020288467407227,
"step": 1472,
"token_acc": 0.9312090530077427
},
{
"epoch": 0.31075949367088607,
"grad_norm": 0.65625,
"learning_rate": 9.536725779103952e-07,
"loss": 0.2224973738193512,
"step": 1473,
"token_acc": 0.9334952930458549
},
{
"epoch": 0.3109704641350211,
"grad_norm": 0.6484375,
"learning_rate": 9.536014710784082e-07,
"loss": 0.23821187019348145,
"step": 1474,
"token_acc": 0.9365303244005642
},
{
"epoch": 0.3111814345991561,
"grad_norm": 0.74609375,
"learning_rate": 9.535303123732537e-07,
"loss": 0.24408169090747833,
"step": 1475,
"token_acc": 0.9283845650752126
},
{
"epoch": 0.31139240506329113,
"grad_norm": 0.9921875,
"learning_rate": 9.534591018030693e-07,
"loss": 0.2836637794971466,
"step": 1476,
"token_acc": 0.9262971698113207
},
{
"epoch": 0.3116033755274262,
"grad_norm": 0.7578125,
"learning_rate": 9.533878393759988e-07,
"loss": 0.21933302283287048,
"step": 1477,
"token_acc": 0.9352976913730255
},
{
"epoch": 0.31181434599156116,
"grad_norm": 0.65234375,
"learning_rate": 9.533165251001912e-07,
"loss": 0.2876891493797302,
"step": 1478,
"token_acc": 0.9213543055185284
},
{
"epoch": 0.3120253164556962,
"grad_norm": 1.0390625,
"learning_rate": 9.532451589838022e-07,
"loss": 0.22867098450660706,
"step": 1479,
"token_acc": 0.9380315917375456
},
{
"epoch": 0.31223628691983124,
"grad_norm": 0.72265625,
"learning_rate": 9.53173741034993e-07,
"loss": 0.20845067501068115,
"step": 1480,
"token_acc": 0.9393183707398172
},
{
"epoch": 0.3124472573839662,
"grad_norm": 0.65234375,
"learning_rate": 9.53102271261931e-07,
"loss": 0.22937451303005219,
"step": 1481,
"token_acc": 0.9368231046931408
},
{
"epoch": 0.31265822784810127,
"grad_norm": 0.625,
"learning_rate": 9.530307496727891e-07,
"loss": 0.24746280908584595,
"step": 1482,
"token_acc": 0.9314791403286978
},
{
"epoch": 0.3128691983122363,
"grad_norm": 0.6796875,
"learning_rate": 9.529591762757468e-07,
"loss": 0.2423558235168457,
"step": 1483,
"token_acc": 0.9336126329358685
},
{
"epoch": 0.3130801687763713,
"grad_norm": 0.7890625,
"learning_rate": 9.528875510789885e-07,
"loss": 0.23221366107463837,
"step": 1484,
"token_acc": 0.9326707277973709
},
{
"epoch": 0.31329113924050633,
"grad_norm": 0.7109375,
"learning_rate": 9.528158740907058e-07,
"loss": 0.2552267014980316,
"step": 1485,
"token_acc": 0.9284696494727843
},
{
"epoch": 0.3135021097046414,
"grad_norm": 0.63671875,
"learning_rate": 9.527441453190951e-07,
"loss": 0.22586306929588318,
"step": 1486,
"token_acc": 0.9339469409853817
},
{
"epoch": 0.31371308016877636,
"grad_norm": 0.74609375,
"learning_rate": 9.526723647723596e-07,
"loss": 0.2447504699230194,
"step": 1487,
"token_acc": 0.927639751552795
},
{
"epoch": 0.3139240506329114,
"grad_norm": 0.66796875,
"learning_rate": 9.526005324587076e-07,
"loss": 0.27702945470809937,
"step": 1488,
"token_acc": 0.9251620506776664
},
{
"epoch": 0.31413502109704644,
"grad_norm": 0.94921875,
"learning_rate": 9.525286483863542e-07,
"loss": 0.29128116369247437,
"step": 1489,
"token_acc": 0.916892502258356
},
{
"epoch": 0.3143459915611814,
"grad_norm": 0.73046875,
"learning_rate": 9.524567125635195e-07,
"loss": 0.2566947042942047,
"step": 1490,
"token_acc": 0.9236556143772638
},
{
"epoch": 0.31455696202531647,
"grad_norm": 0.8671875,
"learning_rate": 9.523847249984303e-07,
"loss": 0.23654621839523315,
"step": 1491,
"token_acc": 0.9337433603578418
},
{
"epoch": 0.31476793248945145,
"grad_norm": 0.66015625,
"learning_rate": 9.523126856993187e-07,
"loss": 0.22157318890094757,
"step": 1492,
"token_acc": 0.9311002178649237
},
{
"epoch": 0.3149789029535865,
"grad_norm": 0.6875,
"learning_rate": 9.52240594674423e-07,
"loss": 0.24354051053524017,
"step": 1493,
"token_acc": 0.9276387377584331
},
{
"epoch": 0.31518987341772153,
"grad_norm": 0.75,
"learning_rate": 9.521684519319878e-07,
"loss": 0.2520773410797119,
"step": 1494,
"token_acc": 0.9277808522412839
},
{
"epoch": 0.3154008438818565,
"grad_norm": 0.69140625,
"learning_rate": 9.520962574802628e-07,
"loss": 0.2067497968673706,
"step": 1495,
"token_acc": 0.9425287356321839
},
{
"epoch": 0.31561181434599156,
"grad_norm": 0.66015625,
"learning_rate": 9.520240113275046e-07,
"loss": 0.24253101646900177,
"step": 1496,
"token_acc": 0.9343544857768052
},
{
"epoch": 0.3158227848101266,
"grad_norm": 0.71875,
"learning_rate": 9.519517134819746e-07,
"loss": 0.23171034455299377,
"step": 1497,
"token_acc": 0.934695244474213
},
{
"epoch": 0.3160337552742616,
"grad_norm": 0.65234375,
"learning_rate": 9.518793639519408e-07,
"loss": 0.20974554121494293,
"step": 1498,
"token_acc": 0.9291677888499865
},
{
"epoch": 0.3162447257383966,
"grad_norm": 0.82421875,
"learning_rate": 9.518069627456771e-07,
"loss": 0.26031211018562317,
"step": 1499,
"token_acc": 0.932449105490438
},
{
"epoch": 0.31645569620253167,
"grad_norm": 0.81640625,
"learning_rate": 9.517345098714631e-07,
"loss": 0.2840519845485687,
"step": 1500,
"token_acc": 0.9211150652431791
},
{
"epoch": 0.31666666666666665,
"grad_norm": 1.4609375,
"learning_rate": 9.516620053375845e-07,
"loss": 0.24605998396873474,
"step": 1501,
"token_acc": 0.9327284105131415
},
{
"epoch": 0.3168776371308017,
"grad_norm": 0.7578125,
"learning_rate": 9.515894491523328e-07,
"loss": 0.2669103443622589,
"step": 1502,
"token_acc": 0.9236850106415324
},
{
"epoch": 0.31708860759493673,
"grad_norm": 0.80078125,
"learning_rate": 9.515168413240054e-07,
"loss": 0.33311209082603455,
"step": 1503,
"token_acc": 0.9080691642651297
},
{
"epoch": 0.3172995780590717,
"grad_norm": 0.76953125,
"learning_rate": 9.514441818609055e-07,
"loss": 0.24630479514598846,
"step": 1504,
"token_acc": 0.9333128457283344
},
{
"epoch": 0.31751054852320676,
"grad_norm": 0.6875,
"learning_rate": 9.513714707713424e-07,
"loss": 0.2654748857021332,
"step": 1505,
"token_acc": 0.9272495213784301
},
{
"epoch": 0.31772151898734174,
"grad_norm": 0.671875,
"learning_rate": 9.512987080636312e-07,
"loss": 0.27504950761795044,
"step": 1506,
"token_acc": 0.9268217054263566
},
{
"epoch": 0.3179324894514768,
"grad_norm": 0.6484375,
"learning_rate": 9.512258937460931e-07,
"loss": 0.23799118399620056,
"step": 1507,
"token_acc": 0.9329538266919671
},
{
"epoch": 0.3181434599156118,
"grad_norm": 0.6640625,
"learning_rate": 9.511530278270548e-07,
"loss": 0.24022099375724792,
"step": 1508,
"token_acc": 0.9324768449357633
},
{
"epoch": 0.3183544303797468,
"grad_norm": 0.7734375,
"learning_rate": 9.510801103148494e-07,
"loss": 0.2613435983657837,
"step": 1509,
"token_acc": 0.9316982303632412
},
{
"epoch": 0.31856540084388185,
"grad_norm": 0.609375,
"learning_rate": 9.510071412178153e-07,
"loss": 0.24355687201023102,
"step": 1510,
"token_acc": 0.9279303780255643
},
{
"epoch": 0.3187763713080169,
"grad_norm": 0.76171875,
"learning_rate": 9.509341205442973e-07,
"loss": 0.2355407327413559,
"step": 1511,
"token_acc": 0.9361508057160232
},
{
"epoch": 0.3189873417721519,
"grad_norm": 0.65234375,
"learning_rate": 9.508610483026461e-07,
"loss": 0.2515805661678314,
"step": 1512,
"token_acc": 0.9299863387978142
},
{
"epoch": 0.3191983122362869,
"grad_norm": 0.8671875,
"learning_rate": 9.507879245012178e-07,
"loss": 0.2919672131538391,
"step": 1513,
"token_acc": 0.9186514073615837
},
{
"epoch": 0.31940928270042196,
"grad_norm": 0.62890625,
"learning_rate": 9.507147491483749e-07,
"loss": 0.249205082654953,
"step": 1514,
"token_acc": 0.9292244175759363
},
{
"epoch": 0.31962025316455694,
"grad_norm": 0.6796875,
"learning_rate": 9.506415222524857e-07,
"loss": 0.24349896609783173,
"step": 1515,
"token_acc": 0.9286709389802174
},
{
"epoch": 0.319831223628692,
"grad_norm": 0.65625,
"learning_rate": 9.505682438219242e-07,
"loss": 0.2532562017440796,
"step": 1516,
"token_acc": 0.9333507579717721
},
{
"epoch": 0.320042194092827,
"grad_norm": 0.703125,
"learning_rate": 9.504949138650705e-07,
"loss": 0.2774209976196289,
"step": 1517,
"token_acc": 0.9235361000568505
},
{
"epoch": 0.320253164556962,
"grad_norm": 0.703125,
"learning_rate": 9.504215323903105e-07,
"loss": 0.2720155119895935,
"step": 1518,
"token_acc": 0.9254856480139171
},
{
"epoch": 0.32046413502109705,
"grad_norm": 0.67578125,
"learning_rate": 9.503480994060357e-07,
"loss": 0.2988170087337494,
"step": 1519,
"token_acc": 0.9205357142857142
},
{
"epoch": 0.3206751054852321,
"grad_norm": 0.89453125,
"learning_rate": 9.502746149206442e-07,
"loss": 0.21745893359184265,
"step": 1520,
"token_acc": 0.933983286908078
},
{
"epoch": 0.3208860759493671,
"grad_norm": 0.67578125,
"learning_rate": 9.502010789425393e-07,
"loss": 0.24026605486869812,
"step": 1521,
"token_acc": 0.9371310507674144
},
{
"epoch": 0.3210970464135021,
"grad_norm": 0.84765625,
"learning_rate": 9.501274914801306e-07,
"loss": 0.3082854747772217,
"step": 1522,
"token_acc": 0.9195114312558722
},
{
"epoch": 0.3213080168776371,
"grad_norm": 0.74609375,
"learning_rate": 9.500538525418333e-07,
"loss": 0.26113343238830566,
"step": 1523,
"token_acc": 0.9287790697674418
},
{
"epoch": 0.32151898734177214,
"grad_norm": 0.625,
"learning_rate": 9.49980162136069e-07,
"loss": 0.23083993792533875,
"step": 1524,
"token_acc": 0.9333127508490274
},
{
"epoch": 0.3217299578059072,
"grad_norm": 1.0546875,
"learning_rate": 9.499064202712643e-07,
"loss": 0.26131874322891235,
"step": 1525,
"token_acc": 0.9319688671086769
},
{
"epoch": 0.32194092827004217,
"grad_norm": 0.6328125,
"learning_rate": 9.498326269558525e-07,
"loss": 0.23430953919887543,
"step": 1526,
"token_acc": 0.9324556382369776
},
{
"epoch": 0.3221518987341772,
"grad_norm": 0.76953125,
"learning_rate": 9.497587821982727e-07,
"loss": 0.2851739525794983,
"step": 1527,
"token_acc": 0.9238906846899795
},
{
"epoch": 0.32236286919831225,
"grad_norm": 0.69921875,
"learning_rate": 9.496848860069691e-07,
"loss": 0.22896860539913177,
"step": 1528,
"token_acc": 0.9341647331786543
},
{
"epoch": 0.32257383966244724,
"grad_norm": 0.86328125,
"learning_rate": 9.496109383903929e-07,
"loss": 0.2737294137477875,
"step": 1529,
"token_acc": 0.9315191387559809
},
{
"epoch": 0.3227848101265823,
"grad_norm": 0.62890625,
"learning_rate": 9.495369393570003e-07,
"loss": 0.2521006464958191,
"step": 1530,
"token_acc": 0.9293909973521624
},
{
"epoch": 0.3229957805907173,
"grad_norm": 0.671875,
"learning_rate": 9.494628889152539e-07,
"loss": 0.23891186714172363,
"step": 1531,
"token_acc": 0.9335803876852907
},
{
"epoch": 0.3232067510548523,
"grad_norm": 0.6796875,
"learning_rate": 9.493887870736218e-07,
"loss": 0.23902195692062378,
"step": 1532,
"token_acc": 0.9280039721946375
},
{
"epoch": 0.32341772151898734,
"grad_norm": 0.78125,
"learning_rate": 9.493146338405784e-07,
"loss": 0.29924818873405457,
"step": 1533,
"token_acc": 0.9268981089472199
},
{
"epoch": 0.3236286919831224,
"grad_norm": 0.78515625,
"learning_rate": 9.492404292246037e-07,
"loss": 0.230657160282135,
"step": 1534,
"token_acc": 0.9317676927959379
},
{
"epoch": 0.32383966244725737,
"grad_norm": 0.78515625,
"learning_rate": 9.491661732341836e-07,
"loss": 0.28479820489883423,
"step": 1535,
"token_acc": 0.924031007751938
},
{
"epoch": 0.3240506329113924,
"grad_norm": 0.7109375,
"learning_rate": 9.490918658778098e-07,
"loss": 0.25515246391296387,
"step": 1536,
"token_acc": 0.9273451870018393
},
{
"epoch": 0.32426160337552745,
"grad_norm": 0.73828125,
"learning_rate": 9.4901750716398e-07,
"loss": 0.2776564359664917,
"step": 1537,
"token_acc": 0.9246202350243623
},
{
"epoch": 0.32447257383966244,
"grad_norm": 0.6484375,
"learning_rate": 9.489430971011978e-07,
"loss": 0.23764348030090332,
"step": 1538,
"token_acc": 0.9334790755777639
},
{
"epoch": 0.3246835443037975,
"grad_norm": 1.203125,
"learning_rate": 9.488686356979727e-07,
"loss": 0.2838374972343445,
"step": 1539,
"token_acc": 0.9257334963325183
},
{
"epoch": 0.32489451476793246,
"grad_norm": 0.90625,
"learning_rate": 9.487941229628199e-07,
"loss": 0.3395375907421112,
"step": 1540,
"token_acc": 0.9116038882138517
},
{
"epoch": 0.3251054852320675,
"grad_norm": 0.796875,
"learning_rate": 9.487195589042606e-07,
"loss": 0.2461426556110382,
"step": 1541,
"token_acc": 0.9316965690903368
},
{
"epoch": 0.32531645569620254,
"grad_norm": 0.62109375,
"learning_rate": 9.486449435308218e-07,
"loss": 0.22606943547725677,
"step": 1542,
"token_acc": 0.9363494539781592
},
{
"epoch": 0.32552742616033753,
"grad_norm": 0.62890625,
"learning_rate": 9.485702768510364e-07,
"loss": 0.2177889347076416,
"step": 1543,
"token_acc": 0.9346268656716418
},
{
"epoch": 0.32573839662447257,
"grad_norm": 0.82421875,
"learning_rate": 9.484955588734431e-07,
"loss": 0.2693382501602173,
"step": 1544,
"token_acc": 0.9304897314375987
},
{
"epoch": 0.3259493670886076,
"grad_norm": 0.578125,
"learning_rate": 9.484207896065868e-07,
"loss": 0.24160149693489075,
"step": 1545,
"token_acc": 0.936786410470621
},
{
"epoch": 0.3261603375527426,
"grad_norm": 0.68359375,
"learning_rate": 9.483459690590176e-07,
"loss": 0.2740339934825897,
"step": 1546,
"token_acc": 0.9267431597528685
},
{
"epoch": 0.32637130801687764,
"grad_norm": 1.21875,
"learning_rate": 9.482710972392922e-07,
"loss": 0.22905594110488892,
"step": 1547,
"token_acc": 0.9407459044963402
},
{
"epoch": 0.3265822784810127,
"grad_norm": 0.75,
"learning_rate": 9.481961741559725e-07,
"loss": 0.27816396951675415,
"step": 1548,
"token_acc": 0.9170243204577968
},
{
"epoch": 0.32679324894514766,
"grad_norm": 0.765625,
"learning_rate": 9.48121199817627e-07,
"loss": 0.2953186631202698,
"step": 1549,
"token_acc": 0.9229046705054382
},
{
"epoch": 0.3270042194092827,
"grad_norm": 0.640625,
"learning_rate": 9.480461742328294e-07,
"loss": 0.21503537893295288,
"step": 1550,
"token_acc": 0.9387460271597804
},
{
"epoch": 0.32721518987341774,
"grad_norm": 0.703125,
"learning_rate": 9.479710974101594e-07,
"loss": 0.2728448808193207,
"step": 1551,
"token_acc": 0.9233208417411358
},
{
"epoch": 0.32742616033755273,
"grad_norm": 0.65234375,
"learning_rate": 9.47895969358203e-07,
"loss": 0.25554656982421875,
"step": 1552,
"token_acc": 0.9293216009083167
},
{
"epoch": 0.32763713080168777,
"grad_norm": 0.75,
"learning_rate": 9.478207900855515e-07,
"loss": 0.23449862003326416,
"step": 1553,
"token_acc": 0.939615736505032
},
{
"epoch": 0.3278481012658228,
"grad_norm": 0.75390625,
"learning_rate": 9.477455596008022e-07,
"loss": 0.28548648953437805,
"step": 1554,
"token_acc": 0.9239766081871345
},
{
"epoch": 0.3280590717299578,
"grad_norm": 0.7421875,
"learning_rate": 9.476702779125585e-07,
"loss": 0.27637791633605957,
"step": 1555,
"token_acc": 0.9250645994832042
},
{
"epoch": 0.32827004219409284,
"grad_norm": 0.69921875,
"learning_rate": 9.475949450294297e-07,
"loss": 0.2447734922170639,
"step": 1556,
"token_acc": 0.9353186039065001
},
{
"epoch": 0.3284810126582278,
"grad_norm": 0.66796875,
"learning_rate": 9.475195609600303e-07,
"loss": 0.27998554706573486,
"step": 1557,
"token_acc": 0.9227260531258384
},
{
"epoch": 0.32869198312236286,
"grad_norm": 0.6015625,
"learning_rate": 9.474441257129813e-07,
"loss": 0.2545619606971741,
"step": 1558,
"token_acc": 0.9234523503724634
},
{
"epoch": 0.3289029535864979,
"grad_norm": 0.76953125,
"learning_rate": 9.473686392969096e-07,
"loss": 0.25500959157943726,
"step": 1559,
"token_acc": 0.9239380022962113
},
{
"epoch": 0.3291139240506329,
"grad_norm": 0.62890625,
"learning_rate": 9.472931017204473e-07,
"loss": 0.22655850648880005,
"step": 1560,
"token_acc": 0.9381243063263041
},
{
"epoch": 0.32932489451476793,
"grad_norm": 0.87109375,
"learning_rate": 9.47217512992233e-07,
"loss": 0.30268189311027527,
"step": 1561,
"token_acc": 0.9165935030728709
},
{
"epoch": 0.32953586497890297,
"grad_norm": 0.70703125,
"learning_rate": 9.471418731209108e-07,
"loss": 0.2656475007534027,
"step": 1562,
"token_acc": 0.9258801729462631
},
{
"epoch": 0.32974683544303796,
"grad_norm": 0.734375,
"learning_rate": 9.47066182115131e-07,
"loss": 0.2406896948814392,
"step": 1563,
"token_acc": 0.9350610664283586
},
{
"epoch": 0.329957805907173,
"grad_norm": 0.68359375,
"learning_rate": 9.469904399835493e-07,
"loss": 0.2999385595321655,
"step": 1564,
"token_acc": 0.9215285880980163
},
{
"epoch": 0.33016877637130804,
"grad_norm": 0.76953125,
"learning_rate": 9.469146467348274e-07,
"loss": 0.2671598792076111,
"step": 1565,
"token_acc": 0.9232323232323232
},
{
"epoch": 0.330379746835443,
"grad_norm": 0.77734375,
"learning_rate": 9.46838802377633e-07,
"loss": 0.27922165393829346,
"step": 1566,
"token_acc": 0.9190307328605201
},
{
"epoch": 0.33059071729957806,
"grad_norm": 0.625,
"learning_rate": 9.467629069206397e-07,
"loss": 0.24752941727638245,
"step": 1567,
"token_acc": 0.9327052489905787
},
{
"epoch": 0.3308016877637131,
"grad_norm": 0.90234375,
"learning_rate": 9.466869603725265e-07,
"loss": 0.2795766294002533,
"step": 1568,
"token_acc": 0.9258549057206775
},
{
"epoch": 0.3310126582278481,
"grad_norm": 0.66015625,
"learning_rate": 9.466109627419788e-07,
"loss": 0.2570780813694,
"step": 1569,
"token_acc": 0.9308101714961561
},
{
"epoch": 0.33122362869198313,
"grad_norm": 0.765625,
"learning_rate": 9.465349140376871e-07,
"loss": 0.2272178828716278,
"step": 1570,
"token_acc": 0.9376088218224028
},
{
"epoch": 0.33143459915611817,
"grad_norm": 0.65625,
"learning_rate": 9.464588142683488e-07,
"loss": 0.2625764012336731,
"step": 1571,
"token_acc": 0.9222898230088495
},
{
"epoch": 0.33164556962025316,
"grad_norm": 1.046875,
"learning_rate": 9.463826634426661e-07,
"loss": 0.2913757562637329,
"step": 1572,
"token_acc": 0.9228368794326242
},
{
"epoch": 0.3318565400843882,
"grad_norm": 1.015625,
"learning_rate": 9.463064615693479e-07,
"loss": 0.2961352467536926,
"step": 1573,
"token_acc": 0.9242700729927007
},
{
"epoch": 0.3320675105485232,
"grad_norm": 0.9453125,
"learning_rate": 9.462302086571081e-07,
"loss": 0.2536572813987732,
"step": 1574,
"token_acc": 0.9272577545978589
},
{
"epoch": 0.3322784810126582,
"grad_norm": 0.91796875,
"learning_rate": 9.461539047146672e-07,
"loss": 0.2897554039955139,
"step": 1575,
"token_acc": 0.9191452497834248
},
{
"epoch": 0.33248945147679326,
"grad_norm": 0.78515625,
"learning_rate": 9.460775497507512e-07,
"loss": 0.2721807062625885,
"step": 1576,
"token_acc": 0.9249371859296482
},
{
"epoch": 0.33270042194092825,
"grad_norm": 0.7265625,
"learning_rate": 9.460011437740916e-07,
"loss": 0.2651718556880951,
"step": 1577,
"token_acc": 0.9248257047590179
},
{
"epoch": 0.3329113924050633,
"grad_norm": 0.6328125,
"learning_rate": 9.459246867934263e-07,
"loss": 0.2130395919084549,
"step": 1578,
"token_acc": 0.9348395546823838
},
{
"epoch": 0.33312236286919833,
"grad_norm": 0.7109375,
"learning_rate": 9.45848178817499e-07,
"loss": 0.25840145349502563,
"step": 1579,
"token_acc": 0.9321983273596177
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.5703125,
"learning_rate": 9.457716198550586e-07,
"loss": 0.24820664525032043,
"step": 1580,
"token_acc": 0.9322164948453608
},
{
"epoch": 0.33354430379746836,
"grad_norm": 0.6328125,
"learning_rate": 9.456950099148606e-07,
"loss": 0.23400051891803741,
"step": 1581,
"token_acc": 0.9332925336597307
},
{
"epoch": 0.3337552742616034,
"grad_norm": 0.80859375,
"learning_rate": 9.456183490056659e-07,
"loss": 0.27070870995521545,
"step": 1582,
"token_acc": 0.9294269990288119
},
{
"epoch": 0.3339662447257384,
"grad_norm": 0.66015625,
"learning_rate": 9.455416371362413e-07,
"loss": 0.26411256194114685,
"step": 1583,
"token_acc": 0.9285919128190422
},
{
"epoch": 0.3341772151898734,
"grad_norm": 1.1171875,
"learning_rate": 9.454648743153593e-07,
"loss": 0.3222864866256714,
"step": 1584,
"token_acc": 0.9186418962203715
},
{
"epoch": 0.33438818565400846,
"grad_norm": 0.64453125,
"learning_rate": 9.453880605517986e-07,
"loss": 0.25822392106056213,
"step": 1585,
"token_acc": 0.9284223083805547
},
{
"epoch": 0.33459915611814345,
"grad_norm": 0.9453125,
"learning_rate": 9.453111958543436e-07,
"loss": 0.26547372341156006,
"step": 1586,
"token_acc": 0.9233236151603499
},
{
"epoch": 0.3348101265822785,
"grad_norm": 0.82421875,
"learning_rate": 9.452342802317841e-07,
"loss": 0.2866516411304474,
"step": 1587,
"token_acc": 0.9244858611825193
},
{
"epoch": 0.33502109704641353,
"grad_norm": 0.6015625,
"learning_rate": 9.451573136929163e-07,
"loss": 0.25004857778549194,
"step": 1588,
"token_acc": 0.9289326590364653
},
{
"epoch": 0.3352320675105485,
"grad_norm": 0.7109375,
"learning_rate": 9.450802962465418e-07,
"loss": 0.2522854804992676,
"step": 1589,
"token_acc": 0.9273840769903762
},
{
"epoch": 0.33544303797468356,
"grad_norm": 0.78125,
"learning_rate": 9.450032279014686e-07,
"loss": 0.320743203163147,
"step": 1590,
"token_acc": 0.9178940770918207
},
{
"epoch": 0.33565400843881854,
"grad_norm": 0.7890625,
"learning_rate": 9.449261086665095e-07,
"loss": 0.2889062762260437,
"step": 1591,
"token_acc": 0.930829420970266
},
{
"epoch": 0.3358649789029536,
"grad_norm": 0.85546875,
"learning_rate": 9.448489385504842e-07,
"loss": 0.2749912738800049,
"step": 1592,
"token_acc": 0.9262124367747694
},
{
"epoch": 0.3360759493670886,
"grad_norm": 0.76171875,
"learning_rate": 9.447717175622175e-07,
"loss": 0.3473696708679199,
"step": 1593,
"token_acc": 0.9091406677613574
},
{
"epoch": 0.3362869198312236,
"grad_norm": 0.65625,
"learning_rate": 9.446944457105405e-07,
"loss": 0.26022714376449585,
"step": 1594,
"token_acc": 0.9322671683913453
},
{
"epoch": 0.33649789029535865,
"grad_norm": 0.75,
"learning_rate": 9.446171230042897e-07,
"loss": 0.22268863022327423,
"step": 1595,
"token_acc": 0.9359122401847575
},
{
"epoch": 0.3367088607594937,
"grad_norm": 0.8203125,
"learning_rate": 9.445397494523077e-07,
"loss": 0.30218058824539185,
"step": 1596,
"token_acc": 0.9221233312142403
},
{
"epoch": 0.3369198312236287,
"grad_norm": 0.7890625,
"learning_rate": 9.444623250634427e-07,
"loss": 0.2494884729385376,
"step": 1597,
"token_acc": 0.930500917912405
},
{
"epoch": 0.3371308016877637,
"grad_norm": 0.71875,
"learning_rate": 9.44384849846549e-07,
"loss": 0.2274206578731537,
"step": 1598,
"token_acc": 0.9355882352941176
},
{
"epoch": 0.33734177215189876,
"grad_norm": 0.59765625,
"learning_rate": 9.443073238104865e-07,
"loss": 0.26012539863586426,
"step": 1599,
"token_acc": 0.930809804529941
},
{
"epoch": 0.33755274261603374,
"grad_norm": 0.73046875,
"learning_rate": 9.44229746964121e-07,
"loss": 0.23634777963161469,
"step": 1600,
"token_acc": 0.9323394495412844
},
{
"epoch": 0.33755274261603374,
"eval_loss": 0.4337185323238373,
"eval_runtime": 245.7809,
"eval_samples_per_second": 137.134,
"eval_steps_per_second": 2.144,
"eval_token_acc": 0.8991530557300548,
"step": 1600
},
{
"epoch": 0.3377637130801688,
"grad_norm": 0.67578125,
"learning_rate": 9.441521193163238e-07,
"loss": 0.23944209516048431,
"step": 1601,
"token_acc": 0.927382319173364
},
{
"epoch": 0.3379746835443038,
"grad_norm": 0.80078125,
"learning_rate": 9.440744408759727e-07,
"loss": 0.24298495054244995,
"step": 1602,
"token_acc": 0.9336686576548255
},
{
"epoch": 0.3381856540084388,
"grad_norm": 0.76953125,
"learning_rate": 9.439967116519505e-07,
"loss": 0.2587997019290924,
"step": 1603,
"token_acc": 0.9240464344941957
},
{
"epoch": 0.33839662447257385,
"grad_norm": 0.71875,
"learning_rate": 9.439189316531464e-07,
"loss": 0.2167397290468216,
"step": 1604,
"token_acc": 0.9390922401171303
},
{
"epoch": 0.33860759493670883,
"grad_norm": 0.6640625,
"learning_rate": 9.438411008884553e-07,
"loss": 0.2602764368057251,
"step": 1605,
"token_acc": 0.9219173952366596
},
{
"epoch": 0.3388185654008439,
"grad_norm": 0.64453125,
"learning_rate": 9.437632193667775e-07,
"loss": 0.23749463260173798,
"step": 1606,
"token_acc": 0.934610705596107
},
{
"epoch": 0.3390295358649789,
"grad_norm": 0.71875,
"learning_rate": 9.436852870970196e-07,
"loss": 0.2818450331687927,
"step": 1607,
"token_acc": 0.9215006305170239
},
{
"epoch": 0.3392405063291139,
"grad_norm": 0.71484375,
"learning_rate": 9.436073040880939e-07,
"loss": 0.2736986577510834,
"step": 1608,
"token_acc": 0.9178852643419573
},
{
"epoch": 0.33945147679324894,
"grad_norm": 0.609375,
"learning_rate": 9.435292703489184e-07,
"loss": 0.24205069243907928,
"step": 1609,
"token_acc": 0.9281653746770026
},
{
"epoch": 0.339662447257384,
"grad_norm": 0.96875,
"learning_rate": 9.434511858884167e-07,
"loss": 0.23683682084083557,
"step": 1610,
"token_acc": 0.9266888821569221
},
{
"epoch": 0.33987341772151897,
"grad_norm": 0.70703125,
"learning_rate": 9.433730507155184e-07,
"loss": 0.24786365032196045,
"step": 1611,
"token_acc": 0.9293495175848117
},
{
"epoch": 0.340084388185654,
"grad_norm": 0.72265625,
"learning_rate": 9.432948648391593e-07,
"loss": 0.2611381411552429,
"step": 1612,
"token_acc": 0.9250978200111795
},
{
"epoch": 0.34029535864978905,
"grad_norm": 0.6953125,
"learning_rate": 9.432166282682803e-07,
"loss": 0.2261401265859604,
"step": 1613,
"token_acc": 0.9268707482993197
},
{
"epoch": 0.34050632911392403,
"grad_norm": 0.69140625,
"learning_rate": 9.431383410118286e-07,
"loss": 0.2952437996864319,
"step": 1614,
"token_acc": 0.9218795888399413
},
{
"epoch": 0.3407172995780591,
"grad_norm": 0.73828125,
"learning_rate": 9.430600030787568e-07,
"loss": 0.2706619203090668,
"step": 1615,
"token_acc": 0.9214936783299029
},
{
"epoch": 0.3409282700421941,
"grad_norm": 0.75390625,
"learning_rate": 9.429816144780236e-07,
"loss": 0.28954198956489563,
"step": 1616,
"token_acc": 0.9216836734693877
},
{
"epoch": 0.3411392405063291,
"grad_norm": 0.6171875,
"learning_rate": 9.429031752185936e-07,
"loss": 0.23225362598896027,
"step": 1617,
"token_acc": 0.9354280772556933
},
{
"epoch": 0.34135021097046414,
"grad_norm": 1.2109375,
"learning_rate": 9.428246853094366e-07,
"loss": 0.2903389632701874,
"step": 1618,
"token_acc": 0.9204334365325078
},
{
"epoch": 0.3415611814345992,
"grad_norm": 0.6796875,
"learning_rate": 9.427461447595288e-07,
"loss": 0.2608135938644409,
"step": 1619,
"token_acc": 0.9234987661091308
},
{
"epoch": 0.34177215189873417,
"grad_norm": 0.91796875,
"learning_rate": 9.426675535778522e-07,
"loss": 0.3057047724723816,
"step": 1620,
"token_acc": 0.9140136864028563
},
{
"epoch": 0.3419831223628692,
"grad_norm": 0.62109375,
"learning_rate": 9.425889117733939e-07,
"loss": 0.2640751600265503,
"step": 1621,
"token_acc": 0.9277286135693216
},
{
"epoch": 0.3421940928270042,
"grad_norm": 0.62109375,
"learning_rate": 9.425102193551477e-07,
"loss": 0.28919440507888794,
"step": 1622,
"token_acc": 0.9247822644497229
},
{
"epoch": 0.34240506329113923,
"grad_norm": 0.7734375,
"learning_rate": 9.424314763321124e-07,
"loss": 0.2618682384490967,
"step": 1623,
"token_acc": 0.9260844748858448
},
{
"epoch": 0.3426160337552743,
"grad_norm": 0.7109375,
"learning_rate": 9.423526827132931e-07,
"loss": 0.2545069754123688,
"step": 1624,
"token_acc": 0.9270595897101921
},
{
"epoch": 0.34282700421940926,
"grad_norm": 0.69140625,
"learning_rate": 9.422738385077005e-07,
"loss": 0.27819639444351196,
"step": 1625,
"token_acc": 0.9325095057034221
},
{
"epoch": 0.3430379746835443,
"grad_norm": 0.7421875,
"learning_rate": 9.421949437243511e-07,
"loss": 0.2696912884712219,
"step": 1626,
"token_acc": 0.9227120535714286
},
{
"epoch": 0.34324894514767934,
"grad_norm": 0.67578125,
"learning_rate": 9.421159983722671e-07,
"loss": 0.23221710324287415,
"step": 1627,
"token_acc": 0.9322323462414579
},
{
"epoch": 0.3434599156118143,
"grad_norm": 0.70703125,
"learning_rate": 9.420370024604767e-07,
"loss": 0.21929380297660828,
"step": 1628,
"token_acc": 0.9286370597243492
},
{
"epoch": 0.34367088607594937,
"grad_norm": 0.81640625,
"learning_rate": 9.419579559980136e-07,
"loss": 0.2439175844192505,
"step": 1629,
"token_acc": 0.9296130117779025
},
{
"epoch": 0.3438818565400844,
"grad_norm": 0.80859375,
"learning_rate": 9.418788589939177e-07,
"loss": 0.23102378845214844,
"step": 1630,
"token_acc": 0.9315107913669065
},
{
"epoch": 0.3440928270042194,
"grad_norm": 0.703125,
"learning_rate": 9.417997114572342e-07,
"loss": 0.2606005072593689,
"step": 1631,
"token_acc": 0.9257028112449799
},
{
"epoch": 0.34430379746835443,
"grad_norm": 0.6328125,
"learning_rate": 9.417205133970143e-07,
"loss": 0.24455326795578003,
"step": 1632,
"token_acc": 0.9317640835757736
},
{
"epoch": 0.3445147679324895,
"grad_norm": 0.73828125,
"learning_rate": 9.41641264822315e-07,
"loss": 0.2429400384426117,
"step": 1633,
"token_acc": 0.9239130434782609
},
{
"epoch": 0.34472573839662446,
"grad_norm": 1.34375,
"learning_rate": 9.415619657421991e-07,
"loss": 0.2420717179775238,
"step": 1634,
"token_acc": 0.9310043668122271
},
{
"epoch": 0.3449367088607595,
"grad_norm": 0.65234375,
"learning_rate": 9.41482616165735e-07,
"loss": 0.27381402254104614,
"step": 1635,
"token_acc": 0.9236192714453584
},
{
"epoch": 0.34514767932489454,
"grad_norm": 1.5859375,
"learning_rate": 9.41403216101997e-07,
"loss": 0.24875643849372864,
"step": 1636,
"token_acc": 0.931766704416761
},
{
"epoch": 0.3453586497890295,
"grad_norm": 1.015625,
"learning_rate": 9.413237655600654e-07,
"loss": 0.2728778123855591,
"step": 1637,
"token_acc": 0.9263157894736842
},
{
"epoch": 0.34556962025316457,
"grad_norm": 0.7109375,
"learning_rate": 9.412442645490257e-07,
"loss": 0.30332183837890625,
"step": 1638,
"token_acc": 0.9168779938010707
},
{
"epoch": 0.34578059071729955,
"grad_norm": 0.78125,
"learning_rate": 9.411647130779699e-07,
"loss": 0.26190561056137085,
"step": 1639,
"token_acc": 0.9347617524784139
},
{
"epoch": 0.3459915611814346,
"grad_norm": 0.90234375,
"learning_rate": 9.41085111155995e-07,
"loss": 0.2752026915550232,
"step": 1640,
"token_acc": 0.9264008921103987
},
{
"epoch": 0.34620253164556963,
"grad_norm": 0.78515625,
"learning_rate": 9.410054587922043e-07,
"loss": 0.26243409514427185,
"step": 1641,
"token_acc": 0.928486646884273
},
{
"epoch": 0.3464135021097046,
"grad_norm": 0.84765625,
"learning_rate": 9.409257559957069e-07,
"loss": 0.2628988027572632,
"step": 1642,
"token_acc": 0.9306260575296108
},
{
"epoch": 0.34662447257383966,
"grad_norm": 0.640625,
"learning_rate": 9.408460027756172e-07,
"loss": 0.2401159405708313,
"step": 1643,
"token_acc": 0.9343711843711844
},
{
"epoch": 0.3468354430379747,
"grad_norm": 0.80859375,
"learning_rate": 9.407661991410558e-07,
"loss": 0.2749597132205963,
"step": 1644,
"token_acc": 0.9255230125523013
},
{
"epoch": 0.3470464135021097,
"grad_norm": 0.88671875,
"learning_rate": 9.40686345101149e-07,
"loss": 0.2682425379753113,
"step": 1645,
"token_acc": 0.9265940902021773
},
{
"epoch": 0.3472573839662447,
"grad_norm": 0.62890625,
"learning_rate": 9.406064406650287e-07,
"loss": 0.2411586046218872,
"step": 1646,
"token_acc": 0.9323656578134651
},
{
"epoch": 0.34746835443037977,
"grad_norm": 0.66015625,
"learning_rate": 9.405264858418326e-07,
"loss": 0.21946536004543304,
"step": 1647,
"token_acc": 0.9317245438493231
},
{
"epoch": 0.34767932489451475,
"grad_norm": 0.9375,
"learning_rate": 9.404464806407042e-07,
"loss": 0.296546071767807,
"step": 1648,
"token_acc": 0.9222361024359775
},
{
"epoch": 0.3478902953586498,
"grad_norm": 0.59375,
"learning_rate": 9.40366425070793e-07,
"loss": 0.20803073048591614,
"step": 1649,
"token_acc": 0.9425218176346675
},
{
"epoch": 0.34810126582278483,
"grad_norm": 0.83984375,
"learning_rate": 9.402863191412537e-07,
"loss": 0.3020900785923004,
"step": 1650,
"token_acc": 0.9220455254131588
},
{
"epoch": 0.3483122362869198,
"grad_norm": 0.6484375,
"learning_rate": 9.402061628612472e-07,
"loss": 0.25674012303352356,
"step": 1651,
"token_acc": 0.9310624493106245
},
{
"epoch": 0.34852320675105486,
"grad_norm": 0.76171875,
"learning_rate": 9.401259562399403e-07,
"loss": 0.28478553891181946,
"step": 1652,
"token_acc": 0.9299287410926366
},
{
"epoch": 0.3487341772151899,
"grad_norm": 0.80859375,
"learning_rate": 9.40045699286505e-07,
"loss": 0.23817691206932068,
"step": 1653,
"token_acc": 0.9281288723667905
},
{
"epoch": 0.3489451476793249,
"grad_norm": 0.6171875,
"learning_rate": 9.399653920101195e-07,
"loss": 0.22525762021541595,
"step": 1654,
"token_acc": 0.9331259720062208
},
{
"epoch": 0.3491561181434599,
"grad_norm": 0.70703125,
"learning_rate": 9.398850344199675e-07,
"loss": 0.23875603079795837,
"step": 1655,
"token_acc": 0.930642750373692
},
{
"epoch": 0.3493670886075949,
"grad_norm": 0.921875,
"learning_rate": 9.398046265252388e-07,
"loss": 0.25053465366363525,
"step": 1656,
"token_acc": 0.9324946302546794
},
{
"epoch": 0.34957805907172995,
"grad_norm": 0.69921875,
"learning_rate": 9.397241683351285e-07,
"loss": 0.2564641833305359,
"step": 1657,
"token_acc": 0.9294522512188127
},
{
"epoch": 0.349789029535865,
"grad_norm": 0.78125,
"learning_rate": 9.396436598588378e-07,
"loss": 0.2557925283908844,
"step": 1658,
"token_acc": 0.9266431924882629
},
{
"epoch": 0.35,
"grad_norm": 0.73046875,
"learning_rate": 9.395631011055734e-07,
"loss": 0.26249969005584717,
"step": 1659,
"token_acc": 0.9319962394233783
},
{
"epoch": 0.350210970464135,
"grad_norm": 0.6953125,
"learning_rate": 9.394824920845481e-07,
"loss": 0.23513028025627136,
"step": 1660,
"token_acc": 0.9312829038514824
},
{
"epoch": 0.35042194092827006,
"grad_norm": 0.7734375,
"learning_rate": 9.394018328049799e-07,
"loss": 0.2548867464065552,
"step": 1661,
"token_acc": 0.9336269267065115
},
{
"epoch": 0.35063291139240504,
"grad_norm": 0.7734375,
"learning_rate": 9.393211232760932e-07,
"loss": 0.23487508296966553,
"step": 1662,
"token_acc": 0.9349162011173184
},
{
"epoch": 0.3508438818565401,
"grad_norm": 0.7421875,
"learning_rate": 9.392403635071176e-07,
"loss": 0.260843962430954,
"step": 1663,
"token_acc": 0.9274122159929182
},
{
"epoch": 0.3510548523206751,
"grad_norm": 0.796875,
"learning_rate": 9.391595535072887e-07,
"loss": 0.2631385922431946,
"step": 1664,
"token_acc": 0.9209509658246656
},
{
"epoch": 0.3512658227848101,
"grad_norm": 0.8359375,
"learning_rate": 9.390786932858479e-07,
"loss": 0.23830120265483856,
"step": 1665,
"token_acc": 0.9282700421940928
},
{
"epoch": 0.35147679324894515,
"grad_norm": 0.6953125,
"learning_rate": 9.389977828520421e-07,
"loss": 0.26736193895339966,
"step": 1666,
"token_acc": 0.9238440616500453
},
{
"epoch": 0.3516877637130802,
"grad_norm": 0.69921875,
"learning_rate": 9.389168222151243e-07,
"loss": 0.27793848514556885,
"step": 1667,
"token_acc": 0.9270031365839749
},
{
"epoch": 0.3518987341772152,
"grad_norm": 0.57421875,
"learning_rate": 9.388358113843529e-07,
"loss": 0.25020474195480347,
"step": 1668,
"token_acc": 0.9275283937263386
},
{
"epoch": 0.3521097046413502,
"grad_norm": 0.85546875,
"learning_rate": 9.387547503689921e-07,
"loss": 0.2771265506744385,
"step": 1669,
"token_acc": 0.9255952380952381
},
{
"epoch": 0.35232067510548526,
"grad_norm": 0.76953125,
"learning_rate": 9.386736391783121e-07,
"loss": 0.29730749130249023,
"step": 1670,
"token_acc": 0.9246809835045129
},
{
"epoch": 0.35253164556962024,
"grad_norm": 0.67578125,
"learning_rate": 9.385924778215885e-07,
"loss": 0.2390887439250946,
"step": 1671,
"token_acc": 0.93340922026181
},
{
"epoch": 0.3527426160337553,
"grad_norm": 0.671875,
"learning_rate": 9.385112663081028e-07,
"loss": 0.2280537337064743,
"step": 1672,
"token_acc": 0.9354485776805251
},
{
"epoch": 0.35295358649789027,
"grad_norm": 0.80078125,
"learning_rate": 9.384300046471424e-07,
"loss": 0.3002857565879822,
"step": 1673,
"token_acc": 0.9189271563547698
},
{
"epoch": 0.3531645569620253,
"grad_norm": 0.8828125,
"learning_rate": 9.38348692848e-07,
"loss": 0.2538878917694092,
"step": 1674,
"token_acc": 0.9297851875601154
},
{
"epoch": 0.35337552742616035,
"grad_norm": 0.703125,
"learning_rate": 9.382673309199745e-07,
"loss": 0.22456809878349304,
"step": 1675,
"token_acc": 0.9339071626191208
},
{
"epoch": 0.35358649789029534,
"grad_norm": 0.71875,
"learning_rate": 9.381859188723702e-07,
"loss": 0.25327855348587036,
"step": 1676,
"token_acc": 0.929019929019929
},
{
"epoch": 0.3537974683544304,
"grad_norm": 0.65234375,
"learning_rate": 9.381044567144973e-07,
"loss": 0.24936312437057495,
"step": 1677,
"token_acc": 0.9296285953644233
},
{
"epoch": 0.3540084388185654,
"grad_norm": 0.72265625,
"learning_rate": 9.380229444556717e-07,
"loss": 0.231843501329422,
"step": 1678,
"token_acc": 0.9309262166405023
},
{
"epoch": 0.3542194092827004,
"grad_norm": 0.81640625,
"learning_rate": 9.379413821052151e-07,
"loss": 0.2699410319328308,
"step": 1679,
"token_acc": 0.9276815557581283
},
{
"epoch": 0.35443037974683544,
"grad_norm": 0.75,
"learning_rate": 9.378597696724546e-07,
"loss": 0.24168401956558228,
"step": 1680,
"token_acc": 0.9271465741543798
},
{
"epoch": 0.3546413502109705,
"grad_norm": 0.85546875,
"learning_rate": 9.377781071667235e-07,
"loss": 0.27143609523773193,
"step": 1681,
"token_acc": 0.9236213506340313
},
{
"epoch": 0.35485232067510547,
"grad_norm": 0.63671875,
"learning_rate": 9.376963945973606e-07,
"loss": 0.2371048629283905,
"step": 1682,
"token_acc": 0.9340296866410115
},
{
"epoch": 0.3550632911392405,
"grad_norm": 0.7265625,
"learning_rate": 9.376146319737102e-07,
"loss": 0.2859129309654236,
"step": 1683,
"token_acc": 0.9200359389038635
},
{
"epoch": 0.35527426160337555,
"grad_norm": 0.8828125,
"learning_rate": 9.375328193051227e-07,
"loss": 0.28455692529678345,
"step": 1684,
"token_acc": 0.9202853598014888
},
{
"epoch": 0.35548523206751054,
"grad_norm": 0.87109375,
"learning_rate": 9.374509566009542e-07,
"loss": 0.2839798331260681,
"step": 1685,
"token_acc": 0.9208261617900172
},
{
"epoch": 0.3556962025316456,
"grad_norm": 0.91796875,
"learning_rate": 9.373690438705661e-07,
"loss": 0.2920804023742676,
"step": 1686,
"token_acc": 0.9253557943653791
},
{
"epoch": 0.35590717299578056,
"grad_norm": 0.74609375,
"learning_rate": 9.372870811233261e-07,
"loss": 0.29054540395736694,
"step": 1687,
"token_acc": 0.9222520107238605
},
{
"epoch": 0.3561181434599156,
"grad_norm": 0.69140625,
"learning_rate": 9.372050683686071e-07,
"loss": 0.2947998046875,
"step": 1688,
"token_acc": 0.918534718425369
},
{
"epoch": 0.35632911392405064,
"grad_norm": 0.765625,
"learning_rate": 9.371230056157882e-07,
"loss": 0.2592851519584656,
"step": 1689,
"token_acc": 0.9262319268220714
},
{
"epoch": 0.35654008438818563,
"grad_norm": 0.7265625,
"learning_rate": 9.370408928742537e-07,
"loss": 0.26862451434135437,
"step": 1690,
"token_acc": 0.9255605381165919
},
{
"epoch": 0.35675105485232067,
"grad_norm": 1.1875,
"learning_rate": 9.369587301533941e-07,
"loss": 0.27907925844192505,
"step": 1691,
"token_acc": 0.9208823529411765
},
{
"epoch": 0.3569620253164557,
"grad_norm": 1.1015625,
"learning_rate": 9.368765174626052e-07,
"loss": 0.27880504727363586,
"step": 1692,
"token_acc": 0.9164239953407105
},
{
"epoch": 0.3571729957805907,
"grad_norm": 0.6484375,
"learning_rate": 9.367942548112889e-07,
"loss": 0.22724300622940063,
"step": 1693,
"token_acc": 0.9299659126123334
},
{
"epoch": 0.35738396624472574,
"grad_norm": 0.63671875,
"learning_rate": 9.367119422088526e-07,
"loss": 0.2403573840856552,
"step": 1694,
"token_acc": 0.9322590271560728
},
{
"epoch": 0.3575949367088608,
"grad_norm": 0.59375,
"learning_rate": 9.366295796647093e-07,
"loss": 0.22870095074176788,
"step": 1695,
"token_acc": 0.9367552703941339
},
{
"epoch": 0.35780590717299576,
"grad_norm": 0.7578125,
"learning_rate": 9.365471671882781e-07,
"loss": 0.28150975704193115,
"step": 1696,
"token_acc": 0.9182144997004195
},
{
"epoch": 0.3580168776371308,
"grad_norm": 0.80859375,
"learning_rate": 9.364647047889833e-07,
"loss": 0.31994009017944336,
"step": 1697,
"token_acc": 0.9140805334701205
},
{
"epoch": 0.35822784810126584,
"grad_norm": 0.7734375,
"learning_rate": 9.363821924762554e-07,
"loss": 0.2513180077075958,
"step": 1698,
"token_acc": 0.9310533515731874
},
{
"epoch": 0.35843881856540083,
"grad_norm": 0.72265625,
"learning_rate": 9.362996302595303e-07,
"loss": 0.29430505633354187,
"step": 1699,
"token_acc": 0.9234194122885129
},
{
"epoch": 0.35864978902953587,
"grad_norm": 0.75,
"learning_rate": 9.362170181482496e-07,
"loss": 0.2608153820037842,
"step": 1700,
"token_acc": 0.9316065192083819
},
{
"epoch": 0.3588607594936709,
"grad_norm": 0.96875,
"learning_rate": 9.361343561518608e-07,
"loss": 0.27800315618515015,
"step": 1701,
"token_acc": 0.9215134459036898
},
{
"epoch": 0.3590717299578059,
"grad_norm": 0.77734375,
"learning_rate": 9.36051644279817e-07,
"loss": 0.26527389883995056,
"step": 1702,
"token_acc": 0.9273917108133375
},
{
"epoch": 0.35928270042194094,
"grad_norm": 0.7421875,
"learning_rate": 9.359688825415768e-07,
"loss": 0.24779286980628967,
"step": 1703,
"token_acc": 0.930368636629608
},
{
"epoch": 0.3594936708860759,
"grad_norm": 0.66796875,
"learning_rate": 9.35886070946605e-07,
"loss": 0.2481088787317276,
"step": 1704,
"token_acc": 0.936046511627907
},
{
"epoch": 0.35970464135021096,
"grad_norm": 0.6796875,
"learning_rate": 9.358032095043716e-07,
"loss": 0.23535574972629547,
"step": 1705,
"token_acc": 0.9337689337689338
},
{
"epoch": 0.359915611814346,
"grad_norm": 0.8359375,
"learning_rate": 9.357202982243526e-07,
"loss": 0.29152315855026245,
"step": 1706,
"token_acc": 0.9170403587443946
},
{
"epoch": 0.360126582278481,
"grad_norm": 0.578125,
"learning_rate": 9.356373371160298e-07,
"loss": 0.21768781542778015,
"step": 1707,
"token_acc": 0.939873417721519
},
{
"epoch": 0.36033755274261603,
"grad_norm": 0.70703125,
"learning_rate": 9.3555432618889e-07,
"loss": 0.2704032063484192,
"step": 1708,
"token_acc": 0.9234184239733629
},
{
"epoch": 0.36054852320675107,
"grad_norm": 0.71484375,
"learning_rate": 9.354712654524267e-07,
"loss": 0.24188432097434998,
"step": 1709,
"token_acc": 0.9323047858942065
},
{
"epoch": 0.36075949367088606,
"grad_norm": 0.6484375,
"learning_rate": 9.353881549161383e-07,
"loss": 0.26301664113998413,
"step": 1710,
"token_acc": 0.9259364358683314
},
{
"epoch": 0.3609704641350211,
"grad_norm": 0.6484375,
"learning_rate": 9.353049945895293e-07,
"loss": 0.24812474846839905,
"step": 1711,
"token_acc": 0.9318505845451953
},
{
"epoch": 0.36118143459915614,
"grad_norm": 0.70703125,
"learning_rate": 9.352217844821098e-07,
"loss": 0.2484617680311203,
"step": 1712,
"token_acc": 0.9366929133858267
},
{
"epoch": 0.3613924050632911,
"grad_norm": 0.73046875,
"learning_rate": 9.351385246033956e-07,
"loss": 0.23907536268234253,
"step": 1713,
"token_acc": 0.9324242424242424
},
{
"epoch": 0.36160337552742616,
"grad_norm": 0.6171875,
"learning_rate": 9.35055214962908e-07,
"loss": 0.23805205523967743,
"step": 1714,
"token_acc": 0.9346926713947991
},
{
"epoch": 0.3618143459915612,
"grad_norm": 0.578125,
"learning_rate": 9.349718555701744e-07,
"loss": 0.2235066294670105,
"step": 1715,
"token_acc": 0.9360208062418726
},
{
"epoch": 0.3620253164556962,
"grad_norm": 0.96875,
"learning_rate": 9.348884464347275e-07,
"loss": 0.28189563751220703,
"step": 1716,
"token_acc": 0.9183735860593091
},
{
"epoch": 0.36223628691983123,
"grad_norm": 0.91015625,
"learning_rate": 9.348049875661059e-07,
"loss": 0.28244683146476746,
"step": 1717,
"token_acc": 0.9196454103517301
},
{
"epoch": 0.36244725738396627,
"grad_norm": 0.6875,
"learning_rate": 9.347214789738538e-07,
"loss": 0.3043467700481415,
"step": 1718,
"token_acc": 0.9222814164838609
},
{
"epoch": 0.36265822784810126,
"grad_norm": 0.95703125,
"learning_rate": 9.346379206675211e-07,
"loss": 0.2614938020706177,
"step": 1719,
"token_acc": 0.9301948051948052
},
{
"epoch": 0.3628691983122363,
"grad_norm": 0.85546875,
"learning_rate": 9.345543126566635e-07,
"loss": 0.2756979167461395,
"step": 1720,
"token_acc": 0.9246906939214632
},
{
"epoch": 0.3630801687763713,
"grad_norm": 0.8671875,
"learning_rate": 9.344706549508421e-07,
"loss": 0.28513869643211365,
"step": 1721,
"token_acc": 0.9201053555750659
},
{
"epoch": 0.3632911392405063,
"grad_norm": 0.86328125,
"learning_rate": 9.343869475596241e-07,
"loss": 0.2887800931930542,
"step": 1722,
"token_acc": 0.9226091763405196
},
{
"epoch": 0.36350210970464136,
"grad_norm": 0.640625,
"learning_rate": 9.34303190492582e-07,
"loss": 0.23534713685512543,
"step": 1723,
"token_acc": 0.935820895522388
},
{
"epoch": 0.36371308016877635,
"grad_norm": 0.70703125,
"learning_rate": 9.342193837592941e-07,
"loss": 0.30718207359313965,
"step": 1724,
"token_acc": 0.9156766154737758
},
{
"epoch": 0.3639240506329114,
"grad_norm": 0.76171875,
"learning_rate": 9.341355273693446e-07,
"loss": 0.2970879077911377,
"step": 1725,
"token_acc": 0.9235145385587863
},
{
"epoch": 0.36413502109704643,
"grad_norm": 0.84765625,
"learning_rate": 9.340516213323228e-07,
"loss": 0.2779674530029297,
"step": 1726,
"token_acc": 0.926529357516139
},
{
"epoch": 0.3643459915611814,
"grad_norm": 0.67578125,
"learning_rate": 9.339676656578245e-07,
"loss": 0.301363468170166,
"step": 1727,
"token_acc": 0.9251336898395722
},
{
"epoch": 0.36455696202531646,
"grad_norm": 0.5859375,
"learning_rate": 9.338836603554505e-07,
"loss": 0.25844764709472656,
"step": 1728,
"token_acc": 0.9302995391705069
},
{
"epoch": 0.3647679324894515,
"grad_norm": 0.6875,
"learning_rate": 9.337996054348076e-07,
"loss": 0.2536547780036926,
"step": 1729,
"token_acc": 0.9244303432362273
},
{
"epoch": 0.3649789029535865,
"grad_norm": 0.6015625,
"learning_rate": 9.337155009055081e-07,
"loss": 0.2093231976032257,
"step": 1730,
"token_acc": 0.9381860196418256
},
{
"epoch": 0.3651898734177215,
"grad_norm": 0.734375,
"learning_rate": 9.336313467771701e-07,
"loss": 0.2366497814655304,
"step": 1731,
"token_acc": 0.9375562894025818
},
{
"epoch": 0.36540084388185656,
"grad_norm": 0.61328125,
"learning_rate": 9.335471430594175e-07,
"loss": 0.23666155338287354,
"step": 1732,
"token_acc": 0.9285078611687927
},
{
"epoch": 0.36561181434599155,
"grad_norm": 0.69140625,
"learning_rate": 9.334628897618797e-07,
"loss": 0.2761836349964142,
"step": 1733,
"token_acc": 0.9200749297533563
},
{
"epoch": 0.3658227848101266,
"grad_norm": 0.69921875,
"learning_rate": 9.333785868941915e-07,
"loss": 0.22819164395332336,
"step": 1734,
"token_acc": 0.9270286047869235
},
{
"epoch": 0.36603375527426163,
"grad_norm": 0.765625,
"learning_rate": 9.332942344659938e-07,
"loss": 0.25955134630203247,
"step": 1735,
"token_acc": 0.9318693693693694
},
{
"epoch": 0.3662447257383966,
"grad_norm": 0.74609375,
"learning_rate": 9.332098324869329e-07,
"loss": 0.24414655566215515,
"step": 1736,
"token_acc": 0.9310754604872252
},
{
"epoch": 0.36645569620253166,
"grad_norm": 0.6953125,
"learning_rate": 9.331253809666611e-07,
"loss": 0.27255892753601074,
"step": 1737,
"token_acc": 0.9247685185185185
},
{
"epoch": 0.36666666666666664,
"grad_norm": 0.671875,
"learning_rate": 9.330408799148362e-07,
"loss": 0.2682799994945526,
"step": 1738,
"token_acc": 0.9308393586922351
},
{
"epoch": 0.3668776371308017,
"grad_norm": 0.90625,
"learning_rate": 9.329563293411211e-07,
"loss": 0.2974281311035156,
"step": 1739,
"token_acc": 0.9198951354500436
},
{
"epoch": 0.3670886075949367,
"grad_norm": 0.73828125,
"learning_rate": 9.328717292551855e-07,
"loss": 0.22290384769439697,
"step": 1740,
"token_acc": 0.9367441860465117
},
{
"epoch": 0.3672995780590717,
"grad_norm": 0.65234375,
"learning_rate": 9.327870796667038e-07,
"loss": 0.22769173979759216,
"step": 1741,
"token_acc": 0.933920704845815
},
{
"epoch": 0.36751054852320675,
"grad_norm": 0.703125,
"learning_rate": 9.327023805853564e-07,
"loss": 0.2551524043083191,
"step": 1742,
"token_acc": 0.9279952195996415
},
{
"epoch": 0.3677215189873418,
"grad_norm": 0.67578125,
"learning_rate": 9.326176320208296e-07,
"loss": 0.21800342202186584,
"step": 1743,
"token_acc": 0.9341448189762797
},
{
"epoch": 0.3679324894514768,
"grad_norm": 0.75,
"learning_rate": 9.325328339828147e-07,
"loss": 0.29383134841918945,
"step": 1744,
"token_acc": 0.9246655031995347
},
{
"epoch": 0.3681434599156118,
"grad_norm": 0.66796875,
"learning_rate": 9.324479864810094e-07,
"loss": 0.24657410383224487,
"step": 1745,
"token_acc": 0.9288770053475935
},
{
"epoch": 0.36835443037974686,
"grad_norm": 0.77734375,
"learning_rate": 9.323630895251167e-07,
"loss": 0.24168427288532257,
"step": 1746,
"token_acc": 0.9318403115871471
},
{
"epoch": 0.36856540084388184,
"grad_norm": 0.78515625,
"learning_rate": 9.322781431248452e-07,
"loss": 0.2549583315849304,
"step": 1747,
"token_acc": 0.9271541950113379
},
{
"epoch": 0.3687763713080169,
"grad_norm": 0.74609375,
"learning_rate": 9.321931472899092e-07,
"loss": 0.2550113797187805,
"step": 1748,
"token_acc": 0.9294592914853946
},
{
"epoch": 0.3689873417721519,
"grad_norm": 0.73828125,
"learning_rate": 9.321081020300288e-07,
"loss": 0.2872239947319031,
"step": 1749,
"token_acc": 0.9191132414619533
},
{
"epoch": 0.3691983122362869,
"grad_norm": 0.6640625,
"learning_rate": 9.320230073549295e-07,
"loss": 0.22348162531852722,
"step": 1750,
"token_acc": 0.9379411764705883
},
{
"epoch": 0.36940928270042195,
"grad_norm": 0.640625,
"learning_rate": 9.319378632743429e-07,
"loss": 0.25937214493751526,
"step": 1751,
"token_acc": 0.9258928571428572
},
{
"epoch": 0.369620253164557,
"grad_norm": 0.921875,
"learning_rate": 9.318526697980056e-07,
"loss": 0.277938574552536,
"step": 1752,
"token_acc": 0.9225372698041509
},
{
"epoch": 0.369831223628692,
"grad_norm": 1.1171875,
"learning_rate": 9.317674269356604e-07,
"loss": 0.3054434061050415,
"step": 1753,
"token_acc": 0.915146249637996
},
{
"epoch": 0.370042194092827,
"grad_norm": 0.75390625,
"learning_rate": 9.316821346970554e-07,
"loss": 0.2515072822570801,
"step": 1754,
"token_acc": 0.9270833333333334
},
{
"epoch": 0.370253164556962,
"grad_norm": 0.8046875,
"learning_rate": 9.315967930919445e-07,
"loss": 0.2604433298110962,
"step": 1755,
"token_acc": 0.9382022471910112
},
{
"epoch": 0.37046413502109704,
"grad_norm": 0.83984375,
"learning_rate": 9.315114021300874e-07,
"loss": 0.30326345562934875,
"step": 1756,
"token_acc": 0.9137931034482759
},
{
"epoch": 0.3706751054852321,
"grad_norm": 0.6796875,
"learning_rate": 9.314259618212492e-07,
"loss": 0.2592795491218567,
"step": 1757,
"token_acc": 0.9306569343065694
},
{
"epoch": 0.37088607594936707,
"grad_norm": 0.6328125,
"learning_rate": 9.313404721752008e-07,
"loss": 0.24161499738693237,
"step": 1758,
"token_acc": 0.9281354051054383
},
{
"epoch": 0.3710970464135021,
"grad_norm": 0.62890625,
"learning_rate": 9.312549332017183e-07,
"loss": 0.2770785093307495,
"step": 1759,
"token_acc": 0.9250313676286073
},
{
"epoch": 0.37130801687763715,
"grad_norm": 0.703125,
"learning_rate": 9.311693449105844e-07,
"loss": 0.2557133734226227,
"step": 1760,
"token_acc": 0.926200451176281
},
{
"epoch": 0.37151898734177213,
"grad_norm": 0.76171875,
"learning_rate": 9.310837073115862e-07,
"loss": 0.3002878427505493,
"step": 1761,
"token_acc": 0.9207248018120046
},
{
"epoch": 0.3717299578059072,
"grad_norm": 0.68359375,
"learning_rate": 9.309980204145176e-07,
"loss": 0.24686521291732788,
"step": 1762,
"token_acc": 0.9330046403712297
},
{
"epoch": 0.3719409282700422,
"grad_norm": 0.76953125,
"learning_rate": 9.309122842291774e-07,
"loss": 0.292415589094162,
"step": 1763,
"token_acc": 0.9288702928870293
},
{
"epoch": 0.3721518987341772,
"grad_norm": 0.7109375,
"learning_rate": 9.308264987653703e-07,
"loss": 0.25645536184310913,
"step": 1764,
"token_acc": 0.9317073170731708
},
{
"epoch": 0.37236286919831224,
"grad_norm": 0.65234375,
"learning_rate": 9.307406640329065e-07,
"loss": 0.22903116047382355,
"step": 1765,
"token_acc": 0.9329577464788732
},
{
"epoch": 0.3725738396624473,
"grad_norm": 0.6875,
"learning_rate": 9.306547800416022e-07,
"loss": 0.2557244300842285,
"step": 1766,
"token_acc": 0.9314949201741655
},
{
"epoch": 0.37278481012658227,
"grad_norm": 0.8125,
"learning_rate": 9.305688468012787e-07,
"loss": 0.19353802502155304,
"step": 1767,
"token_acc": 0.9413680781758957
},
{
"epoch": 0.3729957805907173,
"grad_norm": 1.171875,
"learning_rate": 9.304828643217631e-07,
"loss": 0.31406620144844055,
"step": 1768,
"token_acc": 0.9192907367777438
},
{
"epoch": 0.37320675105485235,
"grad_norm": 0.57421875,
"learning_rate": 9.303968326128884e-07,
"loss": 0.21769124269485474,
"step": 1769,
"token_acc": 0.9381910972497873
},
{
"epoch": 0.37341772151898733,
"grad_norm": 0.79296875,
"learning_rate": 9.303107516844932e-07,
"loss": 0.2673788070678711,
"step": 1770,
"token_acc": 0.9337557603686636
},
{
"epoch": 0.3736286919831224,
"grad_norm": 0.68359375,
"learning_rate": 9.302246215464213e-07,
"loss": 0.24331021308898926,
"step": 1771,
"token_acc": 0.9333902647309992
},
{
"epoch": 0.37383966244725736,
"grad_norm": 0.8828125,
"learning_rate": 9.301384422085227e-07,
"loss": 0.27998149394989014,
"step": 1772,
"token_acc": 0.9219777079165475
},
{
"epoch": 0.3740506329113924,
"grad_norm": 0.73828125,
"learning_rate": 9.300522136806524e-07,
"loss": 0.2714657187461853,
"step": 1773,
"token_acc": 0.9230088495575222
},
{
"epoch": 0.37426160337552744,
"grad_norm": 0.62890625,
"learning_rate": 9.299659359726717e-07,
"loss": 0.23993107676506042,
"step": 1774,
"token_acc": 0.9355118565644881
},
{
"epoch": 0.3744725738396624,
"grad_norm": 0.69921875,
"learning_rate": 9.298796090944468e-07,
"loss": 0.27614468336105347,
"step": 1775,
"token_acc": 0.9254237288135593
},
{
"epoch": 0.37468354430379747,
"grad_norm": 0.67578125,
"learning_rate": 9.297932330558503e-07,
"loss": 0.26223164796829224,
"step": 1776,
"token_acc": 0.9227665706051873
},
{
"epoch": 0.3748945147679325,
"grad_norm": 0.61328125,
"learning_rate": 9.297068078667598e-07,
"loss": 0.21317782998085022,
"step": 1777,
"token_acc": 0.9325064897605999
},
{
"epoch": 0.3751054852320675,
"grad_norm": 0.7421875,
"learning_rate": 9.296203335370587e-07,
"loss": 0.30340367555618286,
"step": 1778,
"token_acc": 0.9200648123143397
},
{
"epoch": 0.37531645569620253,
"grad_norm": 0.734375,
"learning_rate": 9.295338100766364e-07,
"loss": 0.21391162276268005,
"step": 1779,
"token_acc": 0.9396666666666667
},
{
"epoch": 0.3755274261603376,
"grad_norm": 0.625,
"learning_rate": 9.294472374953872e-07,
"loss": 0.2524837851524353,
"step": 1780,
"token_acc": 0.9336639801611903
},
{
"epoch": 0.37573839662447256,
"grad_norm": 0.78515625,
"learning_rate": 9.293606158032117e-07,
"loss": 0.24630481004714966,
"step": 1781,
"token_acc": 0.931282722513089
},
{
"epoch": 0.3759493670886076,
"grad_norm": 0.7109375,
"learning_rate": 9.292739450100155e-07,
"loss": 0.2903074026107788,
"step": 1782,
"token_acc": 0.9166461765429064
},
{
"epoch": 0.37616033755274264,
"grad_norm": 0.65625,
"learning_rate": 9.291872251257107e-07,
"loss": 0.2113291174173355,
"step": 1783,
"token_acc": 0.9350509930220076
},
{
"epoch": 0.3763713080168776,
"grad_norm": 0.62109375,
"learning_rate": 9.291004561602138e-07,
"loss": 0.2671849727630615,
"step": 1784,
"token_acc": 0.9261083743842364
},
{
"epoch": 0.37658227848101267,
"grad_norm": 1.1875,
"learning_rate": 9.290136381234479e-07,
"loss": 0.34464550018310547,
"step": 1785,
"token_acc": 0.9090909090909091
},
{
"epoch": 0.37679324894514765,
"grad_norm": 0.6484375,
"learning_rate": 9.289267710253415e-07,
"loss": 0.24422526359558105,
"step": 1786,
"token_acc": 0.9316384180790961
},
{
"epoch": 0.3770042194092827,
"grad_norm": 0.6484375,
"learning_rate": 9.288398548758283e-07,
"loss": 0.2332957684993744,
"step": 1787,
"token_acc": 0.9384615384615385
},
{
"epoch": 0.37721518987341773,
"grad_norm": 0.76171875,
"learning_rate": 9.28752889684848e-07,
"loss": 0.32483792304992676,
"step": 1788,
"token_acc": 0.9187627464309993
},
{
"epoch": 0.3774261603375527,
"grad_norm": 0.671875,
"learning_rate": 9.286658754623458e-07,
"loss": 0.2529968023300171,
"step": 1789,
"token_acc": 0.9290078556881001
},
{
"epoch": 0.37763713080168776,
"grad_norm": 0.6953125,
"learning_rate": 9.285788122182728e-07,
"loss": 0.21313020586967468,
"step": 1790,
"token_acc": 0.9386806160999709
},
{
"epoch": 0.3778481012658228,
"grad_norm": 0.79296875,
"learning_rate": 9.284916999625849e-07,
"loss": 0.28534191846847534,
"step": 1791,
"token_acc": 0.918232044198895
},
{
"epoch": 0.3780590717299578,
"grad_norm": 0.7578125,
"learning_rate": 9.284045387052444e-07,
"loss": 0.2752482295036316,
"step": 1792,
"token_acc": 0.9267692307692308
},
{
"epoch": 0.3782700421940928,
"grad_norm": 0.703125,
"learning_rate": 9.283173284562189e-07,
"loss": 0.23181575536727905,
"step": 1793,
"token_acc": 0.9338775510204081
},
{
"epoch": 0.37848101265822787,
"grad_norm": 0.69921875,
"learning_rate": 9.282300692254818e-07,
"loss": 0.2837159037590027,
"step": 1794,
"token_acc": 0.9179869524697111
},
{
"epoch": 0.37869198312236285,
"grad_norm": 0.74609375,
"learning_rate": 9.281427610230117e-07,
"loss": 0.2748425602912903,
"step": 1795,
"token_acc": 0.9209206255532606
},
{
"epoch": 0.3789029535864979,
"grad_norm": 0.73828125,
"learning_rate": 9.280554038587931e-07,
"loss": 0.27525874972343445,
"step": 1796,
"token_acc": 0.9241778319123021
},
{
"epoch": 0.37911392405063293,
"grad_norm": 0.765625,
"learning_rate": 9.27967997742816e-07,
"loss": 0.24282464385032654,
"step": 1797,
"token_acc": 0.9311178247734139
},
{
"epoch": 0.3793248945147679,
"grad_norm": 0.80078125,
"learning_rate": 9.278805426850761e-07,
"loss": 0.2466842383146286,
"step": 1798,
"token_acc": 0.9305689488910318
},
{
"epoch": 0.37953586497890296,
"grad_norm": 0.79296875,
"learning_rate": 9.277930386955745e-07,
"loss": 0.28594300150871277,
"step": 1799,
"token_acc": 0.9248895434462445
},
{
"epoch": 0.379746835443038,
"grad_norm": 0.73828125,
"learning_rate": 9.277054857843183e-07,
"loss": 0.2887975573539734,
"step": 1800,
"token_acc": 0.9212067955477445
},
{
"epoch": 0.379746835443038,
"eval_loss": 0.43369510769844055,
"eval_runtime": 245.8672,
"eval_samples_per_second": 137.086,
"eval_steps_per_second": 2.143,
"eval_token_acc": 0.899108184510455,
"step": 1800
},
{
"epoch": 0.379957805907173,
"grad_norm": 0.7421875,
"learning_rate": 9.276178839613196e-07,
"loss": 0.2607673108577728,
"step": 1801,
"token_acc": 0.928168130489335
},
{
"epoch": 0.380168776371308,
"grad_norm": 0.6640625,
"learning_rate": 9.275302332365965e-07,
"loss": 0.2245202362537384,
"step": 1802,
"token_acc": 0.9333936106088004
},
{
"epoch": 0.380379746835443,
"grad_norm": 0.66015625,
"learning_rate": 9.274425336201728e-07,
"loss": 0.23835879564285278,
"step": 1803,
"token_acc": 0.9261068702290076
},
{
"epoch": 0.38059071729957805,
"grad_norm": 0.76171875,
"learning_rate": 9.273547851220775e-07,
"loss": 0.24824509024620056,
"step": 1804,
"token_acc": 0.9263188918361333
},
{
"epoch": 0.3808016877637131,
"grad_norm": 0.83984375,
"learning_rate": 9.272669877523454e-07,
"loss": 0.23923823237419128,
"step": 1805,
"token_acc": 0.9348637015781922
},
{
"epoch": 0.3810126582278481,
"grad_norm": 0.65234375,
"learning_rate": 9.271791415210168e-07,
"loss": 0.26697322726249695,
"step": 1806,
"token_acc": 0.9220437956204379
},
{
"epoch": 0.3812236286919831,
"grad_norm": 0.7109375,
"learning_rate": 9.270912464381377e-07,
"loss": 0.24041783809661865,
"step": 1807,
"token_acc": 0.93335325762104
},
{
"epoch": 0.38143459915611816,
"grad_norm": 0.73046875,
"learning_rate": 9.270033025137598e-07,
"loss": 0.27658283710479736,
"step": 1808,
"token_acc": 0.9247249565720903
},
{
"epoch": 0.38164556962025314,
"grad_norm": 0.703125,
"learning_rate": 9.269153097579401e-07,
"loss": 0.22341014444828033,
"step": 1809,
"token_acc": 0.9340346886551038
},
{
"epoch": 0.3818565400843882,
"grad_norm": 0.65625,
"learning_rate": 9.268272681807415e-07,
"loss": 0.2665144205093384,
"step": 1810,
"token_acc": 0.9275118947663028
},
{
"epoch": 0.3820675105485232,
"grad_norm": 0.96875,
"learning_rate": 9.26739177792232e-07,
"loss": 0.25565192103385925,
"step": 1811,
"token_acc": 0.927958307786634
},
{
"epoch": 0.3822784810126582,
"grad_norm": 0.79296875,
"learning_rate": 9.266510386024858e-07,
"loss": 0.27763834595680237,
"step": 1812,
"token_acc": 0.9273282442748092
},
{
"epoch": 0.38248945147679325,
"grad_norm": 0.96484375,
"learning_rate": 9.265628506215819e-07,
"loss": 0.2819390594959259,
"step": 1813,
"token_acc": 0.9262629432067775
},
{
"epoch": 0.3827004219409283,
"grad_norm": 0.7890625,
"learning_rate": 9.264746138596058e-07,
"loss": 0.29503488540649414,
"step": 1814,
"token_acc": 0.920820189274448
},
{
"epoch": 0.3829113924050633,
"grad_norm": 0.83984375,
"learning_rate": 9.26386328326648e-07,
"loss": 0.2595806121826172,
"step": 1815,
"token_acc": 0.9269662921348315
},
{
"epoch": 0.3831223628691983,
"grad_norm": 1.03125,
"learning_rate": 9.262979940328046e-07,
"loss": 0.24287866055965424,
"step": 1816,
"token_acc": 0.9247341913822048
},
{
"epoch": 0.38333333333333336,
"grad_norm": 0.75,
"learning_rate": 9.262096109881774e-07,
"loss": 0.2156772017478943,
"step": 1817,
"token_acc": 0.9401315789473684
},
{
"epoch": 0.38354430379746834,
"grad_norm": 0.8359375,
"learning_rate": 9.261211792028738e-07,
"loss": 0.2951451539993286,
"step": 1818,
"token_acc": 0.9252308608876973
},
{
"epoch": 0.3837552742616034,
"grad_norm": 0.72265625,
"learning_rate": 9.260326986870066e-07,
"loss": 0.27076175808906555,
"step": 1819,
"token_acc": 0.9258073901658423
},
{
"epoch": 0.38396624472573837,
"grad_norm": 0.6875,
"learning_rate": 9.259441694506944e-07,
"loss": 0.22185054421424866,
"step": 1820,
"token_acc": 0.9377751687701791
},
{
"epoch": 0.3841772151898734,
"grad_norm": 1.0390625,
"learning_rate": 9.258555915040614e-07,
"loss": 0.22383590042591095,
"step": 1821,
"token_acc": 0.9407185628742515
},
{
"epoch": 0.38438818565400845,
"grad_norm": 1.1171875,
"learning_rate": 9.257669648572371e-07,
"loss": 0.2371908277273178,
"step": 1822,
"token_acc": 0.938851142680667
},
{
"epoch": 0.38459915611814344,
"grad_norm": 0.671875,
"learning_rate": 9.256782895203567e-07,
"loss": 0.26829174160957336,
"step": 1823,
"token_acc": 0.9228368794326242
},
{
"epoch": 0.3848101265822785,
"grad_norm": 0.79296875,
"learning_rate": 9.255895655035608e-07,
"loss": 0.2719237506389618,
"step": 1824,
"token_acc": 0.9227082085080887
},
{
"epoch": 0.3850210970464135,
"grad_norm": 0.82421875,
"learning_rate": 9.255007928169961e-07,
"loss": 0.2967223823070526,
"step": 1825,
"token_acc": 0.9157397107897665
},
{
"epoch": 0.3852320675105485,
"grad_norm": 0.75390625,
"learning_rate": 9.254119714708142e-07,
"loss": 0.24009615182876587,
"step": 1826,
"token_acc": 0.9363372093023256
},
{
"epoch": 0.38544303797468354,
"grad_norm": 0.7109375,
"learning_rate": 9.253231014751729e-07,
"loss": 0.307847797870636,
"step": 1827,
"token_acc": 0.9175531914893617
},
{
"epoch": 0.3856540084388186,
"grad_norm": 1.1328125,
"learning_rate": 9.252341828402349e-07,
"loss": 0.23958738148212433,
"step": 1828,
"token_acc": 0.9360444907890163
},
{
"epoch": 0.38586497890295357,
"grad_norm": 1.078125,
"learning_rate": 9.25145215576169e-07,
"loss": 0.2339942902326584,
"step": 1829,
"token_acc": 0.9394484412470024
},
{
"epoch": 0.3860759493670886,
"grad_norm": 0.66796875,
"learning_rate": 9.250561996931492e-07,
"loss": 0.225580632686615,
"step": 1830,
"token_acc": 0.9307692307692308
},
{
"epoch": 0.38628691983122365,
"grad_norm": 0.78125,
"learning_rate": 9.249671352013553e-07,
"loss": 0.26309460401535034,
"step": 1831,
"token_acc": 0.9265518362040949
},
{
"epoch": 0.38649789029535864,
"grad_norm": 0.6640625,
"learning_rate": 9.248780221109728e-07,
"loss": 0.24800172448158264,
"step": 1832,
"token_acc": 0.9343544857768052
},
{
"epoch": 0.3867088607594937,
"grad_norm": 0.76171875,
"learning_rate": 9.247888604321923e-07,
"loss": 0.25037574768066406,
"step": 1833,
"token_acc": 0.9315494710640946
},
{
"epoch": 0.3869198312236287,
"grad_norm": 0.68359375,
"learning_rate": 9.2469965017521e-07,
"loss": 0.28050029277801514,
"step": 1834,
"token_acc": 0.9182209469153515
},
{
"epoch": 0.3871308016877637,
"grad_norm": 1.171875,
"learning_rate": 9.246103913502282e-07,
"loss": 0.26002001762390137,
"step": 1835,
"token_acc": 0.9284110050533408
},
{
"epoch": 0.38734177215189874,
"grad_norm": 0.7734375,
"learning_rate": 9.245210839674543e-07,
"loss": 0.2603076696395874,
"step": 1836,
"token_acc": 0.9262130347401665
},
{
"epoch": 0.38755274261603373,
"grad_norm": 0.55859375,
"learning_rate": 9.244317280371013e-07,
"loss": 0.21622005105018616,
"step": 1837,
"token_acc": 0.9399153737658674
},
{
"epoch": 0.38776371308016877,
"grad_norm": 0.72265625,
"learning_rate": 9.243423235693879e-07,
"loss": 0.2640265226364136,
"step": 1838,
"token_acc": 0.9272887842213718
},
{
"epoch": 0.3879746835443038,
"grad_norm": 0.7890625,
"learning_rate": 9.242528705745381e-07,
"loss": 0.2691488265991211,
"step": 1839,
"token_acc": 0.9298516687268232
},
{
"epoch": 0.3881856540084388,
"grad_norm": 0.7421875,
"learning_rate": 9.241633690627818e-07,
"loss": 0.24178001284599304,
"step": 1840,
"token_acc": 0.9338363319791062
},
{
"epoch": 0.38839662447257384,
"grad_norm": 0.66796875,
"learning_rate": 9.240738190443541e-07,
"loss": 0.25345578789711,
"step": 1841,
"token_acc": 0.9266702878870179
},
{
"epoch": 0.3886075949367089,
"grad_norm": 0.62109375,
"learning_rate": 9.239842205294959e-07,
"loss": 0.2360486388206482,
"step": 1842,
"token_acc": 0.9328785811732606
},
{
"epoch": 0.38881856540084386,
"grad_norm": 0.73828125,
"learning_rate": 9.238945735284534e-07,
"loss": 0.2449038028717041,
"step": 1843,
"token_acc": 0.9268953068592057
},
{
"epoch": 0.3890295358649789,
"grad_norm": 0.62890625,
"learning_rate": 9.238048780514787e-07,
"loss": 0.26960039138793945,
"step": 1844,
"token_acc": 0.9270607375271149
},
{
"epoch": 0.38924050632911394,
"grad_norm": 0.83203125,
"learning_rate": 9.237151341088292e-07,
"loss": 0.2575567662715912,
"step": 1845,
"token_acc": 0.9342750072066878
},
{
"epoch": 0.38945147679324893,
"grad_norm": 0.6953125,
"learning_rate": 9.236253417107676e-07,
"loss": 0.2754535675048828,
"step": 1846,
"token_acc": 0.9288154897494305
},
{
"epoch": 0.38966244725738397,
"grad_norm": 0.78515625,
"learning_rate": 9.23535500867563e-07,
"loss": 0.2439945489168167,
"step": 1847,
"token_acc": 0.9362714013950539
},
{
"epoch": 0.389873417721519,
"grad_norm": 0.6640625,
"learning_rate": 9.234456115894888e-07,
"loss": 0.2606812119483948,
"step": 1848,
"token_acc": 0.9236545682102628
},
{
"epoch": 0.390084388185654,
"grad_norm": 0.9140625,
"learning_rate": 9.233556738868249e-07,
"loss": 0.27625495195388794,
"step": 1849,
"token_acc": 0.9225721784776902
},
{
"epoch": 0.39029535864978904,
"grad_norm": 0.6640625,
"learning_rate": 9.232656877698566e-07,
"loss": 0.2793102264404297,
"step": 1850,
"token_acc": 0.9202629322663618
},
{
"epoch": 0.3905063291139241,
"grad_norm": 0.6875,
"learning_rate": 9.231756532488743e-07,
"loss": 0.2455964833498001,
"step": 1851,
"token_acc": 0.9258962011771
},
{
"epoch": 0.39071729957805906,
"grad_norm": 0.5078125,
"learning_rate": 9.230855703341743e-07,
"loss": 0.19969472289085388,
"step": 1852,
"token_acc": 0.9432416617905208
},
{
"epoch": 0.3909282700421941,
"grad_norm": 0.7265625,
"learning_rate": 9.229954390360584e-07,
"loss": 0.28671932220458984,
"step": 1853,
"token_acc": 0.9230058515552818
},
{
"epoch": 0.3911392405063291,
"grad_norm": 0.75,
"learning_rate": 9.229052593648339e-07,
"loss": 0.26401039958000183,
"step": 1854,
"token_acc": 0.9225410977988298
},
{
"epoch": 0.39135021097046413,
"grad_norm": 0.7734375,
"learning_rate": 9.228150313308134e-07,
"loss": 0.233763188123703,
"step": 1855,
"token_acc": 0.9356511131442513
},
{
"epoch": 0.39156118143459917,
"grad_norm": 0.62109375,
"learning_rate": 9.227247549443156e-07,
"loss": 0.19187316298484802,
"step": 1856,
"token_acc": 0.9460545193687231
},
{
"epoch": 0.39177215189873416,
"grad_norm": 0.82421875,
"learning_rate": 9.226344302156641e-07,
"loss": 0.24813725054264069,
"step": 1857,
"token_acc": 0.9283582089552239
},
{
"epoch": 0.3919831223628692,
"grad_norm": 0.7421875,
"learning_rate": 9.225440571551882e-07,
"loss": 0.28099197149276733,
"step": 1858,
"token_acc": 0.9286861548345647
},
{
"epoch": 0.39219409282700424,
"grad_norm": 0.65234375,
"learning_rate": 9.224536357732231e-07,
"loss": 0.23781321942806244,
"step": 1859,
"token_acc": 0.9274464239607539
},
{
"epoch": 0.3924050632911392,
"grad_norm": 0.68359375,
"learning_rate": 9.223631660801093e-07,
"loss": 0.3117380440235138,
"step": 1860,
"token_acc": 0.9198289684660609
},
{
"epoch": 0.39261603375527426,
"grad_norm": 0.81640625,
"learning_rate": 9.222726480861922e-07,
"loss": 0.2948303818702698,
"step": 1861,
"token_acc": 0.9180280882774434
},
{
"epoch": 0.3928270042194093,
"grad_norm": 0.58203125,
"learning_rate": 9.22182081801824e-07,
"loss": 0.2569146454334259,
"step": 1862,
"token_acc": 0.9329383248047401
},
{
"epoch": 0.3930379746835443,
"grad_norm": 0.94921875,
"learning_rate": 9.220914672373614e-07,
"loss": 0.32216933369636536,
"step": 1863,
"token_acc": 0.9064083457526081
},
{
"epoch": 0.39324894514767933,
"grad_norm": 0.9453125,
"learning_rate": 9.220008044031669e-07,
"loss": 0.27637138962745667,
"step": 1864,
"token_acc": 0.9210816777041942
},
{
"epoch": 0.39345991561181437,
"grad_norm": 0.81640625,
"learning_rate": 9.219100933096086e-07,
"loss": 0.26345184445381165,
"step": 1865,
"token_acc": 0.9283387622149837
},
{
"epoch": 0.39367088607594936,
"grad_norm": 0.8671875,
"learning_rate": 9.218193339670601e-07,
"loss": 0.3173444867134094,
"step": 1866,
"token_acc": 0.9129682997118156
},
{
"epoch": 0.3938818565400844,
"grad_norm": 0.625,
"learning_rate": 9.217285263859007e-07,
"loss": 0.22335606813430786,
"step": 1867,
"token_acc": 0.935064935064935
},
{
"epoch": 0.39409282700421944,
"grad_norm": 0.7265625,
"learning_rate": 9.216376705765147e-07,
"loss": 0.2642119526863098,
"step": 1868,
"token_acc": 0.9246898995865328
},
{
"epoch": 0.3943037974683544,
"grad_norm": 0.6484375,
"learning_rate": 9.215467665492923e-07,
"loss": 0.23183171451091766,
"step": 1869,
"token_acc": 0.9344312290332418
},
{
"epoch": 0.39451476793248946,
"grad_norm": 0.73046875,
"learning_rate": 9.214558143146292e-07,
"loss": 0.272377073764801,
"step": 1870,
"token_acc": 0.9305785123966942
},
{
"epoch": 0.39472573839662445,
"grad_norm": 0.8359375,
"learning_rate": 9.213648138829266e-07,
"loss": 0.26784923672676086,
"step": 1871,
"token_acc": 0.928849902534113
},
{
"epoch": 0.3949367088607595,
"grad_norm": 0.890625,
"learning_rate": 9.212737652645913e-07,
"loss": 0.23856118321418762,
"step": 1872,
"token_acc": 0.9337503554165482
},
{
"epoch": 0.39514767932489453,
"grad_norm": 0.6875,
"learning_rate": 9.211826684700351e-07,
"loss": 0.2687574625015259,
"step": 1873,
"token_acc": 0.9265722752716496
},
{
"epoch": 0.3953586497890295,
"grad_norm": 0.8046875,
"learning_rate": 9.210915235096759e-07,
"loss": 0.2604142427444458,
"step": 1874,
"token_acc": 0.9201069201069201
},
{
"epoch": 0.39556962025316456,
"grad_norm": 0.59765625,
"learning_rate": 9.210003303939371e-07,
"loss": 0.21657685935497284,
"step": 1875,
"token_acc": 0.9314020224104946
},
{
"epoch": 0.3957805907172996,
"grad_norm": 0.6875,
"learning_rate": 9.20909089133247e-07,
"loss": 0.24224431812763214,
"step": 1876,
"token_acc": 0.9318985849056604
},
{
"epoch": 0.3959915611814346,
"grad_norm": 0.765625,
"learning_rate": 9.208177997380399e-07,
"loss": 0.29792293906211853,
"step": 1877,
"token_acc": 0.9213075060532687
},
{
"epoch": 0.3962025316455696,
"grad_norm": 0.67578125,
"learning_rate": 9.20726462218756e-07,
"loss": 0.2578182816505432,
"step": 1878,
"token_acc": 0.924163783160323
},
{
"epoch": 0.39641350210970466,
"grad_norm": 0.6640625,
"learning_rate": 9.2063507658584e-07,
"loss": 0.23330868780612946,
"step": 1879,
"token_acc": 0.9328333876752527
},
{
"epoch": 0.39662447257383965,
"grad_norm": 0.890625,
"learning_rate": 9.205436428497426e-07,
"loss": 0.28994351625442505,
"step": 1880,
"token_acc": 0.9161849710982659
},
{
"epoch": 0.3968354430379747,
"grad_norm": 0.75390625,
"learning_rate": 9.204521610209202e-07,
"loss": 0.2730024456977844,
"step": 1881,
"token_acc": 0.9229598051157125
},
{
"epoch": 0.39704641350210973,
"grad_norm": 0.78515625,
"learning_rate": 9.203606311098347e-07,
"loss": 0.24187864363193512,
"step": 1882,
"token_acc": 0.9261853448275862
},
{
"epoch": 0.3972573839662447,
"grad_norm": 1.8984375,
"learning_rate": 9.202690531269531e-07,
"loss": 0.2717437148094177,
"step": 1883,
"token_acc": 0.9216603332358959
},
{
"epoch": 0.39746835443037976,
"grad_norm": 0.76171875,
"learning_rate": 9.201774270827481e-07,
"loss": 0.281715989112854,
"step": 1884,
"token_acc": 0.9242509892594686
},
{
"epoch": 0.39767932489451474,
"grad_norm": 0.7890625,
"learning_rate": 9.200857529876978e-07,
"loss": 0.2318015843629837,
"step": 1885,
"token_acc": 0.9304397815464214
},
{
"epoch": 0.3978902953586498,
"grad_norm": 0.65234375,
"learning_rate": 9.199940308522862e-07,
"loss": 0.27065324783325195,
"step": 1886,
"token_acc": 0.92619825708061
},
{
"epoch": 0.3981012658227848,
"grad_norm": 0.83984375,
"learning_rate": 9.199022606870024e-07,
"loss": 0.2438652217388153,
"step": 1887,
"token_acc": 0.9273247496423462
},
{
"epoch": 0.3983122362869198,
"grad_norm": 0.703125,
"learning_rate": 9.198104425023411e-07,
"loss": 0.24134406447410583,
"step": 1888,
"token_acc": 0.9285099052540913
},
{
"epoch": 0.39852320675105485,
"grad_norm": 0.9765625,
"learning_rate": 9.197185763088024e-07,
"loss": 0.24476462602615356,
"step": 1889,
"token_acc": 0.92578125
},
{
"epoch": 0.3987341772151899,
"grad_norm": 0.70703125,
"learning_rate": 9.19626662116892e-07,
"loss": 0.24153611063957214,
"step": 1890,
"token_acc": 0.9318809450751764
},
{
"epoch": 0.3989451476793249,
"grad_norm": 0.671875,
"learning_rate": 9.195346999371211e-07,
"loss": 0.271758496761322,
"step": 1891,
"token_acc": 0.9209603452926896
},
{
"epoch": 0.3991561181434599,
"grad_norm": 0.6328125,
"learning_rate": 9.194426897800064e-07,
"loss": 0.26978251338005066,
"step": 1892,
"token_acc": 0.9290673105946272
},
{
"epoch": 0.39936708860759496,
"grad_norm": 0.76171875,
"learning_rate": 9.1935063165607e-07,
"loss": 0.28870701789855957,
"step": 1893,
"token_acc": 0.9254198690577854
},
{
"epoch": 0.39957805907172994,
"grad_norm": 0.8125,
"learning_rate": 9.192585255758394e-07,
"loss": 0.2580242455005646,
"step": 1894,
"token_acc": 0.9339540296770439
},
{
"epoch": 0.399789029535865,
"grad_norm": 1.375,
"learning_rate": 9.191663715498478e-07,
"loss": 0.21934179961681366,
"step": 1895,
"token_acc": 0.9387427838357921
},
{
"epoch": 0.4,
"grad_norm": 0.80859375,
"learning_rate": 9.19074169588634e-07,
"loss": 0.27658939361572266,
"step": 1896,
"token_acc": 0.9278290993071594
},
{
"epoch": 0.400210970464135,
"grad_norm": 0.68359375,
"learning_rate": 9.189819197027418e-07,
"loss": 0.22759562730789185,
"step": 1897,
"token_acc": 0.9377700950734659
},
{
"epoch": 0.40042194092827005,
"grad_norm": 0.70703125,
"learning_rate": 9.188896219027209e-07,
"loss": 0.2513328790664673,
"step": 1898,
"token_acc": 0.9277489925158319
},
{
"epoch": 0.4006329113924051,
"grad_norm": 0.62890625,
"learning_rate": 9.187972761991263e-07,
"loss": 0.22657060623168945,
"step": 1899,
"token_acc": 0.935752688172043
},
{
"epoch": 0.4008438818565401,
"grad_norm": 1.0390625,
"learning_rate": 9.187048826025184e-07,
"loss": 0.2751932442188263,
"step": 1900,
"token_acc": 0.9230091096091684
},
{
"epoch": 0.4010548523206751,
"grad_norm": 0.7890625,
"learning_rate": 9.186124411234632e-07,
"loss": 0.27560490369796753,
"step": 1901,
"token_acc": 0.9197261978842564
},
{
"epoch": 0.4012658227848101,
"grad_norm": 0.7265625,
"learning_rate": 9.185199517725324e-07,
"loss": 0.24792620539665222,
"step": 1902,
"token_acc": 0.9330645161290323
},
{
"epoch": 0.40147679324894514,
"grad_norm": 0.87890625,
"learning_rate": 9.184274145603029e-07,
"loss": 0.3180472254753113,
"step": 1903,
"token_acc": 0.9188658057271196
},
{
"epoch": 0.4016877637130802,
"grad_norm": 0.71875,
"learning_rate": 9.183348294973568e-07,
"loss": 0.24376609921455383,
"step": 1904,
"token_acc": 0.9286992840095465
},
{
"epoch": 0.40189873417721517,
"grad_norm": 0.76171875,
"learning_rate": 9.182421965942821e-07,
"loss": 0.28026020526885986,
"step": 1905,
"token_acc": 0.9222288438617402
},
{
"epoch": 0.4021097046413502,
"grad_norm": 3.65625,
"learning_rate": 9.181495158616725e-07,
"loss": 0.25579574704170227,
"step": 1906,
"token_acc": 0.9286159600997507
},
{
"epoch": 0.40232067510548525,
"grad_norm": 0.6328125,
"learning_rate": 9.180567873101265e-07,
"loss": 0.23290948569774628,
"step": 1907,
"token_acc": 0.9333922782198645
},
{
"epoch": 0.40253164556962023,
"grad_norm": 0.5859375,
"learning_rate": 9.179640109502484e-07,
"loss": 0.2401217222213745,
"step": 1908,
"token_acc": 0.9341425619834711
},
{
"epoch": 0.4027426160337553,
"grad_norm": 0.8203125,
"learning_rate": 9.17871186792648e-07,
"loss": 0.26812300086021423,
"step": 1909,
"token_acc": 0.9327782917052112
},
{
"epoch": 0.4029535864978903,
"grad_norm": 0.7578125,
"learning_rate": 9.177783148479408e-07,
"loss": 0.2858043909072876,
"step": 1910,
"token_acc": 0.9216884198833657
},
{
"epoch": 0.4031645569620253,
"grad_norm": 0.8046875,
"learning_rate": 9.176853951267469e-07,
"loss": 0.23564667999744415,
"step": 1911,
"token_acc": 0.9349294045426643
},
{
"epoch": 0.40337552742616034,
"grad_norm": 0.75,
"learning_rate": 9.175924276396931e-07,
"loss": 0.24908998608589172,
"step": 1912,
"token_acc": 0.9320012890750886
},
{
"epoch": 0.4035864978902954,
"grad_norm": 0.796875,
"learning_rate": 9.174994123974105e-07,
"loss": 0.25956442952156067,
"step": 1913,
"token_acc": 0.9315411065958112
},
{
"epoch": 0.40379746835443037,
"grad_norm": 0.546875,
"learning_rate": 9.174063494105366e-07,
"loss": 0.1772785186767578,
"step": 1914,
"token_acc": 0.9457151570600792
},
{
"epoch": 0.4040084388185654,
"grad_norm": 1.078125,
"learning_rate": 9.173132386897136e-07,
"loss": 0.26592162251472473,
"step": 1915,
"token_acc": 0.9237465181058496
},
{
"epoch": 0.40421940928270045,
"grad_norm": 0.65234375,
"learning_rate": 9.172200802455898e-07,
"loss": 0.25685757398605347,
"step": 1916,
"token_acc": 0.9311940759024993
},
{
"epoch": 0.40443037974683543,
"grad_norm": 0.65234375,
"learning_rate": 9.171268740888182e-07,
"loss": 0.250326544046402,
"step": 1917,
"token_acc": 0.9311023622047244
},
{
"epoch": 0.4046413502109705,
"grad_norm": 1.1875,
"learning_rate": 9.170336202300583e-07,
"loss": 0.2700883150100708,
"step": 1918,
"token_acc": 0.9242227979274611
},
{
"epoch": 0.40485232067510546,
"grad_norm": 0.74609375,
"learning_rate": 9.169403186799741e-07,
"loss": 0.2698970437049866,
"step": 1919,
"token_acc": 0.9252514270182115
},
{
"epoch": 0.4050632911392405,
"grad_norm": 0.7421875,
"learning_rate": 9.168469694492355e-07,
"loss": 0.28905174136161804,
"step": 1920,
"token_acc": 0.9209056159952955
},
{
"epoch": 0.40527426160337554,
"grad_norm": 0.73046875,
"learning_rate": 9.167535725485178e-07,
"loss": 0.23809051513671875,
"step": 1921,
"token_acc": 0.9302013422818792
},
{
"epoch": 0.4054852320675105,
"grad_norm": 0.78515625,
"learning_rate": 9.166601279885017e-07,
"loss": 0.23999746143817902,
"step": 1922,
"token_acc": 0.935251798561151
},
{
"epoch": 0.40569620253164557,
"grad_norm": 0.6796875,
"learning_rate": 9.165666357798733e-07,
"loss": 0.23197510838508606,
"step": 1923,
"token_acc": 0.9327980969372583
},
{
"epoch": 0.4059071729957806,
"grad_norm": 0.86328125,
"learning_rate": 9.164730959333245e-07,
"loss": 0.29541683197021484,
"step": 1924,
"token_acc": 0.9220892909250072
},
{
"epoch": 0.4061181434599156,
"grad_norm": 0.60546875,
"learning_rate": 9.16379508459552e-07,
"loss": 0.27687525749206543,
"step": 1925,
"token_acc": 0.9200790737079921
},
{
"epoch": 0.40632911392405063,
"grad_norm": 1.2109375,
"learning_rate": 9.162858733692585e-07,
"loss": 0.3053325116634369,
"step": 1926,
"token_acc": 0.916566265060241
},
{
"epoch": 0.4065400843881857,
"grad_norm": 0.69140625,
"learning_rate": 9.16192190673152e-07,
"loss": 0.2050454467535019,
"step": 1927,
"token_acc": 0.9362162162162162
},
{
"epoch": 0.40675105485232066,
"grad_norm": 0.7734375,
"learning_rate": 9.160984603819459e-07,
"loss": 0.2896081805229187,
"step": 1928,
"token_acc": 0.9183372641509434
},
{
"epoch": 0.4069620253164557,
"grad_norm": 0.75390625,
"learning_rate": 9.160046825063591e-07,
"loss": 0.30498236417770386,
"step": 1929,
"token_acc": 0.925463948889565
},
{
"epoch": 0.40717299578059074,
"grad_norm": 0.7109375,
"learning_rate": 9.159108570571157e-07,
"loss": 0.26336434483528137,
"step": 1930,
"token_acc": 0.9244799088059276
},
{
"epoch": 0.4073839662447257,
"grad_norm": 0.78515625,
"learning_rate": 9.158169840449457e-07,
"loss": 0.2476760447025299,
"step": 1931,
"token_acc": 0.9316361556064073
},
{
"epoch": 0.40759493670886077,
"grad_norm": 0.8359375,
"learning_rate": 9.157230634805839e-07,
"loss": 0.23430106043815613,
"step": 1932,
"token_acc": 0.9355336212214682
},
{
"epoch": 0.4078059071729958,
"grad_norm": 0.7421875,
"learning_rate": 9.156290953747714e-07,
"loss": 0.2746797502040863,
"step": 1933,
"token_acc": 0.9278319123020706
},
{
"epoch": 0.4080168776371308,
"grad_norm": 0.76171875,
"learning_rate": 9.155350797382537e-07,
"loss": 0.2650185525417328,
"step": 1934,
"token_acc": 0.9247412982126059
},
{
"epoch": 0.40822784810126583,
"grad_norm": 0.7421875,
"learning_rate": 9.154410165817828e-07,
"loss": 0.30540603399276733,
"step": 1935,
"token_acc": 0.9162790697674419
},
{
"epoch": 0.4084388185654008,
"grad_norm": 0.78515625,
"learning_rate": 9.153469059161153e-07,
"loss": 0.27233344316482544,
"step": 1936,
"token_acc": 0.9290521592821088
},
{
"epoch": 0.40864978902953586,
"grad_norm": 2.515625,
"learning_rate": 9.152527477520137e-07,
"loss": 0.3095840811729431,
"step": 1937,
"token_acc": 0.9161658653846154
},
{
"epoch": 0.4088607594936709,
"grad_norm": 1.390625,
"learning_rate": 9.151585421002457e-07,
"loss": 0.31651750206947327,
"step": 1938,
"token_acc": 0.9165916591659166
},
{
"epoch": 0.4090717299578059,
"grad_norm": 0.75,
"learning_rate": 9.150642889715845e-07,
"loss": 0.26749229431152344,
"step": 1939,
"token_acc": 0.9320086929524992
},
{
"epoch": 0.4092827004219409,
"grad_norm": 0.6484375,
"learning_rate": 9.149699883768088e-07,
"loss": 0.21710315346717834,
"step": 1940,
"token_acc": 0.9348079161816065
},
{
"epoch": 0.40949367088607597,
"grad_norm": 0.73828125,
"learning_rate": 9.148756403267026e-07,
"loss": 0.34100958704948425,
"step": 1941,
"token_acc": 0.9185356200527705
},
{
"epoch": 0.40970464135021095,
"grad_norm": 0.7734375,
"learning_rate": 9.147812448320554e-07,
"loss": 0.25636026263237,
"step": 1942,
"token_acc": 0.9277210884353742
},
{
"epoch": 0.409915611814346,
"grad_norm": 0.75390625,
"learning_rate": 9.146868019036623e-07,
"loss": 0.31487444043159485,
"step": 1943,
"token_acc": 0.916615194564546
},
{
"epoch": 0.41012658227848103,
"grad_norm": 0.83984375,
"learning_rate": 9.145923115523236e-07,
"loss": 0.2474036067724228,
"step": 1944,
"token_acc": 0.932229377491567
},
{
"epoch": 0.410337552742616,
"grad_norm": 0.8046875,
"learning_rate": 9.144977737888448e-07,
"loss": 0.23942765593528748,
"step": 1945,
"token_acc": 0.935179358086847
},
{
"epoch": 0.41054852320675106,
"grad_norm": 0.71875,
"learning_rate": 9.144031886240373e-07,
"loss": 0.321605384349823,
"step": 1946,
"token_acc": 0.9218192627824019
},
{
"epoch": 0.4107594936708861,
"grad_norm": 0.63671875,
"learning_rate": 9.143085560687179e-07,
"loss": 0.24766230583190918,
"step": 1947,
"token_acc": 0.9277746793084216
},
{
"epoch": 0.4109704641350211,
"grad_norm": 0.7421875,
"learning_rate": 9.142138761337082e-07,
"loss": 0.26399165391921997,
"step": 1948,
"token_acc": 0.9282596835788325
},
{
"epoch": 0.4111814345991561,
"grad_norm": 0.69921875,
"learning_rate": 9.141191488298361e-07,
"loss": 0.2344694286584854,
"step": 1949,
"token_acc": 0.9286647504572773
},
{
"epoch": 0.41139240506329117,
"grad_norm": 0.69140625,
"learning_rate": 9.140243741679341e-07,
"loss": 0.20495407283306122,
"step": 1950,
"token_acc": 0.937727724412057
},
{
"epoch": 0.41160337552742615,
"grad_norm": 0.7265625,
"learning_rate": 9.139295521588406e-07,
"loss": 0.2814640998840332,
"step": 1951,
"token_acc": 0.9239944521497919
},
{
"epoch": 0.4118143459915612,
"grad_norm": 0.6875,
"learning_rate": 9.138346828133995e-07,
"loss": 0.257242351770401,
"step": 1952,
"token_acc": 0.9294289897510981
},
{
"epoch": 0.4120253164556962,
"grad_norm": 0.7734375,
"learning_rate": 9.137397661424596e-07,
"loss": 0.30631783604621887,
"step": 1953,
"token_acc": 0.9265954533004657
},
{
"epoch": 0.4122362869198312,
"grad_norm": 0.66796875,
"learning_rate": 9.136448021568757e-07,
"loss": 0.2396744191646576,
"step": 1954,
"token_acc": 0.9292063492063493
},
{
"epoch": 0.41244725738396626,
"grad_norm": 0.54296875,
"learning_rate": 9.135497908675076e-07,
"loss": 0.2213076651096344,
"step": 1955,
"token_acc": 0.9362022269034005
},
{
"epoch": 0.41265822784810124,
"grad_norm": 0.73828125,
"learning_rate": 9.134547322852206e-07,
"loss": 0.2861933708190918,
"step": 1956,
"token_acc": 0.9219481735872619
},
{
"epoch": 0.4128691983122363,
"grad_norm": 0.71875,
"learning_rate": 9.133596264208856e-07,
"loss": 0.2793295383453369,
"step": 1957,
"token_acc": 0.9232751216719153
},
{
"epoch": 0.4130801687763713,
"grad_norm": 0.78515625,
"learning_rate": 9.132644732853785e-07,
"loss": 0.23415514826774597,
"step": 1958,
"token_acc": 0.936848752762867
},
{
"epoch": 0.4132911392405063,
"grad_norm": 0.7109375,
"learning_rate": 9.131692728895811e-07,
"loss": 0.29440468549728394,
"step": 1959,
"token_acc": 0.9173300673606859
},
{
"epoch": 0.41350210970464135,
"grad_norm": 0.62890625,
"learning_rate": 9.130740252443803e-07,
"loss": 0.25713711977005005,
"step": 1960,
"token_acc": 0.9248366013071896
},
{
"epoch": 0.4137130801687764,
"grad_norm": 0.734375,
"learning_rate": 9.129787303606687e-07,
"loss": 0.28025707602500916,
"step": 1961,
"token_acc": 0.9208227669766131
},
{
"epoch": 0.4139240506329114,
"grad_norm": 0.83984375,
"learning_rate": 9.128833882493436e-07,
"loss": 0.25988930463790894,
"step": 1962,
"token_acc": 0.9253945480631277
},
{
"epoch": 0.4141350210970464,
"grad_norm": 1.015625,
"learning_rate": 9.127879989213086e-07,
"loss": 0.2681414484977722,
"step": 1963,
"token_acc": 0.924223602484472
},
{
"epoch": 0.41434599156118146,
"grad_norm": 0.609375,
"learning_rate": 9.126925623874719e-07,
"loss": 0.23518618941307068,
"step": 1964,
"token_acc": 0.9347695990424896
},
{
"epoch": 0.41455696202531644,
"grad_norm": 0.84375,
"learning_rate": 9.125970786587479e-07,
"loss": 0.2538262605667114,
"step": 1965,
"token_acc": 0.9263786242183059
},
{
"epoch": 0.4147679324894515,
"grad_norm": 0.80078125,
"learning_rate": 9.125015477460556e-07,
"loss": 0.2897471785545349,
"step": 1966,
"token_acc": 0.9163509180996794
},
{
"epoch": 0.41497890295358647,
"grad_norm": 0.6640625,
"learning_rate": 9.124059696603201e-07,
"loss": 0.2630343437194824,
"step": 1967,
"token_acc": 0.9244324970131422
},
{
"epoch": 0.4151898734177215,
"grad_norm": 0.7421875,
"learning_rate": 9.123103444124713e-07,
"loss": 0.25306540727615356,
"step": 1968,
"token_acc": 0.9272513933704899
},
{
"epoch": 0.41540084388185655,
"grad_norm": 0.62109375,
"learning_rate": 9.12214672013445e-07,
"loss": 0.22293509542942047,
"step": 1969,
"token_acc": 0.9381590196637218
},
{
"epoch": 0.41561181434599154,
"grad_norm": 0.5625,
"learning_rate": 9.121189524741817e-07,
"loss": 0.23128658533096313,
"step": 1970,
"token_acc": 0.9351984013702541
},
{
"epoch": 0.4158227848101266,
"grad_norm": 0.54296875,
"learning_rate": 9.120231858056282e-07,
"loss": 0.2618774175643921,
"step": 1971,
"token_acc": 0.9295605057194462
},
{
"epoch": 0.4160337552742616,
"grad_norm": 0.72265625,
"learning_rate": 9.119273720187361e-07,
"loss": 0.25696003437042236,
"step": 1972,
"token_acc": 0.9253450439146801
},
{
"epoch": 0.4162447257383966,
"grad_norm": 1.1328125,
"learning_rate": 9.118315111244624e-07,
"loss": 0.22687079012393951,
"step": 1973,
"token_acc": 0.9345570630486831
},
{
"epoch": 0.41645569620253164,
"grad_norm": 0.8671875,
"learning_rate": 9.117356031337698e-07,
"loss": 0.25750643014907837,
"step": 1974,
"token_acc": 0.9301022090339598
},
{
"epoch": 0.4166666666666667,
"grad_norm": 0.91796875,
"learning_rate": 9.11639648057626e-07,
"loss": 0.2913302481174469,
"step": 1975,
"token_acc": 0.9276166456494326
},
{
"epoch": 0.41687763713080167,
"grad_norm": 0.86328125,
"learning_rate": 9.115436459070044e-07,
"loss": 0.2763899564743042,
"step": 1976,
"token_acc": 0.9230769230769231
},
{
"epoch": 0.4170886075949367,
"grad_norm": 0.640625,
"learning_rate": 9.114475966928836e-07,
"loss": 0.2513253390789032,
"step": 1977,
"token_acc": 0.9275092936802974
},
{
"epoch": 0.41729957805907175,
"grad_norm": 0.72265625,
"learning_rate": 9.113515004262475e-07,
"loss": 0.24095700681209564,
"step": 1978,
"token_acc": 0.9372488408037094
},
{
"epoch": 0.41751054852320674,
"grad_norm": 0.765625,
"learning_rate": 9.112553571180858e-07,
"loss": 0.2281455397605896,
"step": 1979,
"token_acc": 0.9321780699133553
},
{
"epoch": 0.4177215189873418,
"grad_norm": 0.765625,
"learning_rate": 9.111591667793933e-07,
"loss": 0.23691600561141968,
"step": 1980,
"token_acc": 0.934855403348554
},
{
"epoch": 0.4179324894514768,
"grad_norm": 0.78515625,
"learning_rate": 9.1106292942117e-07,
"loss": 0.25448983907699585,
"step": 1981,
"token_acc": 0.9298789947254111
},
{
"epoch": 0.4181434599156118,
"grad_norm": 1.265625,
"learning_rate": 9.109666450544213e-07,
"loss": 0.2553502023220062,
"step": 1982,
"token_acc": 0.93343653250774
},
{
"epoch": 0.41835443037974684,
"grad_norm": 0.7734375,
"learning_rate": 9.108703136901587e-07,
"loss": 0.28221210837364197,
"step": 1983,
"token_acc": 0.9225239616613419
},
{
"epoch": 0.41856540084388183,
"grad_norm": 0.71875,
"learning_rate": 9.10773935339398e-07,
"loss": 0.27611416578292847,
"step": 1984,
"token_acc": 0.9298349056603774
},
{
"epoch": 0.41877637130801687,
"grad_norm": 0.78515625,
"learning_rate": 9.106775100131608e-07,
"loss": 0.2644606828689575,
"step": 1985,
"token_acc": 0.9235033259423503
},
{
"epoch": 0.4189873417721519,
"grad_norm": 0.64453125,
"learning_rate": 9.105810377224745e-07,
"loss": 0.2672709822654724,
"step": 1986,
"token_acc": 0.927710843373494
},
{
"epoch": 0.4191983122362869,
"grad_norm": 0.71484375,
"learning_rate": 9.104845184783716e-07,
"loss": 0.2812398076057434,
"step": 1987,
"token_acc": 0.9225935447015139
},
{
"epoch": 0.41940928270042194,
"grad_norm": 0.5546875,
"learning_rate": 9.103879522918896e-07,
"loss": 0.26493459939956665,
"step": 1988,
"token_acc": 0.9258202567760342
},
{
"epoch": 0.419620253164557,
"grad_norm": 0.67578125,
"learning_rate": 9.102913391740716e-07,
"loss": 0.23307015001773834,
"step": 1989,
"token_acc": 0.936281241417193
},
{
"epoch": 0.41983122362869196,
"grad_norm": 0.99609375,
"learning_rate": 9.101946791359665e-07,
"loss": 0.28101563453674316,
"step": 1990,
"token_acc": 0.923546511627907
},
{
"epoch": 0.420042194092827,
"grad_norm": 0.6953125,
"learning_rate": 9.100979721886279e-07,
"loss": 0.24740473926067352,
"step": 1991,
"token_acc": 0.9285905322278684
},
{
"epoch": 0.42025316455696204,
"grad_norm": 0.7578125,
"learning_rate": 9.100012183431152e-07,
"loss": 0.25005412101745605,
"step": 1992,
"token_acc": 0.9281230382925298
},
{
"epoch": 0.42046413502109703,
"grad_norm": 0.7578125,
"learning_rate": 9.099044176104929e-07,
"loss": 0.20773842930793762,
"step": 1993,
"token_acc": 0.9379124175164967
},
{
"epoch": 0.42067510548523207,
"grad_norm": 0.859375,
"learning_rate": 9.098075700018311e-07,
"loss": 0.28273704648017883,
"step": 1994,
"token_acc": 0.9198189460071128
},
{
"epoch": 0.4208860759493671,
"grad_norm": 1.0546875,
"learning_rate": 9.097106755282049e-07,
"loss": 0.28721871972084045,
"step": 1995,
"token_acc": 0.9265134347685335
},
{
"epoch": 0.4210970464135021,
"grad_norm": 1.8359375,
"learning_rate": 9.096137342006953e-07,
"loss": 0.27081963419914246,
"step": 1996,
"token_acc": 0.9250157529930687
},
{
"epoch": 0.42130801687763714,
"grad_norm": 0.640625,
"learning_rate": 9.095167460303883e-07,
"loss": 0.23872046172618866,
"step": 1997,
"token_acc": 0.9386761842959117
},
{
"epoch": 0.4215189873417722,
"grad_norm": 0.66796875,
"learning_rate": 9.094197110283752e-07,
"loss": 0.26904159784317017,
"step": 1998,
"token_acc": 0.9290484140233722
},
{
"epoch": 0.42172995780590716,
"grad_norm": 0.98828125,
"learning_rate": 9.093226292057529e-07,
"loss": 0.3138263523578644,
"step": 1999,
"token_acc": 0.9206625980819529
},
{
"epoch": 0.4219409282700422,
"grad_norm": 0.55078125,
"learning_rate": 9.092255005736236e-07,
"loss": 0.1999545842409134,
"step": 2000,
"token_acc": 0.9382038694773318
},
{
"epoch": 0.4219409282700422,
"eval_loss": 0.43380746245384216,
"eval_runtime": 245.6162,
"eval_samples_per_second": 137.226,
"eval_steps_per_second": 2.146,
"eval_token_acc": 0.8990778964372251,
"step": 2000
}
],
"logging_steps": 1,
"max_steps": 9480,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.0315273681255793e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}