diff --git "a/kd_mllm/s2_siglip2_qwen3_4b_10pct/checkpoint-1000/trainer_state.json" "b/kd_mllm/s2_siglip2_qwen3_4b_10pct/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/kd_mllm/s2_siglip2_qwen3_4b_10pct/checkpoint-1000/trainer_state.json" @@ -0,0 +1,8034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.508391476522723, + "eval_steps": 100.0, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0015085800490288515, + "grad_norm": 27.43332290649414, + "learning_rate": 2.0000000000000002e-07, + "loss": 2.315774917602539, + "step": 1, + "token_acc": 0.5353626166623855 + }, + { + "epoch": 0.003017160098057703, + "grad_norm": 37.682369232177734, + "learning_rate": 4.0000000000000003e-07, + "loss": 2.6036336421966553, + "step": 2, + "token_acc": 0.5029902023158163 + }, + { + "epoch": 0.004525740147086555, + "grad_norm": 33.49211120605469, + "learning_rate": 6.000000000000001e-07, + "loss": 2.544572114944458, + "step": 3, + "token_acc": 0.5050233677552962 + }, + { + "epoch": 0.006034320196115406, + "grad_norm": 31.669525146484375, + "learning_rate": 8.000000000000001e-07, + "loss": 2.3813283443450928, + "step": 4, + "token_acc": 0.5267805479383129 + }, + { + "epoch": 0.007542900245144258, + "grad_norm": 29.21599578857422, + "learning_rate": 1.0000000000000002e-06, + "loss": 2.332510232925415, + "step": 5, + "token_acc": 0.5399678575843738 + }, + { + "epoch": 0.00905148029417311, + "grad_norm": 28.25368881225586, + "learning_rate": 1.2000000000000002e-06, + "loss": 2.3171558380126953, + "step": 6, + "token_acc": 0.5383878952038481 + }, + { + "epoch": 0.010560060343201961, + "grad_norm": 29.415964126586914, + "learning_rate": 1.4000000000000001e-06, + "loss": 2.3267464637756348, + "step": 7, + "token_acc": 0.5472103004291845 + }, + { + "epoch": 0.012068640392230812, + "grad_norm": 22.19594955444336, + "learning_rate": 1.6000000000000001e-06, + "loss": 2.163546085357666, + "step": 8, + "token_acc": 0.553026196928636 + }, + { + "epoch": 0.013577220441259665, + "grad_norm": 21.95931053161621, + "learning_rate": 1.8000000000000001e-06, + "loss": 2.153867483139038, + "step": 9, + "token_acc": 0.5549912272601348 + }, + { + "epoch": 0.015085800490288516, + "grad_norm": 16.09490203857422, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.9219461679458618, + "step": 10, + "token_acc": 0.5878281866412733 + }, + { + "epoch": 0.016594380539317367, + "grad_norm": 12.050214767456055, + "learning_rate": 2.2e-06, + "loss": 1.9869475364685059, + "step": 11, + "token_acc": 0.5726549012085171 + }, + { + "epoch": 0.01810296058834622, + "grad_norm": 12.800933837890625, + "learning_rate": 2.4000000000000003e-06, + "loss": 1.9000219106674194, + "step": 12, + "token_acc": 0.5844527523527137 + }, + { + "epoch": 0.01961154063737507, + "grad_norm": 7.983212471008301, + "learning_rate": 2.6e-06, + "loss": 1.7120435237884521, + "step": 13, + "token_acc": 0.615970723468143 + }, + { + "epoch": 0.021120120686403922, + "grad_norm": 5.47987699508667, + "learning_rate": 2.8000000000000003e-06, + "loss": 1.5804016590118408, + "step": 14, + "token_acc": 0.6328618638466622 + }, + { + "epoch": 0.022628700735432775, + "grad_norm": 5.692638874053955, + "learning_rate": 3e-06, + "loss": 1.546237587928772, + "step": 15, + "token_acc": 0.6364698218738067 + }, + { + "epoch": 0.024137280784461625, + "grad_norm": 5.260006427764893, + "learning_rate": 3.2000000000000003e-06, + "loss": 1.5829031467437744, + "step": 16, + "token_acc": 0.6295521038985752 + }, + { + "epoch": 0.025645860833490478, + "grad_norm": 4.086191654205322, + "learning_rate": 3.4000000000000005e-06, + "loss": 1.589308500289917, + "step": 17, + "token_acc": 0.635892265591243 + }, + { + "epoch": 0.02715444088251933, + "grad_norm": 3.6220755577087402, + "learning_rate": 3.6000000000000003e-06, + "loss": 1.4261023998260498, + "step": 18, + "token_acc": 0.6583951015146632 + }, + { + "epoch": 0.02866302093154818, + "grad_norm": 3.7352187633514404, + "learning_rate": 3.8000000000000005e-06, + "loss": 1.4845764636993408, + "step": 19, + "token_acc": 0.6390838434061495 + }, + { + "epoch": 0.030171600980577033, + "grad_norm": 3.888475179672241, + "learning_rate": 4.000000000000001e-06, + "loss": 1.4704111814498901, + "step": 20, + "token_acc": 0.6519462661307384 + }, + { + "epoch": 0.03168018102960588, + "grad_norm": 4.424134731292725, + "learning_rate": 4.2000000000000004e-06, + "loss": 1.347973108291626, + "step": 21, + "token_acc": 0.6741436704725757 + }, + { + "epoch": 0.033188761078634735, + "grad_norm": 4.935394763946533, + "learning_rate": 4.4e-06, + "loss": 1.4122121334075928, + "step": 22, + "token_acc": 0.662193109901746 + }, + { + "epoch": 0.03469734112766359, + "grad_norm": 6.558776378631592, + "learning_rate": 4.600000000000001e-06, + "loss": 1.246969223022461, + "step": 23, + "token_acc": 0.6964433090980546 + }, + { + "epoch": 0.03620592117669244, + "grad_norm": 3.7029266357421875, + "learning_rate": 4.800000000000001e-06, + "loss": 1.3686296939849854, + "step": 24, + "token_acc": 0.6681733759985679 + }, + { + "epoch": 0.037714501225721286, + "grad_norm": 4.599174976348877, + "learning_rate": 5e-06, + "loss": 1.3214421272277832, + "step": 25, + "token_acc": 0.6742244370809091 + }, + { + "epoch": 0.03922308127475014, + "grad_norm": 6.8990960121154785, + "learning_rate": 5.2e-06, + "loss": 1.3585565090179443, + "step": 26, + "token_acc": 0.6672947216078207 + }, + { + "epoch": 0.04073166132377899, + "grad_norm": 2.533958911895752, + "learning_rate": 5.400000000000001e-06, + "loss": 1.311933994293213, + "step": 27, + "token_acc": 0.6766603949427632 + }, + { + "epoch": 0.042240241372807845, + "grad_norm": 3.1441352367401123, + "learning_rate": 5.600000000000001e-06, + "loss": 1.3389827013015747, + "step": 28, + "token_acc": 0.6688676527084134 + }, + { + "epoch": 0.0437488214218367, + "grad_norm": 7.692182540893555, + "learning_rate": 5.8e-06, + "loss": 1.22218656539917, + "step": 29, + "token_acc": 0.6902666293658137 + }, + { + "epoch": 0.04525740147086555, + "grad_norm": 3.20206880569458, + "learning_rate": 6e-06, + "loss": 1.352611780166626, + "step": 30, + "token_acc": 0.6697725463192098 + }, + { + "epoch": 0.046765981519894397, + "grad_norm": 3.803832769393921, + "learning_rate": 6.200000000000001e-06, + "loss": 1.2987114191055298, + "step": 31, + "token_acc": 0.6782692481406762 + }, + { + "epoch": 0.04827456156892325, + "grad_norm": 5.866830348968506, + "learning_rate": 6.4000000000000006e-06, + "loss": 1.2691588401794434, + "step": 32, + "token_acc": 0.6838129859638493 + }, + { + "epoch": 0.0497831416179521, + "grad_norm": 9.66088581085205, + "learning_rate": 6.600000000000001e-06, + "loss": 1.227351188659668, + "step": 33, + "token_acc": 0.6908303652489699 + }, + { + "epoch": 0.051291721666980955, + "grad_norm": 9.164165496826172, + "learning_rate": 6.800000000000001e-06, + "loss": 1.2421988248825073, + "step": 34, + "token_acc": 0.6896976330551375 + }, + { + "epoch": 0.05280030171600981, + "grad_norm": 4.829171180725098, + "learning_rate": 7e-06, + "loss": 1.1738221645355225, + "step": 35, + "token_acc": 0.6989229809283525 + }, + { + "epoch": 0.05430888176503866, + "grad_norm": 3.074422836303711, + "learning_rate": 7.2000000000000005e-06, + "loss": 1.2580199241638184, + "step": 36, + "token_acc": 0.6833896158716758 + }, + { + "epoch": 0.05581746181406751, + "grad_norm": 3.908547878265381, + "learning_rate": 7.4e-06, + "loss": 1.2889597415924072, + "step": 37, + "token_acc": 0.6835733156043879 + }, + { + "epoch": 0.05732604186309636, + "grad_norm": 4.031389236450195, + "learning_rate": 7.600000000000001e-06, + "loss": 1.1728020906448364, + "step": 38, + "token_acc": 0.7027823340015795 + }, + { + "epoch": 0.05883462191212521, + "grad_norm": 12.120161056518555, + "learning_rate": 7.800000000000002e-06, + "loss": 1.2163987159729004, + "step": 39, + "token_acc": 0.6933997186643962 + }, + { + "epoch": 0.060343201961154065, + "grad_norm": 4.134565830230713, + "learning_rate": 8.000000000000001e-06, + "loss": 1.171552062034607, + "step": 40, + "token_acc": 0.694423109346022 + }, + { + "epoch": 0.06185178201018292, + "grad_norm": 4.407393932342529, + "learning_rate": 8.2e-06, + "loss": 1.2011823654174805, + "step": 41, + "token_acc": 0.6905345396252142 + }, + { + "epoch": 0.06336036205921176, + "grad_norm": 5.2070841789245605, + "learning_rate": 8.400000000000001e-06, + "loss": 1.170388102531433, + "step": 42, + "token_acc": 0.7057333018007365 + }, + { + "epoch": 0.06486894210824062, + "grad_norm": 4.616318225860596, + "learning_rate": 8.6e-06, + "loss": 1.2227301597595215, + "step": 43, + "token_acc": 0.6859392932051129 + }, + { + "epoch": 0.06637752215726947, + "grad_norm": 4.058250904083252, + "learning_rate": 8.8e-06, + "loss": 1.2488218545913696, + "step": 44, + "token_acc": 0.6840151858567544 + }, + { + "epoch": 0.06788610220629832, + "grad_norm": 4.093045234680176, + "learning_rate": 9e-06, + "loss": 1.2562403678894043, + "step": 45, + "token_acc": 0.6873919225300695 + }, + { + "epoch": 0.06939468225532718, + "grad_norm": 4.09552001953125, + "learning_rate": 9.200000000000002e-06, + "loss": 1.1795514822006226, + "step": 46, + "token_acc": 0.6978651882698241 + }, + { + "epoch": 0.07090326230435602, + "grad_norm": 2.903743028640747, + "learning_rate": 9.4e-06, + "loss": 1.2144815921783447, + "step": 47, + "token_acc": 0.6874907601005301 + }, + { + "epoch": 0.07241184235338488, + "grad_norm": 4.317008972167969, + "learning_rate": 9.600000000000001e-06, + "loss": 1.070378303527832, + "step": 48, + "token_acc": 0.7211217263495457 + }, + { + "epoch": 0.07392042240241373, + "grad_norm": 3.1660358905792236, + "learning_rate": 9.800000000000001e-06, + "loss": 1.1601417064666748, + "step": 49, + "token_acc": 0.6969117203977101 + }, + { + "epoch": 0.07542900245144257, + "grad_norm": 3.6307532787323, + "learning_rate": 1e-05, + "loss": 1.1627264022827148, + "step": 50, + "token_acc": 0.6991897438067233 + }, + { + "epoch": 0.07693758250047143, + "grad_norm": 2.8982045650482178, + "learning_rate": 9.999972660400536e-06, + "loss": 1.1132137775421143, + "step": 51, + "token_acc": 0.7060062080791856 + }, + { + "epoch": 0.07844616254950028, + "grad_norm": 5.44349479675293, + "learning_rate": 9.999890641901124e-06, + "loss": 1.1400864124298096, + "step": 52, + "token_acc": 0.7071653370416453 + }, + { + "epoch": 0.07995474259852914, + "grad_norm": 3.9622344970703125, + "learning_rate": 9.999753945398704e-06, + "loss": 1.197326421737671, + "step": 53, + "token_acc": 0.6922334672261742 + }, + { + "epoch": 0.08146332264755798, + "grad_norm": 7.886420249938965, + "learning_rate": 9.99956257238817e-06, + "loss": 1.0651140213012695, + "step": 54, + "token_acc": 0.7219923450915485 + }, + { + "epoch": 0.08297190269658684, + "grad_norm": 4.582088470458984, + "learning_rate": 9.999316524962347e-06, + "loss": 1.1732186079025269, + "step": 55, + "token_acc": 0.6963434022257552 + }, + { + "epoch": 0.08448048274561569, + "grad_norm": 5.447359085083008, + "learning_rate": 9.999015805811965e-06, + "loss": 1.1138594150543213, + "step": 56, + "token_acc": 0.7083070876061377 + }, + { + "epoch": 0.08598906279464454, + "grad_norm": 3.797089099884033, + "learning_rate": 9.998660418225645e-06, + "loss": 1.1444756984710693, + "step": 57, + "token_acc": 0.7018640391190852 + }, + { + "epoch": 0.0874976428436734, + "grad_norm": 4.891401290893555, + "learning_rate": 9.998250366089848e-06, + "loss": 1.1523783206939697, + "step": 58, + "token_acc": 0.6940022467600077 + }, + { + "epoch": 0.08900622289270224, + "grad_norm": 29.149520874023438, + "learning_rate": 9.997785653888835e-06, + "loss": 1.1699167490005493, + "step": 59, + "token_acc": 0.6981227345680125 + }, + { + "epoch": 0.0905148029417311, + "grad_norm": 3.721289873123169, + "learning_rate": 9.99726628670463e-06, + "loss": 1.159071922302246, + "step": 60, + "token_acc": 0.7017582417582418 + }, + { + "epoch": 0.09202338299075995, + "grad_norm": 3.475222110748291, + "learning_rate": 9.996692270216946e-06, + "loss": 1.0862263441085815, + "step": 61, + "token_acc": 0.7150378621022382 + }, + { + "epoch": 0.09353196303978879, + "grad_norm": 3.345388174057007, + "learning_rate": 9.996063610703138e-06, + "loss": 1.148200511932373, + "step": 62, + "token_acc": 0.7047909604519774 + }, + { + "epoch": 0.09504054308881765, + "grad_norm": 3.6677258014678955, + "learning_rate": 9.995380315038119e-06, + "loss": 1.159632682800293, + "step": 63, + "token_acc": 0.6990338821608382 + }, + { + "epoch": 0.0965491231378465, + "grad_norm": 3.2358202934265137, + "learning_rate": 9.994642390694308e-06, + "loss": 1.1028666496276855, + "step": 64, + "token_acc": 0.7125423549343498 + }, + { + "epoch": 0.09805770318687536, + "grad_norm": 3.1392016410827637, + "learning_rate": 9.993849845741525e-06, + "loss": 1.0811834335327148, + "step": 65, + "token_acc": 0.7150693030283124 + }, + { + "epoch": 0.0995662832359042, + "grad_norm": 8.662134170532227, + "learning_rate": 9.993002688846913e-06, + "loss": 1.152016043663025, + "step": 66, + "token_acc": 0.7045721884109856 + }, + { + "epoch": 0.10107486328493305, + "grad_norm": 3.4476332664489746, + "learning_rate": 9.992100929274848e-06, + "loss": 1.117648959159851, + "step": 67, + "token_acc": 0.7095779289365837 + }, + { + "epoch": 0.10258344333396191, + "grad_norm": 5.268514633178711, + "learning_rate": 9.991144576886824e-06, + "loss": 1.0997554063796997, + "step": 68, + "token_acc": 0.7143801548825789 + }, + { + "epoch": 0.10409202338299076, + "grad_norm": 5.2205071449279785, + "learning_rate": 9.990133642141359e-06, + "loss": 1.1245999336242676, + "step": 69, + "token_acc": 0.6998076611960957 + }, + { + "epoch": 0.10560060343201962, + "grad_norm": 3.2978031635284424, + "learning_rate": 9.989068136093873e-06, + "loss": 1.1045854091644287, + "step": 70, + "token_acc": 0.7119964991977328 + }, + { + "epoch": 0.10710918348104846, + "grad_norm": 2.967536449432373, + "learning_rate": 9.987948070396572e-06, + "loss": 1.0529496669769287, + "step": 71, + "token_acc": 0.7203069500636313 + }, + { + "epoch": 0.10861776353007732, + "grad_norm": 3.241534471511841, + "learning_rate": 9.986773457298311e-06, + "loss": 1.1522048711776733, + "step": 72, + "token_acc": 0.6950877497325585 + }, + { + "epoch": 0.11012634357910617, + "grad_norm": 3.841336488723755, + "learning_rate": 9.985544309644474e-06, + "loss": 1.0820667743682861, + "step": 73, + "token_acc": 0.7155998809169396 + }, + { + "epoch": 0.11163492362813501, + "grad_norm": 9.532113075256348, + "learning_rate": 9.984260640876821e-06, + "loss": 1.0510246753692627, + "step": 74, + "token_acc": 0.7271442180633663 + }, + { + "epoch": 0.11314350367716387, + "grad_norm": 5.615748882293701, + "learning_rate": 9.98292246503335e-06, + "loss": 1.0529574155807495, + "step": 75, + "token_acc": 0.7282389937106918 + }, + { + "epoch": 0.11465208372619272, + "grad_norm": 2.5460317134857178, + "learning_rate": 9.981529796748135e-06, + "loss": 1.1364719867706299, + "step": 76, + "token_acc": 0.6988545551411828 + }, + { + "epoch": 0.11616066377522158, + "grad_norm": 3.310466766357422, + "learning_rate": 9.980082651251175e-06, + "loss": 0.9804608821868896, + "step": 77, + "token_acc": 0.739940063010681 + }, + { + "epoch": 0.11766924382425042, + "grad_norm": 3.7793593406677246, + "learning_rate": 9.97858104436822e-06, + "loss": 1.0354225635528564, + "step": 78, + "token_acc": 0.7247269597030183 + }, + { + "epoch": 0.11917782387327927, + "grad_norm": 3.8370258808135986, + "learning_rate": 9.977024992520604e-06, + "loss": 1.0909526348114014, + "step": 79, + "token_acc": 0.7111403668058834 + }, + { + "epoch": 0.12068640392230813, + "grad_norm": 3.318906784057617, + "learning_rate": 9.975414512725058e-06, + "loss": 1.032583475112915, + "step": 80, + "token_acc": 0.7263522119385243 + }, + { + "epoch": 0.12219498397133698, + "grad_norm": 2.7286150455474854, + "learning_rate": 9.973749622593534e-06, + "loss": 1.0561416149139404, + "step": 81, + "token_acc": 0.723143530867061 + }, + { + "epoch": 0.12370356402036584, + "grad_norm": 6.966344356536865, + "learning_rate": 9.972030340333e-06, + "loss": 1.0811532735824585, + "step": 82, + "token_acc": 0.7163992361217815 + }, + { + "epoch": 0.12521214406939468, + "grad_norm": 3.315882921218872, + "learning_rate": 9.970256684745258e-06, + "loss": 1.100940227508545, + "step": 83, + "token_acc": 0.7138771751248122 + }, + { + "epoch": 0.12672072411842353, + "grad_norm": 4.109171390533447, + "learning_rate": 9.968428675226714e-06, + "loss": 1.0458613634109497, + "step": 84, + "token_acc": 0.7217123989525219 + }, + { + "epoch": 0.12822930416745237, + "grad_norm": 3.328895330429077, + "learning_rate": 9.966546331768192e-06, + "loss": 1.046668529510498, + "step": 85, + "token_acc": 0.7220332414743369 + }, + { + "epoch": 0.12973788421648125, + "grad_norm": 2.8330154418945312, + "learning_rate": 9.964609674954696e-06, + "loss": 1.094944953918457, + "step": 86, + "token_acc": 0.7121080517675671 + }, + { + "epoch": 0.1312464642655101, + "grad_norm": 3.141967296600342, + "learning_rate": 9.962618725965196e-06, + "loss": 1.1229407787322998, + "step": 87, + "token_acc": 0.7109942638623327 + }, + { + "epoch": 0.13275504431453894, + "grad_norm": 4.489798545837402, + "learning_rate": 9.960573506572391e-06, + "loss": 1.05949068069458, + "step": 88, + "token_acc": 0.7156693185748876 + }, + { + "epoch": 0.13426362436356779, + "grad_norm": 5.760927677154541, + "learning_rate": 9.95847403914247e-06, + "loss": 1.1056040525436401, + "step": 89, + "token_acc": 0.7088521285426301 + }, + { + "epoch": 0.13577220441259663, + "grad_norm": 5.503835678100586, + "learning_rate": 9.956320346634877e-06, + "loss": 1.1028335094451904, + "step": 90, + "token_acc": 0.7130427841634738 + }, + { + "epoch": 0.1372807844616255, + "grad_norm": 3.3800086975097656, + "learning_rate": 9.954112452602045e-06, + "loss": 1.0070219039916992, + "step": 91, + "token_acc": 0.72648175623004 + }, + { + "epoch": 0.13878936451065435, + "grad_norm": 3.4539055824279785, + "learning_rate": 9.951850381189152e-06, + "loss": 1.0808827877044678, + "step": 92, + "token_acc": 0.7149563318777292 + }, + { + "epoch": 0.1402979445596832, + "grad_norm": 3.0789613723754883, + "learning_rate": 9.949534157133844e-06, + "loss": 1.0336943864822388, + "step": 93, + "token_acc": 0.7267212059931979 + }, + { + "epoch": 0.14180652460871204, + "grad_norm": 3.465055465698242, + "learning_rate": 9.94716380576598e-06, + "loss": 1.0511704683303833, + "step": 94, + "token_acc": 0.7253258163713574 + }, + { + "epoch": 0.1433151046577409, + "grad_norm": 3.7335546016693115, + "learning_rate": 9.944739353007344e-06, + "loss": 1.0147852897644043, + "step": 95, + "token_acc": 0.7307765557301338 + }, + { + "epoch": 0.14482368470676976, + "grad_norm": 3.6083109378814697, + "learning_rate": 9.942260825371359e-06, + "loss": 1.1048388481140137, + "step": 96, + "token_acc": 0.7040359652497924 + }, + { + "epoch": 0.1463322647557986, + "grad_norm": 2.22707200050354, + "learning_rate": 9.939728249962808e-06, + "loss": 1.0626146793365479, + "step": 97, + "token_acc": 0.7120508853360838 + }, + { + "epoch": 0.14784084480482745, + "grad_norm": 3.1977670192718506, + "learning_rate": 9.937141654477529e-06, + "loss": 1.091273546218872, + "step": 98, + "token_acc": 0.7101187926876 + }, + { + "epoch": 0.1493494248538563, + "grad_norm": 2.1985111236572266, + "learning_rate": 9.934501067202117e-06, + "loss": 1.1240156888961792, + "step": 99, + "token_acc": 0.7054784009821223 + }, + { + "epoch": 0.15085800490288515, + "grad_norm": 2.9239284992218018, + "learning_rate": 9.931806517013612e-06, + "loss": 1.0962307453155518, + "step": 100, + "token_acc": 0.7090999615271459 + }, + { + "epoch": 0.15236658495191402, + "grad_norm": 3.0761330127716064, + "learning_rate": 9.929058033379181e-06, + "loss": 1.0907471179962158, + "step": 101, + "token_acc": 0.7129876157209507 + }, + { + "epoch": 0.15387516500094287, + "grad_norm": 4.151998996734619, + "learning_rate": 9.926255646355804e-06, + "loss": 1.0916119813919067, + "step": 102, + "token_acc": 0.7107656926962486 + }, + { + "epoch": 0.1553837450499717, + "grad_norm": 3.0589332580566406, + "learning_rate": 9.923399386589933e-06, + "loss": 1.0659116506576538, + "step": 103, + "token_acc": 0.7162235368046979 + }, + { + "epoch": 0.15689232509900056, + "grad_norm": 3.133718252182007, + "learning_rate": 9.920489285317169e-06, + "loss": 1.050791621208191, + "step": 104, + "token_acc": 0.7249207617182467 + }, + { + "epoch": 0.15840090514802943, + "grad_norm": 4.4807820320129395, + "learning_rate": 9.917525374361913e-06, + "loss": 1.0721123218536377, + "step": 105, + "token_acc": 0.7111249057775019 + }, + { + "epoch": 0.15990948519705828, + "grad_norm": 3.6618025302886963, + "learning_rate": 9.91450768613702e-06, + "loss": 0.9978234171867371, + "step": 106, + "token_acc": 0.7294720895000583 + }, + { + "epoch": 0.16141806524608712, + "grad_norm": 2.550509452819824, + "learning_rate": 9.911436253643445e-06, + "loss": 1.06361985206604, + "step": 107, + "token_acc": 0.7161888031453248 + }, + { + "epoch": 0.16292664529511597, + "grad_norm": 1.998715877532959, + "learning_rate": 9.908311110469881e-06, + "loss": 1.0948556661605835, + "step": 108, + "token_acc": 0.7080483336974607 + }, + { + "epoch": 0.16443522534414481, + "grad_norm": 6.698785781860352, + "learning_rate": 9.905132290792395e-06, + "loss": 1.0508636236190796, + "step": 109, + "token_acc": 0.7170636152982489 + }, + { + "epoch": 0.1659438053931737, + "grad_norm": 2.3685758113861084, + "learning_rate": 9.901899829374048e-06, + "loss": 1.0660905838012695, + "step": 110, + "token_acc": 0.7214620134349625 + }, + { + "epoch": 0.16745238544220253, + "grad_norm": 2.5085978507995605, + "learning_rate": 9.89861376156452e-06, + "loss": 0.9668797254562378, + "step": 111, + "token_acc": 0.7385307598503007 + }, + { + "epoch": 0.16896096549123138, + "grad_norm": 2.6316843032836914, + "learning_rate": 9.895274123299724e-06, + "loss": 1.0601732730865479, + "step": 112, + "token_acc": 0.7167253156696336 + }, + { + "epoch": 0.17046954554026023, + "grad_norm": 2.2268619537353516, + "learning_rate": 9.891880951101407e-06, + "loss": 1.0351203680038452, + "step": 113, + "token_acc": 0.7175595170162744 + }, + { + "epoch": 0.17197812558928907, + "grad_norm": 3.421921730041504, + "learning_rate": 9.888434282076759e-06, + "loss": 1.0248239040374756, + "step": 114, + "token_acc": 0.7196467991169978 + }, + { + "epoch": 0.17348670563831795, + "grad_norm": 2.6647675037384033, + "learning_rate": 9.884934153917998e-06, + "loss": 1.0660920143127441, + "step": 115, + "token_acc": 0.7176527600334817 + }, + { + "epoch": 0.1749952856873468, + "grad_norm": 2.6446070671081543, + "learning_rate": 9.881380604901964e-06, + "loss": 1.0326142311096191, + "step": 116, + "token_acc": 0.7242975984931722 + }, + { + "epoch": 0.17650386573637564, + "grad_norm": 2.4096245765686035, + "learning_rate": 9.877773673889702e-06, + "loss": 1.0499000549316406, + "step": 117, + "token_acc": 0.7218222843021381 + }, + { + "epoch": 0.17801244578540448, + "grad_norm": 2.977527618408203, + "learning_rate": 9.874113400326031e-06, + "loss": 0.9945988655090332, + "step": 118, + "token_acc": 0.7338616627193781 + }, + { + "epoch": 0.17952102583443333, + "grad_norm": 2.2931976318359375, + "learning_rate": 9.870399824239116e-06, + "loss": 1.1020820140838623, + "step": 119, + "token_acc": 0.7096959383265331 + }, + { + "epoch": 0.1810296058834622, + "grad_norm": 2.3942818641662598, + "learning_rate": 9.86663298624003e-06, + "loss": 1.0635650157928467, + "step": 120, + "token_acc": 0.7195226810393641 + }, + { + "epoch": 0.18253818593249105, + "grad_norm": 2.7707111835479736, + "learning_rate": 9.86281292752231e-06, + "loss": 1.0952274799346924, + "step": 121, + "token_acc": 0.7121221603354403 + }, + { + "epoch": 0.1840467659815199, + "grad_norm": 3.980163097381592, + "learning_rate": 9.858939689861506e-06, + "loss": 1.056509017944336, + "step": 122, + "token_acc": 0.7131120133747313 + }, + { + "epoch": 0.18555534603054874, + "grad_norm": 2.4051356315612793, + "learning_rate": 9.855013315614725e-06, + "loss": 1.0607542991638184, + "step": 123, + "token_acc": 0.7142528591440339 + }, + { + "epoch": 0.18706392607957759, + "grad_norm": 2.7722997665405273, + "learning_rate": 9.851033847720167e-06, + "loss": 1.0476069450378418, + "step": 124, + "token_acc": 0.7189458543100791 + }, + { + "epoch": 0.18857250612860646, + "grad_norm": 2.0917866230010986, + "learning_rate": 9.847001329696653e-06, + "loss": 1.0099124908447266, + "step": 125, + "token_acc": 0.7354876666957204 + }, + { + "epoch": 0.1900810861776353, + "grad_norm": 3.4954452514648438, + "learning_rate": 9.842915805643156e-06, + "loss": 1.032479166984558, + "step": 126, + "token_acc": 0.7251019066403682 + }, + { + "epoch": 0.19158966622666415, + "grad_norm": 2.1751708984375, + "learning_rate": 9.838777320238312e-06, + "loss": 1.0196071863174438, + "step": 127, + "token_acc": 0.7277527067763052 + }, + { + "epoch": 0.193098246275693, + "grad_norm": 2.467465877532959, + "learning_rate": 9.834585918739936e-06, + "loss": 1.0539357662200928, + "step": 128, + "token_acc": 0.7154692045958874 + }, + { + "epoch": 0.19460682632472184, + "grad_norm": 2.2475500106811523, + "learning_rate": 9.830341646984521e-06, + "loss": 1.0601041316986084, + "step": 129, + "token_acc": 0.7144262435310722 + }, + { + "epoch": 0.19611540637375072, + "grad_norm": 2.642874240875244, + "learning_rate": 9.826044551386743e-06, + "loss": 1.0321165323257446, + "step": 130, + "token_acc": 0.7220993864352441 + }, + { + "epoch": 0.19762398642277956, + "grad_norm": 2.1353695392608643, + "learning_rate": 9.821694678938954e-06, + "loss": 1.0367600917816162, + "step": 131, + "token_acc": 0.7203869439326138 + }, + { + "epoch": 0.1991325664718084, + "grad_norm": 2.2269461154937744, + "learning_rate": 9.817292077210658e-06, + "loss": 1.120430588722229, + "step": 132, + "token_acc": 0.7052752542225884 + }, + { + "epoch": 0.20064114652083725, + "grad_norm": 2.3690760135650635, + "learning_rate": 9.812836794348005e-06, + "loss": 1.040844202041626, + "step": 133, + "token_acc": 0.7278693392724573 + }, + { + "epoch": 0.2021497265698661, + "grad_norm": 3.5620229244232178, + "learning_rate": 9.808328879073251e-06, + "loss": 1.0494694709777832, + "step": 134, + "token_acc": 0.7215598324827791 + }, + { + "epoch": 0.20365830661889497, + "grad_norm": 2.4997615814208984, + "learning_rate": 9.803768380684242e-06, + "loss": 1.0201911926269531, + "step": 135, + "token_acc": 0.7280017876651107 + }, + { + "epoch": 0.20516688666792382, + "grad_norm": 5.488840579986572, + "learning_rate": 9.79915534905385e-06, + "loss": 1.0547184944152832, + "step": 136, + "token_acc": 0.7134793545616296 + }, + { + "epoch": 0.20667546671695267, + "grad_norm": 2.2710964679718018, + "learning_rate": 9.794489834629457e-06, + "loss": 0.9756638407707214, + "step": 137, + "token_acc": 0.7398678771176137 + }, + { + "epoch": 0.2081840467659815, + "grad_norm": 2.5210776329040527, + "learning_rate": 9.789771888432375e-06, + "loss": 1.0420737266540527, + "step": 138, + "token_acc": 0.7250211909302818 + }, + { + "epoch": 0.20969262681501036, + "grad_norm": 3.072575807571411, + "learning_rate": 9.785001562057311e-06, + "loss": 1.0800588130950928, + "step": 139, + "token_acc": 0.7118485627483057 + }, + { + "epoch": 0.21120120686403923, + "grad_norm": 2.4379169940948486, + "learning_rate": 9.780178907671788e-06, + "loss": 1.0567656755447388, + "step": 140, + "token_acc": 0.7193392406820442 + }, + { + "epoch": 0.21270978691306808, + "grad_norm": 14.357322692871094, + "learning_rate": 9.775303978015585e-06, + "loss": 0.9833472967147827, + "step": 141, + "token_acc": 0.7347797470734804 + }, + { + "epoch": 0.21421836696209692, + "grad_norm": 2.8680336475372314, + "learning_rate": 9.77037682640015e-06, + "loss": 1.039903998374939, + "step": 142, + "token_acc": 0.7224656186303681 + }, + { + "epoch": 0.21572694701112577, + "grad_norm": 2.596453905105591, + "learning_rate": 9.765397506708023e-06, + "loss": 1.0465795993804932, + "step": 143, + "token_acc": 0.7168776480985678 + }, + { + "epoch": 0.21723552706015464, + "grad_norm": 3.8211591243743896, + "learning_rate": 9.760366073392246e-06, + "loss": 1.0656347274780273, + "step": 144, + "token_acc": 0.7168101865077745 + }, + { + "epoch": 0.2187441071091835, + "grad_norm": 2.5738673210144043, + "learning_rate": 9.755282581475769e-06, + "loss": 1.036583662033081, + "step": 145, + "token_acc": 0.7225522663498243 + }, + { + "epoch": 0.22025268715821233, + "grad_norm": 2.2527616024017334, + "learning_rate": 9.750147086550843e-06, + "loss": 1.0628644227981567, + "step": 146, + "token_acc": 0.7188534438368929 + }, + { + "epoch": 0.22176126720724118, + "grad_norm": 2.0387396812438965, + "learning_rate": 9.744959644778422e-06, + "loss": 1.0493512153625488, + "step": 147, + "token_acc": 0.7175111111111111 + }, + { + "epoch": 0.22326984725627003, + "grad_norm": 2.8886630535125732, + "learning_rate": 9.739720312887536e-06, + "loss": 0.9899839162826538, + "step": 148, + "token_acc": 0.7314798624468147 + }, + { + "epoch": 0.2247784273052989, + "grad_norm": 3.132871389389038, + "learning_rate": 9.734429148174676e-06, + "loss": 1.0543450117111206, + "step": 149, + "token_acc": 0.7205085165515533 + }, + { + "epoch": 0.22628700735432775, + "grad_norm": 2.611292839050293, + "learning_rate": 9.729086208503174e-06, + "loss": 1.0235342979431152, + "step": 150, + "token_acc": 0.7257730336489286 + }, + { + "epoch": 0.2277955874033566, + "grad_norm": 1.8280929327011108, + "learning_rate": 9.723691552302563e-06, + "loss": 1.0597591400146484, + "step": 151, + "token_acc": 0.7148926373948349 + }, + { + "epoch": 0.22930416745238544, + "grad_norm": 2.2191998958587646, + "learning_rate": 9.718245238567939e-06, + "loss": 0.9249858856201172, + "step": 152, + "token_acc": 0.7485740676596236 + }, + { + "epoch": 0.23081274750141428, + "grad_norm": 2.7234697341918945, + "learning_rate": 9.712747326859316e-06, + "loss": 0.9665364027023315, + "step": 153, + "token_acc": 0.7428920422059359 + }, + { + "epoch": 0.23232132755044316, + "grad_norm": 2.2393007278442383, + "learning_rate": 9.707197877300974e-06, + "loss": 1.0497493743896484, + "step": 154, + "token_acc": 0.7190812720848057 + }, + { + "epoch": 0.233829907599472, + "grad_norm": 2.3182427883148193, + "learning_rate": 9.701596950580807e-06, + "loss": 0.9858441352844238, + "step": 155, + "token_acc": 0.7347013113161729 + }, + { + "epoch": 0.23533848764850085, + "grad_norm": 1.859687089920044, + "learning_rate": 9.69594460794965e-06, + "loss": 0.9696170091629028, + "step": 156, + "token_acc": 0.7384590831105614 + }, + { + "epoch": 0.2368470676975297, + "grad_norm": 1.8918596506118774, + "learning_rate": 9.690240911220618e-06, + "loss": 1.022160530090332, + "step": 157, + "token_acc": 0.7262180420615406 + }, + { + "epoch": 0.23835564774655854, + "grad_norm": 2.4236037731170654, + "learning_rate": 9.684485922768422e-06, + "loss": 1.037246823310852, + "step": 158, + "token_acc": 0.7234590065828845 + }, + { + "epoch": 0.23986422779558741, + "grad_norm": 2.179180383682251, + "learning_rate": 9.678679705528699e-06, + "loss": 1.0059877634048462, + "step": 159, + "token_acc": 0.7272259031208297 + }, + { + "epoch": 0.24137280784461626, + "grad_norm": 2.7571890354156494, + "learning_rate": 9.672822322997305e-06, + "loss": 0.9877049326896667, + "step": 160, + "token_acc": 0.734720451401134 + }, + { + "epoch": 0.2428813878936451, + "grad_norm": 2.0530447959899902, + "learning_rate": 9.666913839229639e-06, + "loss": 0.946295976638794, + "step": 161, + "token_acc": 0.7426134463150853 + }, + { + "epoch": 0.24438996794267395, + "grad_norm": 2.7437386512756348, + "learning_rate": 9.660954318839934e-06, + "loss": 1.0138883590698242, + "step": 162, + "token_acc": 0.7268457319645619 + }, + { + "epoch": 0.2458985479917028, + "grad_norm": 2.0369114875793457, + "learning_rate": 9.654943827000548e-06, + "loss": 1.0147656202316284, + "step": 163, + "token_acc": 0.7341937969562705 + }, + { + "epoch": 0.24740712804073167, + "grad_norm": 2.4712202548980713, + "learning_rate": 9.648882429441258e-06, + "loss": 1.0196806192398071, + "step": 164, + "token_acc": 0.7294436004557538 + }, + { + "epoch": 0.24891570808976052, + "grad_norm": 2.812676191329956, + "learning_rate": 9.642770192448537e-06, + "loss": 0.9951367378234863, + "step": 165, + "token_acc": 0.7306218057921635 + }, + { + "epoch": 0.25042428813878936, + "grad_norm": 1.87490975856781, + "learning_rate": 9.636607182864828e-06, + "loss": 0.9218826293945312, + "step": 166, + "token_acc": 0.7455243604448212 + }, + { + "epoch": 0.25193286818781824, + "grad_norm": 1.9590466022491455, + "learning_rate": 9.630393468087818e-06, + "loss": 1.0775024890899658, + "step": 167, + "token_acc": 0.7133664461380683 + }, + { + "epoch": 0.25344144823684706, + "grad_norm": 2.1801793575286865, + "learning_rate": 9.624129116069695e-06, + "loss": 1.0152616500854492, + "step": 168, + "token_acc": 0.7221026282853567 + }, + { + "epoch": 0.25495002828587593, + "grad_norm": 2.6200428009033203, + "learning_rate": 9.61781419531641e-06, + "loss": 1.024682879447937, + "step": 169, + "token_acc": 0.724750197685285 + }, + { + "epoch": 0.25645860833490475, + "grad_norm": 3.0211429595947266, + "learning_rate": 9.611448774886925e-06, + "loss": 1.047837495803833, + "step": 170, + "token_acc": 0.7178876404494382 + }, + { + "epoch": 0.2579671883839336, + "grad_norm": 3.1089694499969482, + "learning_rate": 9.605032924392457e-06, + "loss": 1.0591251850128174, + "step": 171, + "token_acc": 0.7207269483777455 + }, + { + "epoch": 0.2594757684329625, + "grad_norm": 2.4372165203094482, + "learning_rate": 9.598566713995718e-06, + "loss": 1.03012216091156, + "step": 172, + "token_acc": 0.7231423865300146 + }, + { + "epoch": 0.2609843484819913, + "grad_norm": 3.1218185424804688, + "learning_rate": 9.592050214410152e-06, + "loss": 1.030163049697876, + "step": 173, + "token_acc": 0.7259322384051234 + }, + { + "epoch": 0.2624929285310202, + "grad_norm": 2.774333953857422, + "learning_rate": 9.585483496899151e-06, + "loss": 0.955190896987915, + "step": 174, + "token_acc": 0.7390422199015366 + }, + { + "epoch": 0.264001508580049, + "grad_norm": 2.15107798576355, + "learning_rate": 9.578866633275289e-06, + "loss": 0.9795132279396057, + "step": 175, + "token_acc": 0.7338707809206106 + }, + { + "epoch": 0.2655100886290779, + "grad_norm": 2.5008866786956787, + "learning_rate": 9.572199695899522e-06, + "loss": 1.026280164718628, + "step": 176, + "token_acc": 0.7292986476164802 + }, + { + "epoch": 0.26701866867810675, + "grad_norm": 2.587505340576172, + "learning_rate": 9.565482757680415e-06, + "loss": 1.0221121311187744, + "step": 177, + "token_acc": 0.7257088951778481 + }, + { + "epoch": 0.26852724872713557, + "grad_norm": 2.463494062423706, + "learning_rate": 9.558715892073324e-06, + "loss": 1.0481772422790527, + "step": 178, + "token_acc": 0.7173504120386958 + }, + { + "epoch": 0.27003582877616444, + "grad_norm": 3.276925563812256, + "learning_rate": 9.551899173079607e-06, + "loss": 0.991529643535614, + "step": 179, + "token_acc": 0.7398327689034075 + }, + { + "epoch": 0.27154440882519326, + "grad_norm": 3.188319444656372, + "learning_rate": 9.545032675245814e-06, + "loss": 1.000281572341919, + "step": 180, + "token_acc": 0.7296483188301324 + }, + { + "epoch": 0.27305298887422214, + "grad_norm": 2.9731523990631104, + "learning_rate": 9.538116473662862e-06, + "loss": 0.9784246683120728, + "step": 181, + "token_acc": 0.7385413231811505 + }, + { + "epoch": 0.274561568923251, + "grad_norm": 3.390334367752075, + "learning_rate": 9.531150643965224e-06, + "loss": 1.037996768951416, + "step": 182, + "token_acc": 0.726039016115352 + }, + { + "epoch": 0.2760701489722798, + "grad_norm": 3.0717201232910156, + "learning_rate": 9.524135262330098e-06, + "loss": 1.0037124156951904, + "step": 183, + "token_acc": 0.7285005431750785 + }, + { + "epoch": 0.2775787290213087, + "grad_norm": 2.530275821685791, + "learning_rate": 9.517070405476575e-06, + "loss": 0.9921556711196899, + "step": 184, + "token_acc": 0.7353305785123967 + }, + { + "epoch": 0.2790873090703375, + "grad_norm": 2.229982852935791, + "learning_rate": 9.509956150664796e-06, + "loss": 1.000861406326294, + "step": 185, + "token_acc": 0.7271430215165113 + }, + { + "epoch": 0.2805958891193664, + "grad_norm": 2.3920340538024902, + "learning_rate": 9.502792575695112e-06, + "loss": 1.0035457611083984, + "step": 186, + "token_acc": 0.7278244891140933 + }, + { + "epoch": 0.28210446916839527, + "grad_norm": 2.3219757080078125, + "learning_rate": 9.495579758907231e-06, + "loss": 0.9932644367218018, + "step": 187, + "token_acc": 0.7344323552038685 + }, + { + "epoch": 0.2836130492174241, + "grad_norm": 2.836785078048706, + "learning_rate": 9.48831777917936e-06, + "loss": 1.0274665355682373, + "step": 188, + "token_acc": 0.7226516410219365 + }, + { + "epoch": 0.28512162926645296, + "grad_norm": 2.0176432132720947, + "learning_rate": 9.481006715927352e-06, + "loss": 1.0273675918579102, + "step": 189, + "token_acc": 0.7212298881919825 + }, + { + "epoch": 0.2866302093154818, + "grad_norm": 2.2734129428863525, + "learning_rate": 9.473646649103819e-06, + "loss": 1.0322508811950684, + "step": 190, + "token_acc": 0.7201403209367371 + }, + { + "epoch": 0.28813878936451065, + "grad_norm": 2.3202552795410156, + "learning_rate": 9.466237659197271e-06, + "loss": 0.9573006629943848, + "step": 191, + "token_acc": 0.737797541198012 + }, + { + "epoch": 0.2896473694135395, + "grad_norm": 2.0430493354797363, + "learning_rate": 9.458779827231237e-06, + "loss": 1.0123865604400635, + "step": 192, + "token_acc": 0.7280118082564079 + }, + { + "epoch": 0.29115594946256834, + "grad_norm": 2.5600745677948, + "learning_rate": 9.451273234763372e-06, + "loss": 0.9469287991523743, + "step": 193, + "token_acc": 0.7428571428571429 + }, + { + "epoch": 0.2926645295115972, + "grad_norm": 2.138296127319336, + "learning_rate": 9.443717963884568e-06, + "loss": 1.0311319828033447, + "step": 194, + "token_acc": 0.7269601458713206 + }, + { + "epoch": 0.29417310956062603, + "grad_norm": 2.4624361991882324, + "learning_rate": 9.43611409721806e-06, + "loss": 1.047189474105835, + "step": 195, + "token_acc": 0.7176506867186944 + }, + { + "epoch": 0.2956816896096549, + "grad_norm": 4.700667858123779, + "learning_rate": 9.428461717918512e-06, + "loss": 0.9889592528343201, + "step": 196, + "token_acc": 0.7365254605222353 + }, + { + "epoch": 0.2971902696586838, + "grad_norm": 2.8413708209991455, + "learning_rate": 9.420760909671119e-06, + "loss": 0.9957966804504395, + "step": 197, + "token_acc": 0.7314576026408087 + }, + { + "epoch": 0.2986988497077126, + "grad_norm": 2.9809980392456055, + "learning_rate": 9.413011756690686e-06, + "loss": 1.024433970451355, + "step": 198, + "token_acc": 0.721934846081339 + }, + { + "epoch": 0.3002074297567415, + "grad_norm": 3.8305580615997314, + "learning_rate": 9.405214343720708e-06, + "loss": 0.9800687432289124, + "step": 199, + "token_acc": 0.7346131128174838 + }, + { + "epoch": 0.3017160098057703, + "grad_norm": 2.690847396850586, + "learning_rate": 9.397368756032445e-06, + "loss": 0.9597967863082886, + "step": 200, + "token_acc": 0.7411594695681741 + }, + { + "epoch": 0.30322458985479916, + "grad_norm": 3.5039279460906982, + "learning_rate": 9.389475079423988e-06, + "loss": 0.9554229974746704, + "step": 201, + "token_acc": 0.7444199243379571 + }, + { + "epoch": 0.30473316990382804, + "grad_norm": 3.411407947540283, + "learning_rate": 9.381533400219319e-06, + "loss": 0.9736269116401672, + "step": 202, + "token_acc": 0.7402278866155423 + }, + { + "epoch": 0.30624174995285686, + "grad_norm": 3.8000547885894775, + "learning_rate": 9.373543805267367e-06, + "loss": 0.9508758187294006, + "step": 203, + "token_acc": 0.7402914791523858 + }, + { + "epoch": 0.30775033000188573, + "grad_norm": 2.6628122329711914, + "learning_rate": 9.365506381941066e-06, + "loss": 1.0009136199951172, + "step": 204, + "token_acc": 0.7314362116102253 + }, + { + "epoch": 0.3092589100509146, + "grad_norm": 2.6256113052368164, + "learning_rate": 9.357421218136387e-06, + "loss": 0.952735960483551, + "step": 205, + "token_acc": 0.7410526875149454 + }, + { + "epoch": 0.3107674900999434, + "grad_norm": 5.0103583335876465, + "learning_rate": 9.349288402271387e-06, + "loss": 0.995936930179596, + "step": 206, + "token_acc": 0.7290078638804529 + }, + { + "epoch": 0.3122760701489723, + "grad_norm": 3.681445598602295, + "learning_rate": 9.341108023285239e-06, + "loss": 1.0061194896697998, + "step": 207, + "token_acc": 0.7284562006998399 + }, + { + "epoch": 0.3137846501980011, + "grad_norm": 3.204003095626831, + "learning_rate": 9.332880170637252e-06, + "loss": 0.948515772819519, + "step": 208, + "token_acc": 0.7452056186339505 + }, + { + "epoch": 0.31529323024703, + "grad_norm": 3.628021001815796, + "learning_rate": 9.324604934305911e-06, + "loss": 0.9973806142807007, + "step": 209, + "token_acc": 0.7279908024441653 + }, + { + "epoch": 0.31680181029605886, + "grad_norm": 3.6676204204559326, + "learning_rate": 9.31628240478787e-06, + "loss": 0.9946406483650208, + "step": 210, + "token_acc": 0.7349647449191207 + }, + { + "epoch": 0.3183103903450877, + "grad_norm": 2.8932945728302, + "learning_rate": 9.30791267309698e-06, + "loss": 1.0221295356750488, + "step": 211, + "token_acc": 0.7261453579807667 + }, + { + "epoch": 0.31981897039411655, + "grad_norm": 5.859443664550781, + "learning_rate": 9.299495830763285e-06, + "loss": 0.9403787851333618, + "step": 212, + "token_acc": 0.7405990558359108 + }, + { + "epoch": 0.32132755044314537, + "grad_norm": 5.388125896453857, + "learning_rate": 9.291031969832026e-06, + "loss": 1.0030494928359985, + "step": 213, + "token_acc": 0.726610358269909 + }, + { + "epoch": 0.32283613049217424, + "grad_norm": 2.6519083976745605, + "learning_rate": 9.28252118286263e-06, + "loss": 1.054288387298584, + "step": 214, + "token_acc": 0.714758255692155 + }, + { + "epoch": 0.3243447105412031, + "grad_norm": 3.33801007270813, + "learning_rate": 9.273963562927695e-06, + "loss": 1.044953465461731, + "step": 215, + "token_acc": 0.7215793963954357 + }, + { + "epoch": 0.32585329059023194, + "grad_norm": 2.166651487350464, + "learning_rate": 9.265359203611988e-06, + "loss": 1.0144579410552979, + "step": 216, + "token_acc": 0.7260201750377843 + }, + { + "epoch": 0.3273618706392608, + "grad_norm": 3.1577341556549072, + "learning_rate": 9.256708199011402e-06, + "loss": 0.9953950643539429, + "step": 217, + "token_acc": 0.7330878250195291 + }, + { + "epoch": 0.32887045068828963, + "grad_norm": 4.481351852416992, + "learning_rate": 9.248010643731936e-06, + "loss": 1.01018488407135, + "step": 218, + "token_acc": 0.7254581003046424 + }, + { + "epoch": 0.3303790307373185, + "grad_norm": 3.1495249271392822, + "learning_rate": 9.23926663288866e-06, + "loss": 0.9719182252883911, + "step": 219, + "token_acc": 0.7349775064267352 + }, + { + "epoch": 0.3318876107863474, + "grad_norm": 2.9385905265808105, + "learning_rate": 9.230476262104678e-06, + "loss": 0.9681339263916016, + "step": 220, + "token_acc": 0.7401827759597733 + }, + { + "epoch": 0.3333961908353762, + "grad_norm": 2.845468759536743, + "learning_rate": 9.221639627510076e-06, + "loss": 0.9389423727989197, + "step": 221, + "token_acc": 0.7456421052631579 + }, + { + "epoch": 0.33490477088440507, + "grad_norm": 2.5076491832733154, + "learning_rate": 9.212756825740874e-06, + "loss": 1.0208263397216797, + "step": 222, + "token_acc": 0.7267491467576792 + }, + { + "epoch": 0.3364133509334339, + "grad_norm": 3.2223143577575684, + "learning_rate": 9.203827953937969e-06, + "loss": 0.969973087310791, + "step": 223, + "token_acc": 0.7323485889440863 + }, + { + "epoch": 0.33792193098246276, + "grad_norm": 3.3483426570892334, + "learning_rate": 9.194853109746073e-06, + "loss": 1.0131691694259644, + "step": 224, + "token_acc": 0.72661853188929 + }, + { + "epoch": 0.33943051103149163, + "grad_norm": 3.0698840618133545, + "learning_rate": 9.185832391312644e-06, + "loss": 1.0323610305786133, + "step": 225, + "token_acc": 0.7202015599453215 + }, + { + "epoch": 0.34093909108052045, + "grad_norm": 3.4069857597351074, + "learning_rate": 9.176765897286812e-06, + "loss": 0.9797638058662415, + "step": 226, + "token_acc": 0.7394527962882941 + }, + { + "epoch": 0.3424476711295493, + "grad_norm": 2.7319929599761963, + "learning_rate": 9.167653726818305e-06, + "loss": 0.9821237325668335, + "step": 227, + "token_acc": 0.7339037330911524 + }, + { + "epoch": 0.34395625117857814, + "grad_norm": 3.201282024383545, + "learning_rate": 9.15849597955636e-06, + "loss": 0.9242328405380249, + "step": 228, + "token_acc": 0.7482755391866163 + }, + { + "epoch": 0.345464831227607, + "grad_norm": 2.790771484375, + "learning_rate": 9.149292755648631e-06, + "loss": 0.9432302713394165, + "step": 229, + "token_acc": 0.7443399608416292 + }, + { + "epoch": 0.3469734112766359, + "grad_norm": 4.136040687561035, + "learning_rate": 9.140044155740102e-06, + "loss": 1.0728843212127686, + "step": 230, + "token_acc": 0.7175115538421015 + }, + { + "epoch": 0.3484819913256647, + "grad_norm": 2.648467779159546, + "learning_rate": 9.130750280971978e-06, + "loss": 0.9763900637626648, + "step": 231, + "token_acc": 0.7370195872097195 + }, + { + "epoch": 0.3499905713746936, + "grad_norm": 4.50541877746582, + "learning_rate": 9.121411232980589e-06, + "loss": 1.004884958267212, + "step": 232, + "token_acc": 0.7320310977326374 + }, + { + "epoch": 0.3514991514237224, + "grad_norm": 3.075073719024658, + "learning_rate": 9.112027113896262e-06, + "loss": 0.9093083143234253, + "step": 233, + "token_acc": 0.747817920671749 + }, + { + "epoch": 0.3530077314727513, + "grad_norm": 3.8219592571258545, + "learning_rate": 9.102598026342223e-06, + "loss": 0.9917465448379517, + "step": 234, + "token_acc": 0.7359651756229361 + }, + { + "epoch": 0.35451631152178015, + "grad_norm": 2.3074402809143066, + "learning_rate": 9.093124073433464e-06, + "loss": 1.0049961805343628, + "step": 235, + "token_acc": 0.7230178852874666 + }, + { + "epoch": 0.35602489157080897, + "grad_norm": 2.5973546504974365, + "learning_rate": 9.083605358775612e-06, + "loss": 0.9772617220878601, + "step": 236, + "token_acc": 0.7335022833341697 + }, + { + "epoch": 0.35753347161983784, + "grad_norm": 2.660331964492798, + "learning_rate": 9.074041986463808e-06, + "loss": 1.0027625560760498, + "step": 237, + "token_acc": 0.7271168434197529 + }, + { + "epoch": 0.35904205166886666, + "grad_norm": 7.415755271911621, + "learning_rate": 9.064434061081562e-06, + "loss": 0.9581634402275085, + "step": 238, + "token_acc": 0.7421920053419128 + }, + { + "epoch": 0.36055063171789553, + "grad_norm": 2.6920483112335205, + "learning_rate": 9.0547816876996e-06, + "loss": 0.9932858943939209, + "step": 239, + "token_acc": 0.7322656686675253 + }, + { + "epoch": 0.3620592117669244, + "grad_norm": 2.4598066806793213, + "learning_rate": 9.045084971874738e-06, + "loss": 1.0127662420272827, + "step": 240, + "token_acc": 0.7261635563949009 + }, + { + "epoch": 0.3635677918159532, + "grad_norm": 3.3929948806762695, + "learning_rate": 9.035344019648701e-06, + "loss": 1.036333680152893, + "step": 241, + "token_acc": 0.7195195473728145 + }, + { + "epoch": 0.3650763718649821, + "grad_norm": 2.0975193977355957, + "learning_rate": 9.025558937546987e-06, + "loss": 0.9245340824127197, + "step": 242, + "token_acc": 0.7444659524311478 + }, + { + "epoch": 0.3665849519140109, + "grad_norm": 5.632428169250488, + "learning_rate": 9.015729832577681e-06, + "loss": 0.9687602519989014, + "step": 243, + "token_acc": 0.7384145379440914 + }, + { + "epoch": 0.3680935319630398, + "grad_norm": 3.5934066772460938, + "learning_rate": 9.005856812230304e-06, + "loss": 0.9666587114334106, + "step": 244, + "token_acc": 0.7311572987490624 + }, + { + "epoch": 0.36960211201206866, + "grad_norm": 2.738447427749634, + "learning_rate": 8.995939984474624e-06, + "loss": 0.9545416831970215, + "step": 245, + "token_acc": 0.7404001782740259 + }, + { + "epoch": 0.3711106920610975, + "grad_norm": 2.395606517791748, + "learning_rate": 8.98597945775948e-06, + "loss": 1.0676915645599365, + "step": 246, + "token_acc": 0.7160236041761234 + }, + { + "epoch": 0.37261927211012635, + "grad_norm": 2.7077395915985107, + "learning_rate": 8.975975341011595e-06, + "loss": 1.0289101600646973, + "step": 247, + "token_acc": 0.7184290562877419 + }, + { + "epoch": 0.37412785215915517, + "grad_norm": 2.6003077030181885, + "learning_rate": 8.96592774363439e-06, + "loss": 0.9919425845146179, + "step": 248, + "token_acc": 0.7328744590935546 + }, + { + "epoch": 0.37563643220818405, + "grad_norm": 4.835913181304932, + "learning_rate": 8.955836775506776e-06, + "loss": 0.9712538123130798, + "step": 249, + "token_acc": 0.7309789414870046 + }, + { + "epoch": 0.3771450122572129, + "grad_norm": 2.7414259910583496, + "learning_rate": 8.94570254698197e-06, + "loss": 0.9899940490722656, + "step": 250, + "token_acc": 0.7366128681007573 + }, + { + "epoch": 0.37865359230624174, + "grad_norm": 2.38342547416687, + "learning_rate": 8.935525168886263e-06, + "loss": 1.0229578018188477, + "step": 251, + "token_acc": 0.728451741209326 + }, + { + "epoch": 0.3801621723552706, + "grad_norm": 2.9305362701416016, + "learning_rate": 8.92530475251784e-06, + "loss": 0.9926273822784424, + "step": 252, + "token_acc": 0.7352914075097904 + }, + { + "epoch": 0.38167075240429943, + "grad_norm": 2.3433339595794678, + "learning_rate": 8.91504140964553e-06, + "loss": 0.892216682434082, + "step": 253, + "token_acc": 0.7589404880052018 + }, + { + "epoch": 0.3831793324533283, + "grad_norm": 2.823096752166748, + "learning_rate": 8.90473525250761e-06, + "loss": 1.0185294151306152, + "step": 254, + "token_acc": 0.7276133743274404 + }, + { + "epoch": 0.3846879125023572, + "grad_norm": 2.260789394378662, + "learning_rate": 8.894386393810563e-06, + "loss": 1.018679141998291, + "step": 255, + "token_acc": 0.7209881868066039 + }, + { + "epoch": 0.386196492551386, + "grad_norm": 2.515350818634033, + "learning_rate": 8.883994946727848e-06, + "loss": 0.9378585815429688, + "step": 256, + "token_acc": 0.7467327754213 + }, + { + "epoch": 0.38770507260041487, + "grad_norm": 2.0592963695526123, + "learning_rate": 8.873561024898668e-06, + "loss": 0.9734134078025818, + "step": 257, + "token_acc": 0.7326388109297469 + }, + { + "epoch": 0.3892136526494437, + "grad_norm": 2.2741405963897705, + "learning_rate": 8.863084742426719e-06, + "loss": 0.9801385402679443, + "step": 258, + "token_acc": 0.7324649567151779 + }, + { + "epoch": 0.39072223269847256, + "grad_norm": 2.290621280670166, + "learning_rate": 8.852566213878947e-06, + "loss": 0.9336638450622559, + "step": 259, + "token_acc": 0.7439496855345912 + }, + { + "epoch": 0.39223081274750143, + "grad_norm": 2.0970897674560547, + "learning_rate": 8.842005554284296e-06, + "loss": 0.9606438279151917, + "step": 260, + "token_acc": 0.7393114525434833 + }, + { + "epoch": 0.39373939279653025, + "grad_norm": 2.5087926387786865, + "learning_rate": 8.831402879132447e-06, + "loss": 1.0091387033462524, + "step": 261, + "token_acc": 0.7286905715497338 + }, + { + "epoch": 0.3952479728455591, + "grad_norm": 2.636766195297241, + "learning_rate": 8.820758304372557e-06, + "loss": 0.9792462587356567, + "step": 262, + "token_acc": 0.7359910560903117 + }, + { + "epoch": 0.39675655289458794, + "grad_norm": 2.7607264518737793, + "learning_rate": 8.810071946411989e-06, + "loss": 1.0698983669281006, + "step": 263, + "token_acc": 0.7178458546073426 + }, + { + "epoch": 0.3982651329436168, + "grad_norm": 4.028689384460449, + "learning_rate": 8.799343922115045e-06, + "loss": 1.0240118503570557, + "step": 264, + "token_acc": 0.7290772320901107 + }, + { + "epoch": 0.3997737129926457, + "grad_norm": 2.4673407077789307, + "learning_rate": 8.788574348801676e-06, + "loss": 0.9736855030059814, + "step": 265, + "token_acc": 0.7383697051011746 + }, + { + "epoch": 0.4012822930416745, + "grad_norm": 2.1820874214172363, + "learning_rate": 8.777763344246209e-06, + "loss": 0.9662646055221558, + "step": 266, + "token_acc": 0.7370327149812885 + }, + { + "epoch": 0.4027908730907034, + "grad_norm": 2.803478717803955, + "learning_rate": 8.766911026676063e-06, + "loss": 1.0034945011138916, + "step": 267, + "token_acc": 0.7319846310671999 + }, + { + "epoch": 0.4042994531397322, + "grad_norm": 2.1364336013793945, + "learning_rate": 8.756017514770444e-06, + "loss": 0.9319308996200562, + "step": 268, + "token_acc": 0.7427066109912284 + }, + { + "epoch": 0.4058080331887611, + "grad_norm": 3.1203558444976807, + "learning_rate": 8.745082927659048e-06, + "loss": 1.0237950086593628, + "step": 269, + "token_acc": 0.72509035741377 + }, + { + "epoch": 0.40731661323778995, + "grad_norm": 2.2100014686584473, + "learning_rate": 8.734107384920771e-06, + "loss": 1.0081058740615845, + "step": 270, + "token_acc": 0.7289230891593096 + }, + { + "epoch": 0.40882519328681877, + "grad_norm": 3.033102035522461, + "learning_rate": 8.72309100658239e-06, + "loss": 1.0360187292099, + "step": 271, + "token_acc": 0.7178113736483781 + }, + { + "epoch": 0.41033377333584764, + "grad_norm": 2.039494514465332, + "learning_rate": 8.71203391311725e-06, + "loss": 0.9908350706100464, + "step": 272, + "token_acc": 0.7327832561883679 + }, + { + "epoch": 0.41184235338487646, + "grad_norm": 3.4466865062713623, + "learning_rate": 8.700936225443958e-06, + "loss": 0.9967418909072876, + "step": 273, + "token_acc": 0.7297109021246952 + }, + { + "epoch": 0.41335093343390533, + "grad_norm": 2.207625389099121, + "learning_rate": 8.689798064925049e-06, + "loss": 0.9572864174842834, + "step": 274, + "token_acc": 0.7418617132989522 + }, + { + "epoch": 0.4148595134829342, + "grad_norm": 2.7611567974090576, + "learning_rate": 8.67861955336566e-06, + "loss": 0.9642038941383362, + "step": 275, + "token_acc": 0.7372177193250752 + }, + { + "epoch": 0.416368093531963, + "grad_norm": 2.314100742340088, + "learning_rate": 8.6674008130122e-06, + "loss": 0.9624761343002319, + "step": 276, + "token_acc": 0.7423471956499849 + }, + { + "epoch": 0.4178766735809919, + "grad_norm": 1.7179359197616577, + "learning_rate": 8.65614196655102e-06, + "loss": 0.9623450040817261, + "step": 277, + "token_acc": 0.7373785407453199 + }, + { + "epoch": 0.4193852536300207, + "grad_norm": 1.5303754806518555, + "learning_rate": 8.644843137107058e-06, + "loss": 0.9889683127403259, + "step": 278, + "token_acc": 0.7297831325301205 + }, + { + "epoch": 0.4208938336790496, + "grad_norm": 3.288228750228882, + "learning_rate": 8.633504448242504e-06, + "loss": 0.8794506788253784, + "step": 279, + "token_acc": 0.7640624274234753 + }, + { + "epoch": 0.42240241372807846, + "grad_norm": 2.824843645095825, + "learning_rate": 8.622126023955446e-06, + "loss": 0.9805803298950195, + "step": 280, + "token_acc": 0.7379739523159332 + }, + { + "epoch": 0.4239109937771073, + "grad_norm": 2.7605974674224854, + "learning_rate": 8.610707988678504e-06, + "loss": 0.9847573041915894, + "step": 281, + "token_acc": 0.7307890443891131 + }, + { + "epoch": 0.42541957382613615, + "grad_norm": 2.8286962509155273, + "learning_rate": 8.599250467277483e-06, + "loss": 0.9010330438613892, + "step": 282, + "token_acc": 0.7584281842818428 + }, + { + "epoch": 0.42692815387516503, + "grad_norm": 2.3051202297210693, + "learning_rate": 8.587753585050004e-06, + "loss": 0.9498060941696167, + "step": 283, + "token_acc": 0.7399619862200048 + }, + { + "epoch": 0.42843673392419385, + "grad_norm": 1.5281521081924438, + "learning_rate": 8.576217467724129e-06, + "loss": 0.8279926776885986, + "step": 284, + "token_acc": 0.7709903083450399 + }, + { + "epoch": 0.4299453139732227, + "grad_norm": 2.368086576461792, + "learning_rate": 8.564642241456986e-06, + "loss": 0.9744131565093994, + "step": 285, + "token_acc": 0.7365811138014527 + }, + { + "epoch": 0.43145389402225154, + "grad_norm": 2.38155460357666, + "learning_rate": 8.553028032833397e-06, + "loss": 0.9327294826507568, + "step": 286, + "token_acc": 0.7441484879692912 + }, + { + "epoch": 0.4329624740712804, + "grad_norm": 3.084064245223999, + "learning_rate": 8.541374968864486e-06, + "loss": 0.9276280403137207, + "step": 287, + "token_acc": 0.7435728931181326 + }, + { + "epoch": 0.4344710541203093, + "grad_norm": 3.183046817779541, + "learning_rate": 8.529683176986295e-06, + "loss": 0.9687259197235107, + "step": 288, + "token_acc": 0.7369287340220627 + }, + { + "epoch": 0.4359796341693381, + "grad_norm": 3.0860378742218018, + "learning_rate": 8.517952785058385e-06, + "loss": 0.9527791738510132, + "step": 289, + "token_acc": 0.7423072245195816 + }, + { + "epoch": 0.437488214218367, + "grad_norm": 2.124403953552246, + "learning_rate": 8.506183921362443e-06, + "loss": 0.9727620482444763, + "step": 290, + "token_acc": 0.734139341759785 + }, + { + "epoch": 0.4389967942673958, + "grad_norm": 3.0881552696228027, + "learning_rate": 8.494376714600878e-06, + "loss": 0.9623801708221436, + "step": 291, + "token_acc": 0.7405262384242246 + }, + { + "epoch": 0.44050537431642467, + "grad_norm": 2.0167603492736816, + "learning_rate": 8.482531293895412e-06, + "loss": 0.9587442278862, + "step": 292, + "token_acc": 0.7406382978723405 + }, + { + "epoch": 0.44201395436545354, + "grad_norm": 4.438661575317383, + "learning_rate": 8.470647788785665e-06, + "loss": 1.0010523796081543, + "step": 293, + "token_acc": 0.727760519627224 + }, + { + "epoch": 0.44352253441448236, + "grad_norm": 2.1790931224823, + "learning_rate": 8.458726329227748e-06, + "loss": 1.0261417627334595, + "step": 294, + "token_acc": 0.7254366401884381 + }, + { + "epoch": 0.44503111446351123, + "grad_norm": 2.5913753509521484, + "learning_rate": 8.446767045592829e-06, + "loss": 0.9819632768630981, + "step": 295, + "token_acc": 0.734521470670783 + }, + { + "epoch": 0.44653969451254005, + "grad_norm": 3.203066349029541, + "learning_rate": 8.434770068665723e-06, + "loss": 0.8786149024963379, + "step": 296, + "token_acc": 0.757187126070015 + }, + { + "epoch": 0.4480482745615689, + "grad_norm": 3.1226532459259033, + "learning_rate": 8.422735529643445e-06, + "loss": 0.9815603494644165, + "step": 297, + "token_acc": 0.7386603232956696 + }, + { + "epoch": 0.4495568546105978, + "grad_norm": 2.7377941608428955, + "learning_rate": 8.410663560133784e-06, + "loss": 0.9410181641578674, + "step": 298, + "token_acc": 0.7507978090021433 + }, + { + "epoch": 0.4510654346596266, + "grad_norm": 2.268411159515381, + "learning_rate": 8.398554292153866e-06, + "loss": 0.9518625140190125, + "step": 299, + "token_acc": 0.7410614742409707 + }, + { + "epoch": 0.4525740147086555, + "grad_norm": 2.549682855606079, + "learning_rate": 8.386407858128707e-06, + "loss": 0.9655609130859375, + "step": 300, + "token_acc": 0.7375617986345444 + }, + { + "epoch": 0.4540825947576843, + "grad_norm": 2.066896915435791, + "learning_rate": 8.37422439088976e-06, + "loss": 0.9887623190879822, + "step": 301, + "token_acc": 0.7282220212720149 + }, + { + "epoch": 0.4555911748067132, + "grad_norm": 2.3282089233398438, + "learning_rate": 8.362004023673473e-06, + "loss": 1.029325008392334, + "step": 302, + "token_acc": 0.7192592093138247 + }, + { + "epoch": 0.45709975485574206, + "grad_norm": 2.1860337257385254, + "learning_rate": 8.349746890119826e-06, + "loss": 0.9645460844039917, + "step": 303, + "token_acc": 0.7373355683723044 + }, + { + "epoch": 0.4586083349047709, + "grad_norm": 1.862546443939209, + "learning_rate": 8.337453124270864e-06, + "loss": 0.9768998622894287, + "step": 304, + "token_acc": 0.7343086163956384 + }, + { + "epoch": 0.46011691495379975, + "grad_norm": 2.1919784545898438, + "learning_rate": 8.325122860569241e-06, + "loss": 1.0176093578338623, + "step": 305, + "token_acc": 0.7260874452062493 + }, + { + "epoch": 0.46162549500282857, + "grad_norm": 2.3253211975097656, + "learning_rate": 8.31275623385675e-06, + "loss": 1.0008552074432373, + "step": 306, + "token_acc": 0.7298423627468744 + }, + { + "epoch": 0.46313407505185744, + "grad_norm": 2.2187750339508057, + "learning_rate": 8.300353379372834e-06, + "loss": 0.9586999416351318, + "step": 307, + "token_acc": 0.7410419920180461 + }, + { + "epoch": 0.4646426551008863, + "grad_norm": 2.745685577392578, + "learning_rate": 8.287914432753123e-06, + "loss": 0.9785603880882263, + "step": 308, + "token_acc": 0.7318993987636548 + }, + { + "epoch": 0.46615123514991513, + "grad_norm": 1.8214253187179565, + "learning_rate": 8.275439530027948e-06, + "loss": 1.029813528060913, + "step": 309, + "token_acc": 0.723904790342533 + }, + { + "epoch": 0.467659815198944, + "grad_norm": 1.8105900287628174, + "learning_rate": 8.262928807620843e-06, + "loss": 0.9917140007019043, + "step": 310, + "token_acc": 0.7305203367481489 + }, + { + "epoch": 0.4691683952479728, + "grad_norm": 1.7886117696762085, + "learning_rate": 8.250382402347066e-06, + "loss": 1.0178697109222412, + "step": 311, + "token_acc": 0.7176047319958534 + }, + { + "epoch": 0.4706769752970017, + "grad_norm": 2.6013174057006836, + "learning_rate": 8.237800451412095e-06, + "loss": 0.8847721815109253, + "step": 312, + "token_acc": 0.7515454425884519 + }, + { + "epoch": 0.47218555534603057, + "grad_norm": 3.7195141315460205, + "learning_rate": 8.225183092410128e-06, + "loss": 0.9107895493507385, + "step": 313, + "token_acc": 0.7552759144102238 + }, + { + "epoch": 0.4736941353950594, + "grad_norm": 2.3263838291168213, + "learning_rate": 8.212530463322584e-06, + "loss": 1.0151785612106323, + "step": 314, + "token_acc": 0.7158458244111349 + }, + { + "epoch": 0.47520271544408826, + "grad_norm": 2.4454832077026367, + "learning_rate": 8.199842702516584e-06, + "loss": 0.9493778944015503, + "step": 315, + "token_acc": 0.7430618744313012 + }, + { + "epoch": 0.4767112954931171, + "grad_norm": 1.86771821975708, + "learning_rate": 8.18711994874345e-06, + "loss": 1.0082162618637085, + "step": 316, + "token_acc": 0.7245781645636955 + }, + { + "epoch": 0.47821987554214596, + "grad_norm": 2.4932756423950195, + "learning_rate": 8.174362341137177e-06, + "loss": 1.0193214416503906, + "step": 317, + "token_acc": 0.7237573787321413 + }, + { + "epoch": 0.47972845559117483, + "grad_norm": 2.155308485031128, + "learning_rate": 8.161570019212921e-06, + "loss": 1.0259885787963867, + "step": 318, + "token_acc": 0.7231689158847581 + }, + { + "epoch": 0.48123703564020365, + "grad_norm": 2.636300563812256, + "learning_rate": 8.148743122865463e-06, + "loss": 0.9795339703559875, + "step": 319, + "token_acc": 0.7246668983895261 + }, + { + "epoch": 0.4827456156892325, + "grad_norm": 2.6430575847625732, + "learning_rate": 8.135881792367686e-06, + "loss": 0.895164430141449, + "step": 320, + "token_acc": 0.7505640379873025 + }, + { + "epoch": 0.48425419573826134, + "grad_norm": 2.0760655403137207, + "learning_rate": 8.12298616836904e-06, + "loss": 0.9317203760147095, + "step": 321, + "token_acc": 0.7466990335314669 + }, + { + "epoch": 0.4857627757872902, + "grad_norm": 2.3329524993896484, + "learning_rate": 8.110056391894005e-06, + "loss": 0.9701822996139526, + "step": 322, + "token_acc": 0.738693231210206 + }, + { + "epoch": 0.4872713558363191, + "grad_norm": 2.4405055046081543, + "learning_rate": 8.097092604340543e-06, + "loss": 0.9560205936431885, + "step": 323, + "token_acc": 0.7387687598736177 + }, + { + "epoch": 0.4887799358853479, + "grad_norm": 3.081310272216797, + "learning_rate": 8.084094947478556e-06, + "loss": 0.9925979971885681, + "step": 324, + "token_acc": 0.7339661677011075 + }, + { + "epoch": 0.4902885159343768, + "grad_norm": 2.617116928100586, + "learning_rate": 8.071063563448341e-06, + "loss": 0.863371729850769, + "step": 325, + "token_acc": 0.7616929966413734 + }, + { + "epoch": 0.4917970959834056, + "grad_norm": 2.267735481262207, + "learning_rate": 8.057998594759022e-06, + "loss": 0.9829203486442566, + "step": 326, + "token_acc": 0.732360893450669 + }, + { + "epoch": 0.49330567603243447, + "grad_norm": 2.4509315490722656, + "learning_rate": 8.044900184287007e-06, + "loss": 0.9475609064102173, + "step": 327, + "token_acc": 0.7407449830477413 + }, + { + "epoch": 0.49481425608146334, + "grad_norm": 2.1019883155822754, + "learning_rate": 8.031768475274412e-06, + "loss": 0.9428449273109436, + "step": 328, + "token_acc": 0.7364185554281683 + }, + { + "epoch": 0.49632283613049216, + "grad_norm": 2.3447651863098145, + "learning_rate": 8.018603611327505e-06, + "loss": 0.9480220675468445, + "step": 329, + "token_acc": 0.7407305163995238 + }, + { + "epoch": 0.49783141617952104, + "grad_norm": 3.0796406269073486, + "learning_rate": 8.005405736415127e-06, + "loss": 0.9253246784210205, + "step": 330, + "token_acc": 0.7462672072640754 + }, + { + "epoch": 0.49933999622854985, + "grad_norm": 2.8936774730682373, + "learning_rate": 7.992174994867124e-06, + "loss": 0.9145048260688782, + "step": 331, + "token_acc": 0.752209261223047 + }, + { + "epoch": 0.5008485762775787, + "grad_norm": 3.5692570209503174, + "learning_rate": 7.978911531372764e-06, + "loss": 0.9855884313583374, + "step": 332, + "token_acc": 0.7315728526835162 + }, + { + "epoch": 0.5023571563266076, + "grad_norm": 2.4607326984405518, + "learning_rate": 7.965615490979165e-06, + "loss": 0.9835842847824097, + "step": 333, + "token_acc": 0.7292859799181852 + }, + { + "epoch": 0.5038657363756365, + "grad_norm": 1.9839391708374023, + "learning_rate": 7.952287019089686e-06, + "loss": 0.9971153736114502, + "step": 334, + "token_acc": 0.7246776559182154 + }, + { + "epoch": 0.5053743164246652, + "grad_norm": 1.7717572450637817, + "learning_rate": 7.938926261462366e-06, + "loss": 0.9655886888504028, + "step": 335, + "token_acc": 0.7381942223643535 + }, + { + "epoch": 0.5068828964736941, + "grad_norm": 1.8665540218353271, + "learning_rate": 7.925533364208308e-06, + "loss": 0.8911811113357544, + "step": 336, + "token_acc": 0.7581819636712811 + }, + { + "epoch": 0.508391476522723, + "grad_norm": 2.563227415084839, + "learning_rate": 7.912108473790092e-06, + "loss": 0.9800481796264648, + "step": 337, + "token_acc": 0.7293876154635648 + }, + { + "epoch": 0.5099000565717519, + "grad_norm": 1.981688380241394, + "learning_rate": 7.898651737020166e-06, + "loss": 0.9202637076377869, + "step": 338, + "token_acc": 0.7509830611010284 + }, + { + "epoch": 0.5114086366207807, + "grad_norm": 2.020841121673584, + "learning_rate": 7.885163301059251e-06, + "loss": 0.9376657605171204, + "step": 339, + "token_acc": 0.7413153163602622 + }, + { + "epoch": 0.5129172166698095, + "grad_norm": 2.4810831546783447, + "learning_rate": 7.871643313414718e-06, + "loss": 1.02794349193573, + "step": 340, + "token_acc": 0.7219416145877922 + }, + { + "epoch": 0.5144257967188384, + "grad_norm": 2.2617504596710205, + "learning_rate": 7.858091921938989e-06, + "loss": 0.9320923089981079, + "step": 341, + "token_acc": 0.7445262522631262 + }, + { + "epoch": 0.5159343767678672, + "grad_norm": 3.3374249935150146, + "learning_rate": 7.844509274827907e-06, + "loss": 0.935370922088623, + "step": 342, + "token_acc": 0.7429917893407181 + }, + { + "epoch": 0.5174429568168961, + "grad_norm": 2.1883275508880615, + "learning_rate": 7.830895520619129e-06, + "loss": 0.9605501890182495, + "step": 343, + "token_acc": 0.7344315431920609 + }, + { + "epoch": 0.518951536865925, + "grad_norm": 1.9800795316696167, + "learning_rate": 7.817250808190483e-06, + "loss": 1.0150222778320312, + "step": 344, + "token_acc": 0.7218973209126003 + }, + { + "epoch": 0.5204601169149538, + "grad_norm": 1.9465405941009521, + "learning_rate": 7.803575286758365e-06, + "loss": 0.9980044364929199, + "step": 345, + "token_acc": 0.7322597425115956 + }, + { + "epoch": 0.5219686969639826, + "grad_norm": 1.9981352090835571, + "learning_rate": 7.789869105876083e-06, + "loss": 1.0256046056747437, + "step": 346, + "token_acc": 0.7290106738888004 + }, + { + "epoch": 0.5234772770130115, + "grad_norm": 1.9621007442474365, + "learning_rate": 7.776132415432234e-06, + "loss": 0.9643725752830505, + "step": 347, + "token_acc": 0.7369925018288223 + }, + { + "epoch": 0.5249858570620404, + "grad_norm": 2.209317207336426, + "learning_rate": 7.762365365649068e-06, + "loss": 0.9680784940719604, + "step": 348, + "token_acc": 0.7380459970597762 + }, + { + "epoch": 0.5264944371110692, + "grad_norm": 2.1109721660614014, + "learning_rate": 7.748568107080831e-06, + "loss": 0.9282352924346924, + "step": 349, + "token_acc": 0.7414129013317676 + }, + { + "epoch": 0.528003017160098, + "grad_norm": 2.1684558391571045, + "learning_rate": 7.734740790612137e-06, + "loss": 0.9311020970344543, + "step": 350, + "token_acc": 0.7453566411737163 + }, + { + "epoch": 0.5295115972091269, + "grad_norm": 2.010829210281372, + "learning_rate": 7.720883567456299e-06, + "loss": 0.9590600728988647, + "step": 351, + "token_acc": 0.7433301548012741 + }, + { + "epoch": 0.5310201772581558, + "grad_norm": 2.085218906402588, + "learning_rate": 7.70699658915369e-06, + "loss": 0.9398245811462402, + "step": 352, + "token_acc": 0.7430573627272389 + }, + { + "epoch": 0.5325287573071846, + "grad_norm": 1.5894458293914795, + "learning_rate": 7.693080007570084e-06, + "loss": 1.0644735097885132, + "step": 353, + "token_acc": 0.7161714080550804 + }, + { + "epoch": 0.5340373373562135, + "grad_norm": 1.8693451881408691, + "learning_rate": 7.679133974894984e-06, + "loss": 0.9500839710235596, + "step": 354, + "token_acc": 0.7419489928388961 + }, + { + "epoch": 0.5355459174052423, + "grad_norm": 2.5866587162017822, + "learning_rate": 7.66515864363997e-06, + "loss": 0.9353429079055786, + "step": 355, + "token_acc": 0.7461213464468763 + }, + { + "epoch": 0.5370544974542711, + "grad_norm": 2.532707929611206, + "learning_rate": 7.651154166637025e-06, + "loss": 0.887712836265564, + "step": 356, + "token_acc": 0.7582956823554052 + }, + { + "epoch": 0.5385630775033, + "grad_norm": 2.0619170665740967, + "learning_rate": 7.637120697036866e-06, + "loss": 0.9981312155723572, + "step": 357, + "token_acc": 0.731128759276136 + }, + { + "epoch": 0.5400716575523289, + "grad_norm": 1.5410892963409424, + "learning_rate": 7.62305838830727e-06, + "loss": 0.9933099746704102, + "step": 358, + "token_acc": 0.7289849403481322 + }, + { + "epoch": 0.5415802376013578, + "grad_norm": 1.658732295036316, + "learning_rate": 7.608967394231387e-06, + "loss": 0.9434093236923218, + "step": 359, + "token_acc": 0.7418413943146771 + }, + { + "epoch": 0.5430888176503865, + "grad_norm": 1.6756763458251953, + "learning_rate": 7.594847868906076e-06, + "loss": 0.9273457527160645, + "step": 360, + "token_acc": 0.7412127214702322 + }, + { + "epoch": 0.5445973976994154, + "grad_norm": 2.145250082015991, + "learning_rate": 7.580699966740201e-06, + "loss": 0.9404428601264954, + "step": 361, + "token_acc": 0.7464569005295423 + }, + { + "epoch": 0.5461059777484443, + "grad_norm": 1.704442024230957, + "learning_rate": 7.566523842452958e-06, + "loss": 0.8951021432876587, + "step": 362, + "token_acc": 0.7574827023244967 + }, + { + "epoch": 0.5476145577974731, + "grad_norm": 1.6698086261749268, + "learning_rate": 7.552319651072164e-06, + "loss": 0.922804594039917, + "step": 363, + "token_acc": 0.7422231796639379 + }, + { + "epoch": 0.549123137846502, + "grad_norm": 1.8719260692596436, + "learning_rate": 7.5380875479325855e-06, + "loss": 0.9571307897567749, + "step": 364, + "token_acc": 0.734748408040954 + }, + { + "epoch": 0.5506317178955308, + "grad_norm": 1.652816653251648, + "learning_rate": 7.52382768867422e-06, + "loss": 0.9770803451538086, + "step": 365, + "token_acc": 0.735107209802039 + }, + { + "epoch": 0.5521402979445597, + "grad_norm": 2.688976764678955, + "learning_rate": 7.509540229240601e-06, + "loss": 0.8821839690208435, + "step": 366, + "token_acc": 0.7596353949887622 + }, + { + "epoch": 0.5536488779935885, + "grad_norm": 2.0760059356689453, + "learning_rate": 7.4952253258771036e-06, + "loss": 0.9600338339805603, + "step": 367, + "token_acc": 0.7365659340659341 + }, + { + "epoch": 0.5551574580426174, + "grad_norm": 1.8999563455581665, + "learning_rate": 7.480883135129211e-06, + "loss": 1.021796703338623, + "step": 368, + "token_acc": 0.7261275609932673 + }, + { + "epoch": 0.5566660380916463, + "grad_norm": 2.993677854537964, + "learning_rate": 7.4665138138408255e-06, + "loss": 0.9661844372749329, + "step": 369, + "token_acc": 0.7392967841921736 + }, + { + "epoch": 0.558174618140675, + "grad_norm": 2.258496046066284, + "learning_rate": 7.452117519152542e-06, + "loss": 0.9570837020874023, + "step": 370, + "token_acc": 0.736274857056115 + }, + { + "epoch": 0.5596831981897039, + "grad_norm": 2.13960862159729, + "learning_rate": 7.437694408499932e-06, + "loss": 0.9515930414199829, + "step": 371, + "token_acc": 0.7369614512471655 + }, + { + "epoch": 0.5611917782387328, + "grad_norm": 1.8820668458938599, + "learning_rate": 7.4232446396118265e-06, + "loss": 0.9325123429298401, + "step": 372, + "token_acc": 0.7468182791350477 + }, + { + "epoch": 0.5627003582877617, + "grad_norm": 2.1882309913635254, + "learning_rate": 7.408768370508577e-06, + "loss": 1.0136504173278809, + "step": 373, + "token_acc": 0.7238842696920248 + }, + { + "epoch": 0.5642089383367905, + "grad_norm": 2.532456874847412, + "learning_rate": 7.394265759500348e-06, + "loss": 0.9706389904022217, + "step": 374, + "token_acc": 0.7339384859434211 + }, + { + "epoch": 0.5657175183858193, + "grad_norm": 2.2703211307525635, + "learning_rate": 7.379736965185369e-06, + "loss": 0.942437469959259, + "step": 375, + "token_acc": 0.7420059940960407 + }, + { + "epoch": 0.5672260984348482, + "grad_norm": 2.049642562866211, + "learning_rate": 7.365182146448205e-06, + "loss": 0.9510753154754639, + "step": 376, + "token_acc": 0.7403331840995311 + }, + { + "epoch": 0.568734678483877, + "grad_norm": 4.985785484313965, + "learning_rate": 7.350601462458025e-06, + "loss": 0.9692330360412598, + "step": 377, + "token_acc": 0.7308537803930276 + }, + { + "epoch": 0.5702432585329059, + "grad_norm": 1.8394250869750977, + "learning_rate": 7.335995072666848e-06, + "loss": 0.9120851755142212, + "step": 378, + "token_acc": 0.7455035971223022 + }, + { + "epoch": 0.5717518385819348, + "grad_norm": 2.4557979106903076, + "learning_rate": 7.3213631368078196e-06, + "loss": 0.9210205674171448, + "step": 379, + "token_acc": 0.7491070698257764 + }, + { + "epoch": 0.5732604186309636, + "grad_norm": 3.0784006118774414, + "learning_rate": 7.30670581489344e-06, + "loss": 0.9026201367378235, + "step": 380, + "token_acc": 0.752652236427933 + }, + { + "epoch": 0.5747689986799924, + "grad_norm": 2.210352659225464, + "learning_rate": 7.292023267213836e-06, + "loss": 0.9767452478408813, + "step": 381, + "token_acc": 0.731851653501138 + }, + { + "epoch": 0.5762775787290213, + "grad_norm": 1.9603726863861084, + "learning_rate": 7.2773156543349965e-06, + "loss": 0.9445821046829224, + "step": 382, + "token_acc": 0.7441293003478933 + }, + { + "epoch": 0.5777861587780502, + "grad_norm": 2.0910959243774414, + "learning_rate": 7.262583137097019e-06, + "loss": 0.9755586385726929, + "step": 383, + "token_acc": 0.735424681858538 + }, + { + "epoch": 0.579294738827079, + "grad_norm": 2.012289047241211, + "learning_rate": 7.247825876612353e-06, + "loss": 0.9555764198303223, + "step": 384, + "token_acc": 0.7414517367541442 + }, + { + "epoch": 0.5808033188761078, + "grad_norm": 2.108126640319824, + "learning_rate": 7.233044034264034e-06, + "loss": 0.9527639746665955, + "step": 385, + "token_acc": 0.740637830216066 + }, + { + "epoch": 0.5823118989251367, + "grad_norm": 1.8331726789474487, + "learning_rate": 7.218237771703921e-06, + "loss": 0.9224135875701904, + "step": 386, + "token_acc": 0.7494916632777552 + }, + { + "epoch": 0.5838204789741656, + "grad_norm": 2.3606057167053223, + "learning_rate": 7.203407250850929e-06, + "loss": 0.9600188732147217, + "step": 387, + "token_acc": 0.740285320352289 + }, + { + "epoch": 0.5853290590231944, + "grad_norm": 2.54732346534729, + "learning_rate": 7.18855263388926e-06, + "loss": 0.9573134183883667, + "step": 388, + "token_acc": 0.7416639263400198 + }, + { + "epoch": 0.5868376390722233, + "grad_norm": 1.9928252696990967, + "learning_rate": 7.173674083266624e-06, + "loss": 0.9796591997146606, + "step": 389, + "token_acc": 0.7336936301866103 + }, + { + "epoch": 0.5883462191212521, + "grad_norm": 2.5923011302948, + "learning_rate": 7.158771761692464e-06, + "loss": 0.9223604798316956, + "step": 390, + "token_acc": 0.7507637684749401 + }, + { + "epoch": 0.5898547991702809, + "grad_norm": 2.480717182159424, + "learning_rate": 7.143845832136188e-06, + "loss": 0.8255516290664673, + "step": 391, + "token_acc": 0.7721612290025992 + }, + { + "epoch": 0.5913633792193098, + "grad_norm": 2.365816831588745, + "learning_rate": 7.128896457825364e-06, + "loss": 0.9118064641952515, + "step": 392, + "token_acc": 0.750064339872904 + }, + { + "epoch": 0.5928719592683387, + "grad_norm": 3.1344282627105713, + "learning_rate": 7.113923802243957e-06, + "loss": 1.0114411115646362, + "step": 393, + "token_acc": 0.7271490640354082 + }, + { + "epoch": 0.5943805393173676, + "grad_norm": 2.9526679515838623, + "learning_rate": 7.098928029130529e-06, + "loss": 0.9530540704727173, + "step": 394, + "token_acc": 0.7385640798716743 + }, + { + "epoch": 0.5958891193663963, + "grad_norm": 22.099040985107422, + "learning_rate": 7.083909302476453e-06, + "loss": 0.987083375453949, + "step": 395, + "token_acc": 0.7309508816120907 + }, + { + "epoch": 0.5973976994154252, + "grad_norm": 2.393683671951294, + "learning_rate": 7.068867786524116e-06, + "loss": 0.9549717903137207, + "step": 396, + "token_acc": 0.7391963420392137 + }, + { + "epoch": 0.5989062794644541, + "grad_norm": 2.490553140640259, + "learning_rate": 7.053803645765128e-06, + "loss": 0.9989736676216125, + "step": 397, + "token_acc": 0.7295440101684144 + }, + { + "epoch": 0.600414859513483, + "grad_norm": 3.546765089035034, + "learning_rate": 7.038717044938519e-06, + "loss": 0.9882051348686218, + "step": 398, + "token_acc": 0.7351974743895368 + }, + { + "epoch": 0.6019234395625118, + "grad_norm": 2.6756155490875244, + "learning_rate": 7.023608149028936e-06, + "loss": 0.9170738458633423, + "step": 399, + "token_acc": 0.7507165089833276 + }, + { + "epoch": 0.6034320196115406, + "grad_norm": 2.210718870162964, + "learning_rate": 7.008477123264849e-06, + "loss": 0.91643226146698, + "step": 400, + "token_acc": 0.7500736562986657 + }, + { + "epoch": 0.6049405996605695, + "grad_norm": 2.841707468032837, + "learning_rate": 6.993324133116726e-06, + "loss": 1.0150797367095947, + "step": 401, + "token_acc": 0.7221038641205558 + }, + { + "epoch": 0.6064491797095983, + "grad_norm": 2.1029279232025146, + "learning_rate": 6.978149344295242e-06, + "loss": 0.9783422946929932, + "step": 402, + "token_acc": 0.7347534348705933 + }, + { + "epoch": 0.6079577597586272, + "grad_norm": 13.042510032653809, + "learning_rate": 6.9629529227494575e-06, + "loss": 0.9919490814208984, + "step": 403, + "token_acc": 0.7255470473075112 + }, + { + "epoch": 0.6094663398076561, + "grad_norm": 2.5895752906799316, + "learning_rate": 6.9477350346650016e-06, + "loss": 0.9375113248825073, + "step": 404, + "token_acc": 0.7456939702749185 + }, + { + "epoch": 0.610974919856685, + "grad_norm": 1.8473247289657593, + "learning_rate": 6.932495846462262e-06, + "loss": 0.9944723844528198, + "step": 405, + "token_acc": 0.7213679175012001 + }, + { + "epoch": 0.6124834999057137, + "grad_norm": 2.5757694244384766, + "learning_rate": 6.9172355247945586e-06, + "loss": 0.9606615304946899, + "step": 406, + "token_acc": 0.7381708709643933 + }, + { + "epoch": 0.6139920799547426, + "grad_norm": 2.6024703979492188, + "learning_rate": 6.901954236546324e-06, + "loss": 0.9863815307617188, + "step": 407, + "token_acc": 0.7321535260306146 + }, + { + "epoch": 0.6155006600037715, + "grad_norm": 2.2890546321868896, + "learning_rate": 6.88665214883128e-06, + "loss": 0.960534930229187, + "step": 408, + "token_acc": 0.7413778682354253 + }, + { + "epoch": 0.6170092400528003, + "grad_norm": 2.486588954925537, + "learning_rate": 6.871329428990602e-06, + "loss": 0.9164827466011047, + "step": 409, + "token_acc": 0.7509202815325264 + }, + { + "epoch": 0.6185178201018292, + "grad_norm": 2.4943809509277344, + "learning_rate": 6.855986244591104e-06, + "loss": 0.9710013270378113, + "step": 410, + "token_acc": 0.7370900717291761 + }, + { + "epoch": 0.620026400150858, + "grad_norm": 2.3884289264678955, + "learning_rate": 6.840622763423391e-06, + "loss": 0.9332308769226074, + "step": 411, + "token_acc": 0.742402092790644 + }, + { + "epoch": 0.6215349801998868, + "grad_norm": 4.601289749145508, + "learning_rate": 6.825239153500029e-06, + "loss": 0.9185839295387268, + "step": 412, + "token_acc": 0.7430449583284423 + }, + { + "epoch": 0.6230435602489157, + "grad_norm": 8.099652290344238, + "learning_rate": 6.809835583053716e-06, + "loss": 0.9104984998703003, + "step": 413, + "token_acc": 0.7514000141773588 + }, + { + "epoch": 0.6245521402979446, + "grad_norm": 2.2322280406951904, + "learning_rate": 6.794412220535426e-06, + "loss": 0.9525901079177856, + "step": 414, + "token_acc": 0.7380575293211198 + }, + { + "epoch": 0.6260607203469735, + "grad_norm": 4.025731086730957, + "learning_rate": 6.778969234612583e-06, + "loss": 0.9312620759010315, + "step": 415, + "token_acc": 0.747639274054832 + }, + { + "epoch": 0.6275693003960022, + "grad_norm": 6.0961432456970215, + "learning_rate": 6.763506794167207e-06, + "loss": 0.9398995637893677, + "step": 416, + "token_acc": 0.7460887892134346 + }, + { + "epoch": 0.6290778804450311, + "grad_norm": 2.2645070552825928, + "learning_rate": 6.748025068294067e-06, + "loss": 0.9649618864059448, + "step": 417, + "token_acc": 0.7369103679463644 + }, + { + "epoch": 0.63058646049406, + "grad_norm": 2.406071901321411, + "learning_rate": 6.732524226298841e-06, + "loss": 0.9486474990844727, + "step": 418, + "token_acc": 0.7430658356057654 + }, + { + "epoch": 0.6320950405430888, + "grad_norm": 2.9746577739715576, + "learning_rate": 6.717004437696249e-06, + "loss": 0.9725360870361328, + "step": 419, + "token_acc": 0.7324518590824386 + }, + { + "epoch": 0.6336036205921177, + "grad_norm": 2.082751512527466, + "learning_rate": 6.701465872208216e-06, + "loss": 0.9647396802902222, + "step": 420, + "token_acc": 0.7345937486782558 + }, + { + "epoch": 0.6351122006411465, + "grad_norm": 8.607621192932129, + "learning_rate": 6.685908699762003e-06, + "loss": 0.9550772905349731, + "step": 421, + "token_acc": 0.7387355508374617 + }, + { + "epoch": 0.6366207806901754, + "grad_norm": 4.4206743240356445, + "learning_rate": 6.670333090488357e-06, + "loss": 0.9342201948165894, + "step": 422, + "token_acc": 0.7407306185660928 + }, + { + "epoch": 0.6381293607392042, + "grad_norm": 3.386361837387085, + "learning_rate": 6.654739214719642e-06, + "loss": 0.9455040693283081, + "step": 423, + "token_acc": 0.7487530316203724 + }, + { + "epoch": 0.6396379407882331, + "grad_norm": 2.503026008605957, + "learning_rate": 6.6391272429879886e-06, + "loss": 0.9735032320022583, + "step": 424, + "token_acc": 0.7362507111701119 + }, + { + "epoch": 0.641146520837262, + "grad_norm": 2.5624146461486816, + "learning_rate": 6.6234973460234184e-06, + "loss": 1.0114408731460571, + "step": 425, + "token_acc": 0.7251338712413075 + }, + { + "epoch": 0.6426551008862907, + "grad_norm": 3.4173195362091064, + "learning_rate": 6.607849694751978e-06, + "loss": 1.000654697418213, + "step": 426, + "token_acc": 0.7266354044548652 + }, + { + "epoch": 0.6441636809353196, + "grad_norm": 2.5287883281707764, + "learning_rate": 6.592184460293878e-06, + "loss": 0.9304991960525513, + "step": 427, + "token_acc": 0.7403036204509725 + }, + { + "epoch": 0.6456722609843485, + "grad_norm": 3.0434274673461914, + "learning_rate": 6.576501813961609e-06, + "loss": 0.9665944576263428, + "step": 428, + "token_acc": 0.7368104683959865 + }, + { + "epoch": 0.6471808410333774, + "grad_norm": 2.108100175857544, + "learning_rate": 6.560801927258081e-06, + "loss": 0.930558443069458, + "step": 429, + "token_acc": 0.7427380781310944 + }, + { + "epoch": 0.6486894210824062, + "grad_norm": 2.991330146789551, + "learning_rate": 6.545084971874738e-06, + "loss": 0.9191858172416687, + "step": 430, + "token_acc": 0.7514126965362335 + }, + { + "epoch": 0.650198001131435, + "grad_norm": 13.502842903137207, + "learning_rate": 6.529351119689687e-06, + "loss": 0.9495930075645447, + "step": 431, + "token_acc": 0.7411706883235425 + }, + { + "epoch": 0.6517065811804639, + "grad_norm": 2.422739267349243, + "learning_rate": 6.513600542765816e-06, + "loss": 0.9332212805747986, + "step": 432, + "token_acc": 0.7466078431372549 + }, + { + "epoch": 0.6532151612294927, + "grad_norm": 2.5644075870513916, + "learning_rate": 6.49783341334891e-06, + "loss": 0.9498615264892578, + "step": 433, + "token_acc": 0.7404254237718875 + }, + { + "epoch": 0.6547237412785216, + "grad_norm": 2.672065496444702, + "learning_rate": 6.4820499038657695e-06, + "loss": 1.0073184967041016, + "step": 434, + "token_acc": 0.7288787706317587 + }, + { + "epoch": 0.6562323213275505, + "grad_norm": 2.3766725063323975, + "learning_rate": 6.466250186922325e-06, + "loss": 0.9814496040344238, + "step": 435, + "token_acc": 0.7313121911220442 + }, + { + "epoch": 0.6577409013765793, + "grad_norm": 2.48927903175354, + "learning_rate": 6.450434435301751e-06, + "loss": 0.9808628559112549, + "step": 436, + "token_acc": 0.7337870038426035 + }, + { + "epoch": 0.6592494814256081, + "grad_norm": 4.715419769287109, + "learning_rate": 6.434602821962571e-06, + "loss": 0.9126060009002686, + "step": 437, + "token_acc": 0.7450674441045435 + }, + { + "epoch": 0.660758061474637, + "grad_norm": 2.3945095539093018, + "learning_rate": 6.418755520036775e-06, + "loss": 0.9260703325271606, + "step": 438, + "token_acc": 0.7451657605798956 + }, + { + "epoch": 0.6622666415236659, + "grad_norm": 2.267333745956421, + "learning_rate": 6.402892702827916e-06, + "loss": 0.9356890916824341, + "step": 439, + "token_acc": 0.7428018542933696 + }, + { + "epoch": 0.6637752215726948, + "grad_norm": 3.4336681365966797, + "learning_rate": 6.387014543809224e-06, + "loss": 0.9199202060699463, + "step": 440, + "token_acc": 0.747440768616442 + }, + { + "epoch": 0.6652838016217235, + "grad_norm": 2.1354541778564453, + "learning_rate": 6.371121216621698e-06, + "loss": 0.9322351217269897, + "step": 441, + "token_acc": 0.7466146558930065 + }, + { + "epoch": 0.6667923816707524, + "grad_norm": 3.732377767562866, + "learning_rate": 6.355212895072223e-06, + "loss": 0.8929821252822876, + "step": 442, + "token_acc": 0.7554185452767668 + }, + { + "epoch": 0.6683009617197813, + "grad_norm": 3.357151985168457, + "learning_rate": 6.339289753131649e-06, + "loss": 0.9481819868087769, + "step": 443, + "token_acc": 0.747400751239961 + }, + { + "epoch": 0.6698095417688101, + "grad_norm": 2.8206398487091064, + "learning_rate": 6.323351964932909e-06, + "loss": 0.9305940270423889, + "step": 444, + "token_acc": 0.7488562148569832 + }, + { + "epoch": 0.671318121817839, + "grad_norm": 2.5736637115478516, + "learning_rate": 6.3073997047691e-06, + "loss": 0.9425253868103027, + "step": 445, + "token_acc": 0.7454691714380353 + }, + { + "epoch": 0.6728267018668678, + "grad_norm": 3.726701021194458, + "learning_rate": 6.291433147091583e-06, + "loss": 0.9405432939529419, + "step": 446, + "token_acc": 0.7478510028653295 + }, + { + "epoch": 0.6743352819158966, + "grad_norm": 2.72888445854187, + "learning_rate": 6.275452466508076e-06, + "loss": 0.9785466194152832, + "step": 447, + "token_acc": 0.7341814290809654 + }, + { + "epoch": 0.6758438619649255, + "grad_norm": 2.08109712600708, + "learning_rate": 6.259457837780741e-06, + "loss": 0.9148257374763489, + "step": 448, + "token_acc": 0.7510153846153846 + }, + { + "epoch": 0.6773524420139544, + "grad_norm": 2.856480598449707, + "learning_rate": 6.243449435824276e-06, + "loss": 0.9594903588294983, + "step": 449, + "token_acc": 0.742551359330791 + }, + { + "epoch": 0.6788610220629833, + "grad_norm": 2.4389445781707764, + "learning_rate": 6.227427435703997e-06, + "loss": 0.935378909111023, + "step": 450, + "token_acc": 0.7417902383382362 + }, + { + "epoch": 0.680369602112012, + "grad_norm": 2.9558165073394775, + "learning_rate": 6.211392012633932e-06, + "loss": 0.9837616682052612, + "step": 451, + "token_acc": 0.7328532896638089 + }, + { + "epoch": 0.6818781821610409, + "grad_norm": 2.462099552154541, + "learning_rate": 6.1953433419748995e-06, + "loss": 1.0033810138702393, + "step": 452, + "token_acc": 0.7345579489890314 + }, + { + "epoch": 0.6833867622100698, + "grad_norm": 3.553602933883667, + "learning_rate": 6.179281599232592e-06, + "loss": 0.9184862375259399, + "step": 453, + "token_acc": 0.7447203958831866 + }, + { + "epoch": 0.6848953422590986, + "grad_norm": 2.977471351623535, + "learning_rate": 6.163206960055652e-06, + "loss": 0.848037838935852, + "step": 454, + "token_acc": 0.7656200611105257 + }, + { + "epoch": 0.6864039223081275, + "grad_norm": 6.395641803741455, + "learning_rate": 6.147119600233758e-06, + "loss": 0.9354156255722046, + "step": 455, + "token_acc": 0.7433707108138023 + }, + { + "epoch": 0.6879125023571563, + "grad_norm": 3.2083609104156494, + "learning_rate": 6.131019695695702e-06, + "loss": 0.8993312120437622, + "step": 456, + "token_acc": 0.7576809970592354 + }, + { + "epoch": 0.6894210824061852, + "grad_norm": 2.2400403022766113, + "learning_rate": 6.114907422507459e-06, + "loss": 0.8566614389419556, + "step": 457, + "token_acc": 0.7598617909588252 + }, + { + "epoch": 0.690929662455214, + "grad_norm": 3.1033976078033447, + "learning_rate": 6.098782956870266e-06, + "loss": 0.9051586389541626, + "step": 458, + "token_acc": 0.7500543056633049 + }, + { + "epoch": 0.6924382425042429, + "grad_norm": 4.04071569442749, + "learning_rate": 6.0826464751187e-06, + "loss": 0.8197078704833984, + "step": 459, + "token_acc": 0.7723147271425602 + }, + { + "epoch": 0.6939468225532718, + "grad_norm": 6.173750877380371, + "learning_rate": 6.066498153718735e-06, + "loss": 0.9737037420272827, + "step": 460, + "token_acc": 0.7328420785160716 + }, + { + "epoch": 0.6954554026023005, + "grad_norm": 3.0429303646087646, + "learning_rate": 6.0503381692658305e-06, + "loss": 0.9919114112854004, + "step": 461, + "token_acc": 0.7261085626911316 + }, + { + "epoch": 0.6969639826513294, + "grad_norm": 3.3123037815093994, + "learning_rate": 6.034166698482984e-06, + "loss": 0.9440866112709045, + "step": 462, + "token_acc": 0.7406060281886016 + }, + { + "epoch": 0.6984725627003583, + "grad_norm": 2.6384501457214355, + "learning_rate": 6.0179839182188125e-06, + "loss": 0.9087064862251282, + "step": 463, + "token_acc": 0.7501567762164673 + }, + { + "epoch": 0.6999811427493872, + "grad_norm": 1.9863035678863525, + "learning_rate": 6.001790005445607e-06, + "loss": 0.99301677942276, + "step": 464, + "token_acc": 0.7315791101079125 + }, + { + "epoch": 0.701489722798416, + "grad_norm": 2.8223328590393066, + "learning_rate": 5.985585137257401e-06, + "loss": 0.9712878465652466, + "step": 465, + "token_acc": 0.7355955106790286 + }, + { + "epoch": 0.7029983028474448, + "grad_norm": 2.4844985008239746, + "learning_rate": 5.969369490868042e-06, + "loss": 0.8950424194335938, + "step": 466, + "token_acc": 0.7565651902343644 + }, + { + "epoch": 0.7045068828964737, + "grad_norm": 2.0196645259857178, + "learning_rate": 5.953143243609235e-06, + "loss": 0.9834406971931458, + "step": 467, + "token_acc": 0.7265924960265728 + }, + { + "epoch": 0.7060154629455025, + "grad_norm": 1.7638938426971436, + "learning_rate": 5.936906572928625e-06, + "loss": 0.9263018369674683, + "step": 468, + "token_acc": 0.7485688308189655 + }, + { + "epoch": 0.7075240429945314, + "grad_norm": 2.60968017578125, + "learning_rate": 5.920659656387836e-06, + "loss": 0.9821443557739258, + "step": 469, + "token_acc": 0.7345941501233407 + }, + { + "epoch": 0.7090326230435603, + "grad_norm": 2.4257900714874268, + "learning_rate": 5.904402671660551e-06, + "loss": 0.9720746278762817, + "step": 470, + "token_acc": 0.738523298197222 + }, + { + "epoch": 0.7105412030925891, + "grad_norm": 2.9759914875030518, + "learning_rate": 5.8881357965305444e-06, + "loss": 0.9163949489593506, + "step": 471, + "token_acc": 0.7487561112793666 + }, + { + "epoch": 0.7120497831416179, + "grad_norm": 2.537590742111206, + "learning_rate": 5.871859208889759e-06, + "loss": 0.9428315162658691, + "step": 472, + "token_acc": 0.7395884977689638 + }, + { + "epoch": 0.7135583631906468, + "grad_norm": 2.1287147998809814, + "learning_rate": 5.855573086736351e-06, + "loss": 0.9250155687332153, + "step": 473, + "token_acc": 0.748034823768067 + }, + { + "epoch": 0.7150669432396757, + "grad_norm": 2.5445432662963867, + "learning_rate": 5.839277608172739e-06, + "loss": 0.9140595197677612, + "step": 474, + "token_acc": 0.7448510393715874 + }, + { + "epoch": 0.7165755232887046, + "grad_norm": 2.147160530090332, + "learning_rate": 5.82297295140367e-06, + "loss": 0.9481013417243958, + "step": 475, + "token_acc": 0.7387447764724434 + }, + { + "epoch": 0.7180841033377333, + "grad_norm": 4.830345630645752, + "learning_rate": 5.806659294734256e-06, + "loss": 0.9496990442276001, + "step": 476, + "token_acc": 0.7404399254638256 + }, + { + "epoch": 0.7195926833867622, + "grad_norm": 2.418107271194458, + "learning_rate": 5.790336816568033e-06, + "loss": 0.8514952659606934, + "step": 477, + "token_acc": 0.7640080822126143 + }, + { + "epoch": 0.7211012634357911, + "grad_norm": 2.710644483566284, + "learning_rate": 5.774005695405008e-06, + "loss": 0.9454039931297302, + "step": 478, + "token_acc": 0.7365105202869717 + }, + { + "epoch": 0.7226098434848199, + "grad_norm": 1.9348199367523193, + "learning_rate": 5.7576661098397024e-06, + "loss": 0.9789907932281494, + "step": 479, + "token_acc": 0.7312225153913808 + }, + { + "epoch": 0.7241184235338488, + "grad_norm": 3.190880060195923, + "learning_rate": 5.74131823855921e-06, + "loss": 0.9176206588745117, + "step": 480, + "token_acc": 0.7456865072119774 + }, + { + "epoch": 0.7256270035828776, + "grad_norm": 3.2925896644592285, + "learning_rate": 5.72496226034123e-06, + "loss": 0.9067544341087341, + "step": 481, + "token_acc": 0.7507098738327332 + }, + { + "epoch": 0.7271355836319064, + "grad_norm": 3.2283425331115723, + "learning_rate": 5.708598354052122e-06, + "loss": 0.8657084703445435, + "step": 482, + "token_acc": 0.762007380869003 + }, + { + "epoch": 0.7286441636809353, + "grad_norm": 2.634796142578125, + "learning_rate": 5.692226698644938e-06, + "loss": 0.9294568300247192, + "step": 483, + "token_acc": 0.7458718845455753 + }, + { + "epoch": 0.7301527437299642, + "grad_norm": 3.3449671268463135, + "learning_rate": 5.675847473157485e-06, + "loss": 0.9523289203643799, + "step": 484, + "token_acc": 0.7367317440605851 + }, + { + "epoch": 0.7316613237789931, + "grad_norm": 2.8761274814605713, + "learning_rate": 5.659460856710346e-06, + "loss": 0.9882867336273193, + "step": 485, + "token_acc": 0.7340517807859114 + }, + { + "epoch": 0.7331699038280218, + "grad_norm": 2.7857325077056885, + "learning_rate": 5.643067028504931e-06, + "loss": 0.9058640599250793, + "step": 486, + "token_acc": 0.752078043565415 + }, + { + "epoch": 0.7346784838770507, + "grad_norm": 2.728865146636963, + "learning_rate": 5.626666167821522e-06, + "loss": 0.904780387878418, + "step": 487, + "token_acc": 0.7473715075608428 + }, + { + "epoch": 0.7361870639260796, + "grad_norm": 2.4043877124786377, + "learning_rate": 5.610258454017301e-06, + "loss": 0.9189773797988892, + "step": 488, + "token_acc": 0.7478888543930456 + }, + { + "epoch": 0.7376956439751085, + "grad_norm": 2.8819572925567627, + "learning_rate": 5.593844066524401e-06, + "loss": 0.9159302711486816, + "step": 489, + "token_acc": 0.7451533008796652 + }, + { + "epoch": 0.7392042240241373, + "grad_norm": 5.548727035522461, + "learning_rate": 5.577423184847932e-06, + "loss": 0.9318259358406067, + "step": 490, + "token_acc": 0.7488866806521659 + }, + { + "epoch": 0.7407128040731661, + "grad_norm": 4.18589973449707, + "learning_rate": 5.560995988564023e-06, + "loss": 0.8979176878929138, + "step": 491, + "token_acc": 0.7521535371568846 + }, + { + "epoch": 0.742221384122195, + "grad_norm": 3.82426381111145, + "learning_rate": 5.544562657317863e-06, + "loss": 0.9432802796363831, + "step": 492, + "token_acc": 0.7440698043974459 + }, + { + "epoch": 0.7437299641712238, + "grad_norm": 3.0609257221221924, + "learning_rate": 5.52812337082173e-06, + "loss": 0.9442110061645508, + "step": 493, + "token_acc": 0.738659125721692 + }, + { + "epoch": 0.7452385442202527, + "grad_norm": 3.2839205265045166, + "learning_rate": 5.5116783088530255e-06, + "loss": 0.995833158493042, + "step": 494, + "token_acc": 0.7287064700984474 + }, + { + "epoch": 0.7467471242692816, + "grad_norm": 3.367539644241333, + "learning_rate": 5.495227651252315e-06, + "loss": 0.9678905010223389, + "step": 495, + "token_acc": 0.731124962300191 + }, + { + "epoch": 0.7482557043183103, + "grad_norm": 2.915174722671509, + "learning_rate": 5.478771577921351e-06, + "loss": 0.9572954177856445, + "step": 496, + "token_acc": 0.7403673094582186 + }, + { + "epoch": 0.7497642843673392, + "grad_norm": 2.5182864665985107, + "learning_rate": 5.4623102688211186e-06, + "loss": 0.9415892362594604, + "step": 497, + "token_acc": 0.7425202791926052 + }, + { + "epoch": 0.7512728644163681, + "grad_norm": 2.926884651184082, + "learning_rate": 5.445843903969854e-06, + "loss": 0.9800302386283875, + "step": 498, + "token_acc": 0.7308346251937791 + }, + { + "epoch": 0.752781444465397, + "grad_norm": 2.2927305698394775, + "learning_rate": 5.429372663441086e-06, + "loss": 0.916739284992218, + "step": 499, + "token_acc": 0.7485802905557963 + }, + { + "epoch": 0.7542900245144258, + "grad_norm": 3.021710157394409, + "learning_rate": 5.412896727361663e-06, + "loss": 0.9158183336257935, + "step": 500, + "token_acc": 0.752576355946066 + }, + { + "epoch": 0.7557986045634546, + "grad_norm": 5.742190837860107, + "learning_rate": 5.396416275909779e-06, + "loss": 0.9648655652999878, + "step": 501, + "token_acc": 0.7444240555418928 + }, + { + "epoch": 0.7573071846124835, + "grad_norm": 2.6224379539489746, + "learning_rate": 5.379931489313016e-06, + "loss": 0.923028826713562, + "step": 502, + "token_acc": 0.7440035910146569 + }, + { + "epoch": 0.7588157646615123, + "grad_norm": 2.4750194549560547, + "learning_rate": 5.363442547846356e-06, + "loss": 0.9381512403488159, + "step": 503, + "token_acc": 0.7453745580133213 + }, + { + "epoch": 0.7603243447105412, + "grad_norm": 4.87894868850708, + "learning_rate": 5.346949631830221e-06, + "loss": 0.9352282285690308, + "step": 504, + "token_acc": 0.7408409545971102 + }, + { + "epoch": 0.7618329247595701, + "grad_norm": 3.720313549041748, + "learning_rate": 5.3304529216284974e-06, + "loss": 0.9013872742652893, + "step": 505, + "token_acc": 0.748305606900801 + }, + { + "epoch": 0.7633415048085989, + "grad_norm": 2.9087014198303223, + "learning_rate": 5.3139525976465675e-06, + "loss": 0.8870570659637451, + "step": 506, + "token_acc": 0.7529262833449528 + }, + { + "epoch": 0.7648500848576277, + "grad_norm": 2.4480483531951904, + "learning_rate": 5.2974488403293285e-06, + "loss": 0.9241657853126526, + "step": 507, + "token_acc": 0.7455856105633437 + }, + { + "epoch": 0.7663586649066566, + "grad_norm": 7.191082000732422, + "learning_rate": 5.280941830159228e-06, + "loss": 0.9194809198379517, + "step": 508, + "token_acc": 0.7480483782297966 + }, + { + "epoch": 0.7678672449556855, + "grad_norm": 2.828296661376953, + "learning_rate": 5.264431747654284e-06, + "loss": 0.9166814088821411, + "step": 509, + "token_acc": 0.7462037350558696 + }, + { + "epoch": 0.7693758250047144, + "grad_norm": 5.163946151733398, + "learning_rate": 5.247918773366112e-06, + "loss": 0.9463444948196411, + "step": 510, + "token_acc": 0.7417996670909625 + }, + { + "epoch": 0.7708844050537431, + "grad_norm": 3.1770670413970947, + "learning_rate": 5.231403087877955e-06, + "loss": 0.9137945175170898, + "step": 511, + "token_acc": 0.747213416504755 + }, + { + "epoch": 0.772392985102772, + "grad_norm": 2.666616678237915, + "learning_rate": 5.214884871802703e-06, + "loss": 0.9619874954223633, + "step": 512, + "token_acc": 0.7433701866169693 + }, + { + "epoch": 0.7739015651518009, + "grad_norm": 2.8672335147857666, + "learning_rate": 5.198364305780922e-06, + "loss": 1.0091532468795776, + "step": 513, + "token_acc": 0.7252970315467988 + }, + { + "epoch": 0.7754101452008297, + "grad_norm": 2.5311155319213867, + "learning_rate": 5.1818415704788725e-06, + "loss": 0.9674872756004333, + "step": 514, + "token_acc": 0.736843259844247 + }, + { + "epoch": 0.7769187252498586, + "grad_norm": 3.1638095378875732, + "learning_rate": 5.165316846586541e-06, + "loss": 0.9234471321105957, + "step": 515, + "token_acc": 0.7490415878099674 + }, + { + "epoch": 0.7784273052988874, + "grad_norm": 2.4637391567230225, + "learning_rate": 5.148790314815662e-06, + "loss": 0.9526304602622986, + "step": 516, + "token_acc": 0.7386506935687264 + }, + { + "epoch": 0.7799358853479162, + "grad_norm": 3.120725154876709, + "learning_rate": 5.132262155897739e-06, + "loss": 0.9100754261016846, + "step": 517, + "token_acc": 0.7472485247701386 + }, + { + "epoch": 0.7814444653969451, + "grad_norm": 2.3154561519622803, + "learning_rate": 5.11573255058207e-06, + "loss": 0.9742940068244934, + "step": 518, + "token_acc": 0.7329311960779602 + }, + { + "epoch": 0.782953045445974, + "grad_norm": 7.394751071929932, + "learning_rate": 5.099201679633769e-06, + "loss": 0.9479016065597534, + "step": 519, + "token_acc": 0.7392012178362264 + }, + { + "epoch": 0.7844616254950029, + "grad_norm": 2.4849321842193604, + "learning_rate": 5.082669723831793e-06, + "loss": 0.9881341457366943, + "step": 520, + "token_acc": 0.7342891343381033 + }, + { + "epoch": 0.7859702055440316, + "grad_norm": 3.6967718601226807, + "learning_rate": 5.066136863966963e-06, + "loss": 0.9450361132621765, + "step": 521, + "token_acc": 0.7414925668000197 + }, + { + "epoch": 0.7874787855930605, + "grad_norm": 9.815211296081543, + "learning_rate": 5.049603280839982e-06, + "loss": 0.8865867853164673, + "step": 522, + "token_acc": 0.7511806090979183 + }, + { + "epoch": 0.7889873656420894, + "grad_norm": 2.6683976650238037, + "learning_rate": 5.033069155259471e-06, + "loss": 0.9101175665855408, + "step": 523, + "token_acc": 0.7469992060155994 + }, + { + "epoch": 0.7904959456911183, + "grad_norm": 4.313793182373047, + "learning_rate": 5.016534668039976e-06, + "loss": 0.9642683267593384, + "step": 524, + "token_acc": 0.7368821589556462 + }, + { + "epoch": 0.7920045257401471, + "grad_norm": 2.419790744781494, + "learning_rate": 5e-06, + "loss": 0.9017623662948608, + "step": 525, + "token_acc": 0.7516489894992349 + }, + { + "epoch": 0.7935131057891759, + "grad_norm": 2.5014219284057617, + "learning_rate": 4.983465331960025e-06, + "loss": 0.9500449895858765, + "step": 526, + "token_acc": 0.7358819175338529 + }, + { + "epoch": 0.7950216858382048, + "grad_norm": 2.4624011516571045, + "learning_rate": 4.96693084474053e-06, + "loss": 0.9039821624755859, + "step": 527, + "token_acc": 0.7518650226121912 + }, + { + "epoch": 0.7965302658872336, + "grad_norm": 3.3341097831726074, + "learning_rate": 4.950396719160019e-06, + "loss": 0.9257516264915466, + "step": 528, + "token_acc": 0.7487506317030714 + }, + { + "epoch": 0.7980388459362625, + "grad_norm": 3.8896796703338623, + "learning_rate": 4.93386313603304e-06, + "loss": 0.9286510944366455, + "step": 529, + "token_acc": 0.7470575022461815 + }, + { + "epoch": 0.7995474259852914, + "grad_norm": 5.138925552368164, + "learning_rate": 4.917330276168208e-06, + "loss": 0.8819937109947205, + "step": 530, + "token_acc": 0.7539801514260233 + }, + { + "epoch": 0.8010560060343201, + "grad_norm": 28.535049438476562, + "learning_rate": 4.900798320366233e-06, + "loss": 0.9048066139221191, + "step": 531, + "token_acc": 0.7564860834990059 + }, + { + "epoch": 0.802564586083349, + "grad_norm": 3.8600754737854004, + "learning_rate": 4.884267449417932e-06, + "loss": 0.8396285772323608, + "step": 532, + "token_acc": 0.7693885691492393 + }, + { + "epoch": 0.8040731661323779, + "grad_norm": 2.427272081375122, + "learning_rate": 4.867737844102261e-06, + "loss": 0.9511575698852539, + "step": 533, + "token_acc": 0.7369301648884579 + }, + { + "epoch": 0.8055817461814068, + "grad_norm": 2.1535112857818604, + "learning_rate": 4.851209685184339e-06, + "loss": 0.9726126194000244, + "step": 534, + "token_acc": 0.7312168072477351 + }, + { + "epoch": 0.8070903262304356, + "grad_norm": 5.083701133728027, + "learning_rate": 4.8346831534134595e-06, + "loss": 0.9120529294013977, + "step": 535, + "token_acc": 0.7446565802113353 + }, + { + "epoch": 0.8085989062794644, + "grad_norm": 1.7981810569763184, + "learning_rate": 4.818158429521129e-06, + "loss": 0.9441041946411133, + "step": 536, + "token_acc": 0.7424260691341864 + }, + { + "epoch": 0.8101074863284933, + "grad_norm": 3.250389337539673, + "learning_rate": 4.801635694219079e-06, + "loss": 0.9443406462669373, + "step": 537, + "token_acc": 0.7443767453668444 + }, + { + "epoch": 0.8116160663775221, + "grad_norm": 3.4479801654815674, + "learning_rate": 4.785115128197298e-06, + "loss": 0.9709488153457642, + "step": 538, + "token_acc": 0.7291580491106682 + }, + { + "epoch": 0.813124646426551, + "grad_norm": 3.247022867202759, + "learning_rate": 4.768596912122046e-06, + "loss": 0.9263079166412354, + "step": 539, + "token_acc": 0.7449027390843421 + }, + { + "epoch": 0.8146332264755799, + "grad_norm": 2.5564754009246826, + "learning_rate": 4.752081226633888e-06, + "loss": 0.9359496235847473, + "step": 540, + "token_acc": 0.7404436688257207 + }, + { + "epoch": 0.8161418065246087, + "grad_norm": 2.4068868160247803, + "learning_rate": 4.735568252345718e-06, + "loss": 0.9023622870445251, + "step": 541, + "token_acc": 0.7436988090056582 + }, + { + "epoch": 0.8176503865736375, + "grad_norm": 10.005001068115234, + "learning_rate": 4.719058169840773e-06, + "loss": 0.9301308393478394, + "step": 542, + "token_acc": 0.7453245832948209 + }, + { + "epoch": 0.8191589666226664, + "grad_norm": 3.5919089317321777, + "learning_rate": 4.702551159670672e-06, + "loss": 0.9539059400558472, + "step": 543, + "token_acc": 0.7336260170361003 + }, + { + "epoch": 0.8206675466716953, + "grad_norm": 2.8090004920959473, + "learning_rate": 4.686047402353433e-06, + "loss": 0.9294788837432861, + "step": 544, + "token_acc": 0.7444573434889713 + }, + { + "epoch": 0.8221761267207242, + "grad_norm": 3.550071954727173, + "learning_rate": 4.669547078371503e-06, + "loss": 0.9498259425163269, + "step": 545, + "token_acc": 0.7412593121747117 + }, + { + "epoch": 0.8236847067697529, + "grad_norm": 2.6558685302734375, + "learning_rate": 4.65305036816978e-06, + "loss": 0.9410960674285889, + "step": 546, + "token_acc": 0.7408250966910185 + }, + { + "epoch": 0.8251932868187818, + "grad_norm": 2.3485679626464844, + "learning_rate": 4.636557452153645e-06, + "loss": 1.0080523490905762, + "step": 547, + "token_acc": 0.7303840258676801 + }, + { + "epoch": 0.8267018668678107, + "grad_norm": 2.6467690467834473, + "learning_rate": 4.620068510686985e-06, + "loss": 0.8990232944488525, + "step": 548, + "token_acc": 0.7474186854671366 + }, + { + "epoch": 0.8282104469168395, + "grad_norm": 2.5140280723571777, + "learning_rate": 4.60358372409022e-06, + "loss": 0.9193800687789917, + "step": 549, + "token_acc": 0.7484605610399765 + }, + { + "epoch": 0.8297190269658684, + "grad_norm": 3.3551504611968994, + "learning_rate": 4.587103272638339e-06, + "loss": 0.8944040536880493, + "step": 550, + "token_acc": 0.7516968898794448 + }, + { + "epoch": 0.8312276070148972, + "grad_norm": 3.610629081726074, + "learning_rate": 4.570627336558915e-06, + "loss": 0.9261206984519958, + "step": 551, + "token_acc": 0.7471343061695552 + }, + { + "epoch": 0.832736187063926, + "grad_norm": 3.8441669940948486, + "learning_rate": 4.554156096030149e-06, + "loss": 0.8826600313186646, + "step": 552, + "token_acc": 0.7529767676375907 + }, + { + "epoch": 0.8342447671129549, + "grad_norm": 3.420154333114624, + "learning_rate": 4.537689731178883e-06, + "loss": 0.9584860801696777, + "step": 553, + "token_acc": 0.7421723125284361 + }, + { + "epoch": 0.8357533471619838, + "grad_norm": 3.2264482975006104, + "learning_rate": 4.5212284220786495e-06, + "loss": 0.9560744762420654, + "step": 554, + "token_acc": 0.7387742240428549 + }, + { + "epoch": 0.8372619272110127, + "grad_norm": 3.0898256301879883, + "learning_rate": 4.504772348747687e-06, + "loss": 0.9828572273254395, + "step": 555, + "token_acc": 0.7259248995427463 + }, + { + "epoch": 0.8387705072600414, + "grad_norm": 2.919325828552246, + "learning_rate": 4.488321691146975e-06, + "loss": 0.9172624349594116, + "step": 556, + "token_acc": 0.750397146121903 + }, + { + "epoch": 0.8402790873090703, + "grad_norm": 6.047379493713379, + "learning_rate": 4.471876629178273e-06, + "loss": 0.9269239902496338, + "step": 557, + "token_acc": 0.7454435494955604 + }, + { + "epoch": 0.8417876673580992, + "grad_norm": 2.804499864578247, + "learning_rate": 4.4554373426821375e-06, + "loss": 0.891727089881897, + "step": 558, + "token_acc": 0.7530506406345332 + }, + { + "epoch": 0.843296247407128, + "grad_norm": 2.245264768600464, + "learning_rate": 4.439004011435979e-06, + "loss": 0.9605697989463806, + "step": 559, + "token_acc": 0.7388660179640718 + }, + { + "epoch": 0.8448048274561569, + "grad_norm": 5.121172904968262, + "learning_rate": 4.42257681515207e-06, + "loss": 1.012239933013916, + "step": 560, + "token_acc": 0.720986731452065 + }, + { + "epoch": 0.8463134075051857, + "grad_norm": 2.9138190746307373, + "learning_rate": 4.406155933475599e-06, + "loss": 0.9261215925216675, + "step": 561, + "token_acc": 0.7475863768949509 + }, + { + "epoch": 0.8478219875542146, + "grad_norm": 3.1187124252319336, + "learning_rate": 4.3897415459827e-06, + "loss": 0.9570176601409912, + "step": 562, + "token_acc": 0.7368590777168655 + }, + { + "epoch": 0.8493305676032434, + "grad_norm": 2.542679786682129, + "learning_rate": 4.373333832178478e-06, + "loss": 0.9682487845420837, + "step": 563, + "token_acc": 0.734405569530198 + }, + { + "epoch": 0.8508391476522723, + "grad_norm": 3.4543561935424805, + "learning_rate": 4.356932971495071e-06, + "loss": 0.9621660709381104, + "step": 564, + "token_acc": 0.7336976074833603 + }, + { + "epoch": 0.8523477277013012, + "grad_norm": 3.8378469944000244, + "learning_rate": 4.340539143289655e-06, + "loss": 0.8806393146514893, + "step": 565, + "token_acc": 0.754316201080293 + }, + { + "epoch": 0.8538563077503301, + "grad_norm": 2.691270351409912, + "learning_rate": 4.324152526842517e-06, + "loss": 0.9608463644981384, + "step": 566, + "token_acc": 0.7389408564081541 + }, + { + "epoch": 0.8553648877993588, + "grad_norm": 4.662106037139893, + "learning_rate": 4.307773301355063e-06, + "loss": 0.9616008996963501, + "step": 567, + "token_acc": 0.7359077361669042 + }, + { + "epoch": 0.8568734678483877, + "grad_norm": 3.195162534713745, + "learning_rate": 4.291401645947879e-06, + "loss": 0.9519809484481812, + "step": 568, + "token_acc": 0.7406968709900306 + }, + { + "epoch": 0.8583820478974166, + "grad_norm": 4.480891227722168, + "learning_rate": 4.275037739658771e-06, + "loss": 0.9322656393051147, + "step": 569, + "token_acc": 0.7490488775302762 + }, + { + "epoch": 0.8598906279464454, + "grad_norm": 3.6828203201293945, + "learning_rate": 4.25868176144079e-06, + "loss": 0.916583776473999, + "step": 570, + "token_acc": 0.7494941965711852 + }, + { + "epoch": 0.8613992079954743, + "grad_norm": 4.880855560302734, + "learning_rate": 4.242333890160299e-06, + "loss": 0.9398585557937622, + "step": 571, + "token_acc": 0.7439643015007683 + }, + { + "epoch": 0.8629077880445031, + "grad_norm": 5.057602882385254, + "learning_rate": 4.225994304594994e-06, + "loss": 0.9398707151412964, + "step": 572, + "token_acc": 0.7396820132252648 + }, + { + "epoch": 0.864416368093532, + "grad_norm": 2.811352491378784, + "learning_rate": 4.209663183431969e-06, + "loss": 1.0100401639938354, + "step": 573, + "token_acc": 0.7239211316925648 + }, + { + "epoch": 0.8659249481425608, + "grad_norm": 12.501093864440918, + "learning_rate": 4.193340705265746e-06, + "loss": 0.9245884418487549, + "step": 574, + "token_acc": 0.7454041362773504 + }, + { + "epoch": 0.8674335281915897, + "grad_norm": 3.6205663681030273, + "learning_rate": 4.17702704859633e-06, + "loss": 0.8711004853248596, + "step": 575, + "token_acc": 0.7622990877497828 + }, + { + "epoch": 0.8689421082406186, + "grad_norm": 2.737107038497925, + "learning_rate": 4.160722391827262e-06, + "loss": 0.9507041573524475, + "step": 576, + "token_acc": 0.7431333674901428 + }, + { + "epoch": 0.8704506882896473, + "grad_norm": 2.785400152206421, + "learning_rate": 4.14442691326365e-06, + "loss": 0.9100902080535889, + "step": 577, + "token_acc": 0.7452752670501233 + }, + { + "epoch": 0.8719592683386762, + "grad_norm": 3.90169095993042, + "learning_rate": 4.128140791110243e-06, + "loss": 0.9461625218391418, + "step": 578, + "token_acc": 0.740879852458279 + }, + { + "epoch": 0.8734678483877051, + "grad_norm": 2.0831542015075684, + "learning_rate": 4.111864203469457e-06, + "loss": 0.9668182134628296, + "step": 579, + "token_acc": 0.7317909019744281 + }, + { + "epoch": 0.874976428436734, + "grad_norm": 2.6266393661499023, + "learning_rate": 4.0955973283394525e-06, + "loss": 0.9448071718215942, + "step": 580, + "token_acc": 0.7431223306530506 + }, + { + "epoch": 0.8764850084857628, + "grad_norm": 2.2467446327209473, + "learning_rate": 4.079340343612165e-06, + "loss": 0.8725464344024658, + "step": 581, + "token_acc": 0.7570967601273274 + }, + { + "epoch": 0.8779935885347916, + "grad_norm": 6.732999324798584, + "learning_rate": 4.063093427071376e-06, + "loss": 0.8902009129524231, + "step": 582, + "token_acc": 0.7565396349719279 + }, + { + "epoch": 0.8795021685838205, + "grad_norm": 2.9533133506774902, + "learning_rate": 4.046856756390767e-06, + "loss": 0.8953625559806824, + "step": 583, + "token_acc": 0.7529245922047489 + }, + { + "epoch": 0.8810107486328493, + "grad_norm": 2.3368334770202637, + "learning_rate": 4.03063050913196e-06, + "loss": 0.9594104290008545, + "step": 584, + "token_acc": 0.7335300124815614 + }, + { + "epoch": 0.8825193286818782, + "grad_norm": 3.8714253902435303, + "learning_rate": 4.0144148627426e-06, + "loss": 0.9315723180770874, + "step": 585, + "token_acc": 0.7458793542905693 + }, + { + "epoch": 0.8840279087309071, + "grad_norm": 3.479743719100952, + "learning_rate": 3.998209994554395e-06, + "loss": 0.9150199294090271, + "step": 586, + "token_acc": 0.7499616466376886 + }, + { + "epoch": 0.8855364887799358, + "grad_norm": 5.043886184692383, + "learning_rate": 3.982016081781189e-06, + "loss": 0.9383110404014587, + "step": 587, + "token_acc": 0.7410350748099093 + }, + { + "epoch": 0.8870450688289647, + "grad_norm": 3.2696523666381836, + "learning_rate": 3.965833301517017e-06, + "loss": 0.953903317451477, + "step": 588, + "token_acc": 0.7390840573201027 + }, + { + "epoch": 0.8885536488779936, + "grad_norm": 3.001802921295166, + "learning_rate": 3.949661830734172e-06, + "loss": 0.8968666791915894, + "step": 589, + "token_acc": 0.7531392405063291 + }, + { + "epoch": 0.8900622289270225, + "grad_norm": 3.440985918045044, + "learning_rate": 3.9335018462812664e-06, + "loss": 0.8885819911956787, + "step": 590, + "token_acc": 0.7531771674744211 + }, + { + "epoch": 0.8915708089760513, + "grad_norm": 2.6679365634918213, + "learning_rate": 3.9173535248813026e-06, + "loss": 0.9260917901992798, + "step": 591, + "token_acc": 0.7415974440894568 + }, + { + "epoch": 0.8930793890250801, + "grad_norm": 3.8178305625915527, + "learning_rate": 3.901217043129735e-06, + "loss": 0.9608850479125977, + "step": 592, + "token_acc": 0.7346061819424776 + }, + { + "epoch": 0.894587969074109, + "grad_norm": 2.9948854446411133, + "learning_rate": 3.885092577492543e-06, + "loss": 0.9068381786346436, + "step": 593, + "token_acc": 0.7486899257391061 + }, + { + "epoch": 0.8960965491231379, + "grad_norm": 3.9679934978485107, + "learning_rate": 3.8689803043043e-06, + "loss": 0.9466627240180969, + "step": 594, + "token_acc": 0.7346259615001395 + }, + { + "epoch": 0.8976051291721667, + "grad_norm": 2.963857412338257, + "learning_rate": 3.852880399766243e-06, + "loss": 0.9198077321052551, + "step": 595, + "token_acc": 0.7431665508454945 + }, + { + "epoch": 0.8991137092211956, + "grad_norm": 3.443026542663574, + "learning_rate": 3.8367930399443495e-06, + "loss": 0.910021185874939, + "step": 596, + "token_acc": 0.7510424619721618 + }, + { + "epoch": 0.9006222892702244, + "grad_norm": 3.5184829235076904, + "learning_rate": 3.820718400767409e-06, + "loss": 0.9800831079483032, + "step": 597, + "token_acc": 0.726972148020847 + }, + { + "epoch": 0.9021308693192532, + "grad_norm": 4.325000286102295, + "learning_rate": 3.8046566580251e-06, + "loss": 0.8991461992263794, + "step": 598, + "token_acc": 0.7544071549765623 + }, + { + "epoch": 0.9036394493682821, + "grad_norm": 2.661820411682129, + "learning_rate": 3.7886079873660693e-06, + "loss": 0.9109302163124084, + "step": 599, + "token_acc": 0.7479737728804299 + }, + { + "epoch": 0.905148029417311, + "grad_norm": 3.3297622203826904, + "learning_rate": 3.7725725642960047e-06, + "loss": 0.9115327596664429, + "step": 600, + "token_acc": 0.7492248136663906 + }, + { + "epoch": 0.9066566094663399, + "grad_norm": 2.4643445014953613, + "learning_rate": 3.756550564175727e-06, + "loss": 0.9309358596801758, + "step": 601, + "token_acc": 0.7433721289525183 + }, + { + "epoch": 0.9081651895153686, + "grad_norm": 3.9152750968933105, + "learning_rate": 3.7405421622192607e-06, + "loss": 0.9141098260879517, + "step": 602, + "token_acc": 0.751371936796291 + }, + { + "epoch": 0.9096737695643975, + "grad_norm": 3.1337151527404785, + "learning_rate": 3.7245475334919246e-06, + "loss": 0.9194568395614624, + "step": 603, + "token_acc": 0.7484828270666906 + }, + { + "epoch": 0.9111823496134264, + "grad_norm": 3.381565809249878, + "learning_rate": 3.7085668529084183e-06, + "loss": 0.863186240196228, + "step": 604, + "token_acc": 0.7604264392324094 + }, + { + "epoch": 0.9126909296624552, + "grad_norm": 2.79109263420105, + "learning_rate": 3.6926002952309015e-06, + "loss": 0.9094499349594116, + "step": 605, + "token_acc": 0.7501693727368298 + }, + { + "epoch": 0.9141995097114841, + "grad_norm": 6.591865062713623, + "learning_rate": 3.676648035067093e-06, + "loss": 0.9026281237602234, + "step": 606, + "token_acc": 0.7499577155099509 + }, + { + "epoch": 0.9157080897605129, + "grad_norm": 3.273190975189209, + "learning_rate": 3.6607102468683524e-06, + "loss": 0.8833757042884827, + "step": 607, + "token_acc": 0.7579680794638122 + }, + { + "epoch": 0.9172166698095418, + "grad_norm": 3.022238254547119, + "learning_rate": 3.64478710492778e-06, + "loss": 0.9162262082099915, + "step": 608, + "token_acc": 0.7505774175273456 + }, + { + "epoch": 0.9187252498585706, + "grad_norm": 3.0211308002471924, + "learning_rate": 3.628878783378302e-06, + "loss": 0.9533579349517822, + "step": 609, + "token_acc": 0.7356408197103552 + }, + { + "epoch": 0.9202338299075995, + "grad_norm": 5.029251575469971, + "learning_rate": 3.6129854561907786e-06, + "loss": 0.8781214952468872, + "step": 610, + "token_acc": 0.7561491760624458 + }, + { + "epoch": 0.9217424099566284, + "grad_norm": 4.955198764801025, + "learning_rate": 3.5971072971720844e-06, + "loss": 0.9115267992019653, + "step": 611, + "token_acc": 0.7456792034256823 + }, + { + "epoch": 0.9232509900056571, + "grad_norm": 3.5902044773101807, + "learning_rate": 3.581244479963225e-06, + "loss": 0.9406421184539795, + "step": 612, + "token_acc": 0.7402546191978369 + }, + { + "epoch": 0.924759570054686, + "grad_norm": 3.6855735778808594, + "learning_rate": 3.56539717803743e-06, + "loss": 0.8683364987373352, + "step": 613, + "token_acc": 0.7603832560420029 + }, + { + "epoch": 0.9262681501037149, + "grad_norm": 3.774980306625366, + "learning_rate": 3.5495655646982506e-06, + "loss": 0.8895199298858643, + "step": 614, + "token_acc": 0.7553493305051041 + }, + { + "epoch": 0.9277767301527438, + "grad_norm": 2.4307074546813965, + "learning_rate": 3.533749813077677e-06, + "loss": 0.8801548480987549, + "step": 615, + "token_acc": 0.7533507170795306 + }, + { + "epoch": 0.9292853102017726, + "grad_norm": 4.021278381347656, + "learning_rate": 3.517950096134232e-06, + "loss": 0.8849371671676636, + "step": 616, + "token_acc": 0.7501228691895807 + }, + { + "epoch": 0.9307938902508014, + "grad_norm": 3.663806200027466, + "learning_rate": 3.5021665866510924e-06, + "loss": 0.9281108379364014, + "step": 617, + "token_acc": 0.7432823937899462 + }, + { + "epoch": 0.9323024702998303, + "grad_norm": 3.2338950634002686, + "learning_rate": 3.4863994572341845e-06, + "loss": 0.8774363398551941, + "step": 618, + "token_acc": 0.7592568397397371 + }, + { + "epoch": 0.9338110503488591, + "grad_norm": 4.390686511993408, + "learning_rate": 3.470648880310313e-06, + "loss": 0.9034897089004517, + "step": 619, + "token_acc": 0.749003984063745 + }, + { + "epoch": 0.935319630397888, + "grad_norm": 3.028557062149048, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.9484639763832092, + "step": 620, + "token_acc": 0.7429885894003587 + }, + { + "epoch": 0.9368282104469169, + "grad_norm": 2.4263687133789062, + "learning_rate": 3.4391980727419206e-06, + "loss": 0.903827965259552, + "step": 621, + "token_acc": 0.7547725586310795 + }, + { + "epoch": 0.9383367904959456, + "grad_norm": 2.7173869609832764, + "learning_rate": 3.423498186038393e-06, + "loss": 0.8904889822006226, + "step": 622, + "token_acc": 0.7561929003227126 + }, + { + "epoch": 0.9398453705449745, + "grad_norm": 7.21522331237793, + "learning_rate": 3.4078155397061243e-06, + "loss": 0.8954098224639893, + "step": 623, + "token_acc": 0.753067326528508 + }, + { + "epoch": 0.9413539505940034, + "grad_norm": 3.604496479034424, + "learning_rate": 3.3921503052480243e-06, + "loss": 0.9189679622650146, + "step": 624, + "token_acc": 0.7485604606525912 + }, + { + "epoch": 0.9428625306430323, + "grad_norm": 2.7277839183807373, + "learning_rate": 3.3765026539765832e-06, + "loss": 0.9480217099189758, + "step": 625, + "token_acc": 0.7430719757873829 + }, + { + "epoch": 0.9443711106920611, + "grad_norm": 3.484670400619507, + "learning_rate": 3.3608727570120114e-06, + "loss": 0.8429567813873291, + "step": 626, + "token_acc": 0.7623493577009668 + }, + { + "epoch": 0.9458796907410899, + "grad_norm": 2.702427387237549, + "learning_rate": 3.3452607852803585e-06, + "loss": 0.9287722110748291, + "step": 627, + "token_acc": 0.743661655822302 + }, + { + "epoch": 0.9473882707901188, + "grad_norm": 2.3480427265167236, + "learning_rate": 3.3296669095116454e-06, + "loss": 0.8979330062866211, + "step": 628, + "token_acc": 0.7491542822480928 + }, + { + "epoch": 0.9488968508391477, + "grad_norm": 5.111566543579102, + "learning_rate": 3.3140913002379993e-06, + "loss": 0.8936165571212769, + "step": 629, + "token_acc": 0.753824981785801 + }, + { + "epoch": 0.9504054308881765, + "grad_norm": 3.4013071060180664, + "learning_rate": 3.298534127791785e-06, + "loss": 0.9079458713531494, + "step": 630, + "token_acc": 0.7492551114764204 + }, + { + "epoch": 0.9519140109372054, + "grad_norm": 2.9633424282073975, + "learning_rate": 3.2829955623037536e-06, + "loss": 0.9079474210739136, + "step": 631, + "token_acc": 0.7446587365746622 + }, + { + "epoch": 0.9534225909862342, + "grad_norm": 5.1142096519470215, + "learning_rate": 3.267475773701161e-06, + "loss": 0.9174619913101196, + "step": 632, + "token_acc": 0.7461978848002797 + }, + { + "epoch": 0.954931171035263, + "grad_norm": 3.7804722785949707, + "learning_rate": 3.251974931705933e-06, + "loss": 0.9086030721664429, + "step": 633, + "token_acc": 0.7528448446467445 + }, + { + "epoch": 0.9564397510842919, + "grad_norm": 3.9754841327667236, + "learning_rate": 3.236493205832795e-06, + "loss": 0.9098681211471558, + "step": 634, + "token_acc": 0.7530808695192257 + }, + { + "epoch": 0.9579483311333208, + "grad_norm": 3.0806455612182617, + "learning_rate": 3.2210307653874175e-06, + "loss": 0.9011087417602539, + "step": 635, + "token_acc": 0.7531327792076283 + }, + { + "epoch": 0.9594569111823497, + "grad_norm": 4.351457595825195, + "learning_rate": 3.205587779464576e-06, + "loss": 0.898061990737915, + "step": 636, + "token_acc": 0.7522088353413655 + }, + { + "epoch": 0.9609654912313784, + "grad_norm": 3.589484930038452, + "learning_rate": 3.1901644169462854e-06, + "loss": 0.9120070338249207, + "step": 637, + "token_acc": 0.7481205805731299 + }, + { + "epoch": 0.9624740712804073, + "grad_norm": 2.1719985008239746, + "learning_rate": 3.1747608464999723e-06, + "loss": 0.8444939851760864, + "step": 638, + "token_acc": 0.7606790112391776 + }, + { + "epoch": 0.9639826513294362, + "grad_norm": 2.886596441268921, + "learning_rate": 3.1593772365766107e-06, + "loss": 0.8635545969009399, + "step": 639, + "token_acc": 0.7592871859136382 + }, + { + "epoch": 0.965491231378465, + "grad_norm": 4.322239398956299, + "learning_rate": 3.1440137554088957e-06, + "loss": 0.922818124294281, + "step": 640, + "token_acc": 0.7418484500574053 + }, + { + "epoch": 0.9669998114274939, + "grad_norm": 3.271947145462036, + "learning_rate": 3.128670571009399e-06, + "loss": 0.9313502907752991, + "step": 641, + "token_acc": 0.7419956054304324 + }, + { + "epoch": 0.9685083914765227, + "grad_norm": 4.718305587768555, + "learning_rate": 3.1133478511687217e-06, + "loss": 0.9430618286132812, + "step": 642, + "token_acc": 0.7446289745588265 + }, + { + "epoch": 0.9700169715255516, + "grad_norm": 3.351572275161743, + "learning_rate": 3.0980457634536775e-06, + "loss": 0.8716884851455688, + "step": 643, + "token_acc": 0.7572232339430839 + }, + { + "epoch": 0.9715255515745804, + "grad_norm": 4.42294454574585, + "learning_rate": 3.082764475205442e-06, + "loss": 0.9097411632537842, + "step": 644, + "token_acc": 0.7566200762388818 + }, + { + "epoch": 0.9730341316236093, + "grad_norm": 3.2341248989105225, + "learning_rate": 3.06750415353774e-06, + "loss": 0.9301663041114807, + "step": 645, + "token_acc": 0.7404664573335703 + }, + { + "epoch": 0.9745427116726382, + "grad_norm": 6.189097881317139, + "learning_rate": 3.052264965335e-06, + "loss": 0.9450299143791199, + "step": 646, + "token_acc": 0.7416312275783422 + }, + { + "epoch": 0.9760512917216669, + "grad_norm": 4.670543193817139, + "learning_rate": 3.0370470772505433e-06, + "loss": 0.914315938949585, + "step": 647, + "token_acc": 0.7483432036097011 + }, + { + "epoch": 0.9775598717706958, + "grad_norm": 3.727722406387329, + "learning_rate": 3.02185065570476e-06, + "loss": 0.9281266331672668, + "step": 648, + "token_acc": 0.7453313253012048 + }, + { + "epoch": 0.9790684518197247, + "grad_norm": 2.604159355163574, + "learning_rate": 3.0066758668832752e-06, + "loss": 0.9509884119033813, + "step": 649, + "token_acc": 0.733581292125596 + }, + { + "epoch": 0.9805770318687536, + "grad_norm": 3.8625636100769043, + "learning_rate": 2.991522876735154e-06, + "loss": 0.9405251145362854, + "step": 650, + "token_acc": 0.7416272446986278 + }, + { + "epoch": 0.9820856119177824, + "grad_norm": 4.173559665679932, + "learning_rate": 2.9763918509710647e-06, + "loss": 0.89885014295578, + "step": 651, + "token_acc": 0.7530163477551576 + }, + { + "epoch": 0.9835941919668112, + "grad_norm": 3.9440720081329346, + "learning_rate": 2.9612829550614836e-06, + "loss": 0.8781136274337769, + "step": 652, + "token_acc": 0.7621304445948995 + }, + { + "epoch": 0.9851027720158401, + "grad_norm": 2.9241225719451904, + "learning_rate": 2.9461963542348737e-06, + "loss": 0.9027390480041504, + "step": 653, + "token_acc": 0.7482748774568124 + }, + { + "epoch": 0.9866113520648689, + "grad_norm": 3.344180107116699, + "learning_rate": 2.931132213475884e-06, + "loss": 0.8964638113975525, + "step": 654, + "token_acc": 0.7567957946580792 + }, + { + "epoch": 0.9881199321138978, + "grad_norm": 4.037812232971191, + "learning_rate": 2.9160906975235493e-06, + "loss": 0.9360722303390503, + "step": 655, + "token_acc": 0.7432091979295498 + }, + { + "epoch": 0.9896285121629267, + "grad_norm": 2.4966983795166016, + "learning_rate": 2.9010719708694724e-06, + "loss": 0.9333371520042419, + "step": 656, + "token_acc": 0.7433760724228707 + }, + { + "epoch": 0.9911370922119555, + "grad_norm": 3.449889898300171, + "learning_rate": 2.8860761977560435e-06, + "loss": 0.9346955418586731, + "step": 657, + "token_acc": 0.7471638238794129 + }, + { + "epoch": 0.9926456722609843, + "grad_norm": 2.6565051078796387, + "learning_rate": 2.871103542174637e-06, + "loss": 0.8991419672966003, + "step": 658, + "token_acc": 0.7460418963616318 + }, + { + "epoch": 0.9941542523100132, + "grad_norm": 3.964833974838257, + "learning_rate": 2.8561541678638145e-06, + "loss": 0.9402192831039429, + "step": 659, + "token_acc": 0.7388520971302428 + }, + { + "epoch": 0.9956628323590421, + "grad_norm": 5.085797309875488, + "learning_rate": 2.8412282383075362e-06, + "loss": 0.8859063982963562, + "step": 660, + "token_acc": 0.7513399237732253 + }, + { + "epoch": 0.997171412408071, + "grad_norm": 2.855872392654419, + "learning_rate": 2.826325916733378e-06, + "loss": 0.993620753288269, + "step": 661, + "token_acc": 0.7364008685852506 + }, + { + "epoch": 0.9986799924570997, + "grad_norm": 3.8379390239715576, + "learning_rate": 2.811447366110741e-06, + "loss": 0.796841561794281, + "step": 662, + "token_acc": 0.7757523041048229 + }, + { + "epoch": 1.0, + "grad_norm": 3.184330463409424, + "learning_rate": 2.796592749149071e-06, + "loss": 0.8582974672317505, + "step": 663, + "token_acc": 0.7592161573691718 + }, + { + "epoch": 1.0015085800490289, + "grad_norm": 3.187472343444824, + "learning_rate": 2.7817622282960816e-06, + "loss": 0.7970677614212036, + "step": 664, + "token_acc": 0.7762216825787566 + }, + { + "epoch": 1.0030171600980577, + "grad_norm": 2.451931953430176, + "learning_rate": 2.766955965735968e-06, + "loss": 0.8524621725082397, + "step": 665, + "token_acc": 0.7568122724908997 + }, + { + "epoch": 1.0045257401470866, + "grad_norm": 7.900096416473389, + "learning_rate": 2.7521741233876496e-06, + "loss": 0.8358883857727051, + "step": 666, + "token_acc": 0.7615922173368168 + }, + { + "epoch": 1.0060343201961155, + "grad_norm": 6.724613189697266, + "learning_rate": 2.7374168629029814e-06, + "loss": 0.8463543653488159, + "step": 667, + "token_acc": 0.7601190205787489 + }, + { + "epoch": 1.0075429002451444, + "grad_norm": 3.555211305618286, + "learning_rate": 2.722684345665004e-06, + "loss": 0.833192765712738, + "step": 668, + "token_acc": 0.764784144607056 + }, + { + "epoch": 1.009051480294173, + "grad_norm": 3.272108554840088, + "learning_rate": 2.707976732786166e-06, + "loss": 0.8384916186332703, + "step": 669, + "token_acc": 0.762353316835635 + }, + { + "epoch": 1.010560060343202, + "grad_norm": 2.4124550819396973, + "learning_rate": 2.693294185106562e-06, + "loss": 0.8108358383178711, + "step": 670, + "token_acc": 0.7709801351031481 + }, + { + "epoch": 1.0120686403922308, + "grad_norm": 2.7804059982299805, + "learning_rate": 2.678636863192184e-06, + "loss": 0.8144900798797607, + "step": 671, + "token_acc": 0.7674235334679604 + }, + { + "epoch": 1.0135772204412596, + "grad_norm": 3.606860637664795, + "learning_rate": 2.6640049273331516e-06, + "loss": 0.8778388500213623, + "step": 672, + "token_acc": 0.7499776626161544 + }, + { + "epoch": 1.0150858004902885, + "grad_norm": 4.016855716705322, + "learning_rate": 2.649398537541978e-06, + "loss": 0.8499450087547302, + "step": 673, + "token_acc": 0.7637103744388992 + }, + { + "epoch": 1.0165943805393174, + "grad_norm": 5.047566890716553, + "learning_rate": 2.6348178535517967e-06, + "loss": 0.8457518219947815, + "step": 674, + "token_acc": 0.7608401257122416 + }, + { + "epoch": 1.0181029605883463, + "grad_norm": 2.771507978439331, + "learning_rate": 2.6202630348146323e-06, + "loss": 0.8829108476638794, + "step": 675, + "token_acc": 0.7549261679980634 + }, + { + "epoch": 1.0196115406373751, + "grad_norm": 3.4020674228668213, + "learning_rate": 2.605734240499652e-06, + "loss": 0.8947675228118896, + "step": 676, + "token_acc": 0.7530002181976871 + }, + { + "epoch": 1.021120120686404, + "grad_norm": 3.1996207237243652, + "learning_rate": 2.5912316294914232e-06, + "loss": 0.8434700965881348, + "step": 677, + "token_acc": 0.7647234922708469 + }, + { + "epoch": 1.0226287007354329, + "grad_norm": 5.3903069496154785, + "learning_rate": 2.576755360388177e-06, + "loss": 0.820655345916748, + "step": 678, + "token_acc": 0.7658212161269002 + }, + { + "epoch": 1.0241372807844615, + "grad_norm": 3.5426740646362305, + "learning_rate": 2.562305591500069e-06, + "loss": 0.8218508362770081, + "step": 679, + "token_acc": 0.7697834547450354 + }, + { + "epoch": 1.0256458608334904, + "grad_norm": 2.9941842555999756, + "learning_rate": 2.5478824808474613e-06, + "loss": 0.7744419574737549, + "step": 680, + "token_acc": 0.7788550470419391 + }, + { + "epoch": 1.0271544408825193, + "grad_norm": 6.040406227111816, + "learning_rate": 2.5334861861591753e-06, + "loss": 0.8469799757003784, + "step": 681, + "token_acc": 0.7621353969370637 + }, + { + "epoch": 1.0286630209315482, + "grad_norm": 2.431262254714966, + "learning_rate": 2.5191168648707888e-06, + "loss": 0.8396000862121582, + "step": 682, + "token_acc": 0.7625957677081256 + }, + { + "epoch": 1.030171600980577, + "grad_norm": 2.729520320892334, + "learning_rate": 2.5047746741228977e-06, + "loss": 0.8249145746231079, + "step": 683, + "token_acc": 0.7648589827009805 + }, + { + "epoch": 1.031680181029606, + "grad_norm": 6.145883560180664, + "learning_rate": 2.490459770759398e-06, + "loss": 0.8704252243041992, + "step": 684, + "token_acc": 0.7611961318186086 + }, + { + "epoch": 1.0331887610786348, + "grad_norm": 5.440282344818115, + "learning_rate": 2.476172311325783e-06, + "loss": 0.8612276315689087, + "step": 685, + "token_acc": 0.7515157465818995 + }, + { + "epoch": 1.0346973411276637, + "grad_norm": 3.261218547821045, + "learning_rate": 2.461912452067415e-06, + "loss": 0.8618755340576172, + "step": 686, + "token_acc": 0.7591119418294545 + }, + { + "epoch": 1.0362059211766925, + "grad_norm": 11.558123588562012, + "learning_rate": 2.447680348927837e-06, + "loss": 0.8431553840637207, + "step": 687, + "token_acc": 0.7648332215378987 + }, + { + "epoch": 1.0377145012257212, + "grad_norm": 5.565245628356934, + "learning_rate": 2.433476157547044e-06, + "loss": 0.8032984733581543, + "step": 688, + "token_acc": 0.7730759286391451 + }, + { + "epoch": 1.03922308127475, + "grad_norm": 5.575282096862793, + "learning_rate": 2.4193000332597984e-06, + "loss": 0.8764697909355164, + "step": 689, + "token_acc": 0.7558628667578083 + }, + { + "epoch": 1.040731661323779, + "grad_norm": 37.750762939453125, + "learning_rate": 2.4051521310939258e-06, + "loss": 0.8644957542419434, + "step": 690, + "token_acc": 0.7623723487824038 + }, + { + "epoch": 1.0422402413728078, + "grad_norm": 3.2812461853027344, + "learning_rate": 2.391032605768613e-06, + "loss": 0.8651729822158813, + "step": 691, + "token_acc": 0.7593947036569988 + }, + { + "epoch": 1.0437488214218367, + "grad_norm": 3.353991985321045, + "learning_rate": 2.3769416116927335e-06, + "loss": 0.8115116953849792, + "step": 692, + "token_acc": 0.770557954892466 + }, + { + "epoch": 1.0452574014708655, + "grad_norm": 3.2831902503967285, + "learning_rate": 2.3628793029631353e-06, + "loss": 0.8754620552062988, + "step": 693, + "token_acc": 0.7559664155500604 + }, + { + "epoch": 1.0467659815198944, + "grad_norm": 2.6714625358581543, + "learning_rate": 2.3488458333629777e-06, + "loss": 0.8192331194877625, + "step": 694, + "token_acc": 0.7702919106881405 + }, + { + "epoch": 1.0482745615689233, + "grad_norm": 3.6974844932556152, + "learning_rate": 2.3348413563600324e-06, + "loss": 0.868034303188324, + "step": 695, + "token_acc": 0.7639542533272607 + }, + { + "epoch": 1.0497831416179522, + "grad_norm": 3.0387983322143555, + "learning_rate": 2.320866025105016e-06, + "loss": 0.8625673651695251, + "step": 696, + "token_acc": 0.7572219181414583 + }, + { + "epoch": 1.051291721666981, + "grad_norm": 2.827920913696289, + "learning_rate": 2.3069199924299175e-06, + "loss": 0.871715247631073, + "step": 697, + "token_acc": 0.752157829839704 + }, + { + "epoch": 1.05280030171601, + "grad_norm": 3.024477243423462, + "learning_rate": 2.29300341084631e-06, + "loss": 0.8218001127243042, + "step": 698, + "token_acc": 0.7675170331371942 + }, + { + "epoch": 1.0543088817650386, + "grad_norm": 2.48238205909729, + "learning_rate": 2.2791164325437047e-06, + "loss": 0.8754801750183105, + "step": 699, + "token_acc": 0.7543548635733004 + }, + { + "epoch": 1.0558174618140674, + "grad_norm": 4.160318374633789, + "learning_rate": 2.265259209387867e-06, + "loss": 0.8546444773674011, + "step": 700, + "token_acc": 0.7577334265227408 + }, + { + "epoch": 1.0573260418630963, + "grad_norm": 2.8128459453582764, + "learning_rate": 2.2514318929191707e-06, + "loss": 0.8897029757499695, + "step": 701, + "token_acc": 0.7498883729237363 + }, + { + "epoch": 1.0588346219121252, + "grad_norm": 3.436384916305542, + "learning_rate": 2.2376346343509343e-06, + "loss": 0.8458385467529297, + "step": 702, + "token_acc": 0.7630578294164246 + }, + { + "epoch": 1.060343201961154, + "grad_norm": 2.326026439666748, + "learning_rate": 2.2238675845677663e-06, + "loss": 0.8284354209899902, + "step": 703, + "token_acc": 0.7629417686642294 + }, + { + "epoch": 1.061851782010183, + "grad_norm": 3.5647480487823486, + "learning_rate": 2.2101308941239204e-06, + "loss": 0.8216782212257385, + "step": 704, + "token_acc": 0.7627888084486177 + }, + { + "epoch": 1.0633603620592118, + "grad_norm": 3.763617992401123, + "learning_rate": 2.1964247132416373e-06, + "loss": 0.7774605751037598, + "step": 705, + "token_acc": 0.7752575414607105 + }, + { + "epoch": 1.0648689421082407, + "grad_norm": 3.25126314163208, + "learning_rate": 2.182749191809518e-06, + "loss": 0.8533617854118347, + "step": 706, + "token_acc": 0.7645308431706506 + }, + { + "epoch": 1.0663775221572696, + "grad_norm": 2.5045928955078125, + "learning_rate": 2.1691044793808734e-06, + "loss": 0.8644918203353882, + "step": 707, + "token_acc": 0.7583524690208734 + }, + { + "epoch": 1.0678861022062982, + "grad_norm": 2.3704309463500977, + "learning_rate": 2.1554907251720947e-06, + "loss": 0.7577507495880127, + "step": 708, + "token_acc": 0.7850736019913308 + }, + { + "epoch": 1.069394682255327, + "grad_norm": 3.377476453781128, + "learning_rate": 2.1419080780610123e-06, + "loss": 0.8402142524719238, + "step": 709, + "token_acc": 0.7595628415300546 + }, + { + "epoch": 1.070903262304356, + "grad_norm": 3.135538339614868, + "learning_rate": 2.1283566865852824e-06, + "loss": 0.8431668281555176, + "step": 710, + "token_acc": 0.7657228764720622 + }, + { + "epoch": 1.0724118423533848, + "grad_norm": 3.9141173362731934, + "learning_rate": 2.11483669894075e-06, + "loss": 0.8567923903465271, + "step": 711, + "token_acc": 0.7595979568297907 + }, + { + "epoch": 1.0739204224024137, + "grad_norm": 4.373052597045898, + "learning_rate": 2.1013482629798334e-06, + "loss": 0.8514196872711182, + "step": 712, + "token_acc": 0.7629519386210258 + }, + { + "epoch": 1.0754290024514426, + "grad_norm": 4.593347072601318, + "learning_rate": 2.08789152620991e-06, + "loss": 0.8190202116966248, + "step": 713, + "token_acc": 0.7663320821559614 + }, + { + "epoch": 1.0769375825004714, + "grad_norm": 3.2002806663513184, + "learning_rate": 2.0744666357916925e-06, + "loss": 0.8645066618919373, + "step": 714, + "token_acc": 0.7626430499475763 + }, + { + "epoch": 1.0784461625495003, + "grad_norm": 4.525274753570557, + "learning_rate": 2.061073738537635e-06, + "loss": 0.8638495206832886, + "step": 715, + "token_acc": 0.7594907137016703 + }, + { + "epoch": 1.0799547425985292, + "grad_norm": 3.7141494750976562, + "learning_rate": 2.0477129809103147e-06, + "loss": 0.8441424369812012, + "step": 716, + "token_acc": 0.7623204507578764 + }, + { + "epoch": 1.081463322647558, + "grad_norm": 2.2049951553344727, + "learning_rate": 2.034384509020837e-06, + "loss": 0.8460217714309692, + "step": 717, + "token_acc": 0.7569604259735572 + }, + { + "epoch": 1.082971902696587, + "grad_norm": 5.033911228179932, + "learning_rate": 2.021088468627237e-06, + "loss": 0.860292911529541, + "step": 718, + "token_acc": 0.762498373454782 + }, + { + "epoch": 1.0844804827456156, + "grad_norm": 2.830131769180298, + "learning_rate": 2.0078250051328783e-06, + "loss": 0.85243821144104, + "step": 719, + "token_acc": 0.7572205519710841 + }, + { + "epoch": 1.0859890627946445, + "grad_norm": 7.026813507080078, + "learning_rate": 1.9945942635848745e-06, + "loss": 0.8810179233551025, + "step": 720, + "token_acc": 0.7540685280507461 + }, + { + "epoch": 1.0874976428436733, + "grad_norm": 2.464785575866699, + "learning_rate": 1.981396388672496e-06, + "loss": 0.8897414207458496, + "step": 721, + "token_acc": 0.7537993920972644 + }, + { + "epoch": 1.0890062228927022, + "grad_norm": 2.566727638244629, + "learning_rate": 1.9682315247255897e-06, + "loss": 0.7634376287460327, + "step": 722, + "token_acc": 0.7825126838704968 + }, + { + "epoch": 1.090514802941731, + "grad_norm": 2.9697439670562744, + "learning_rate": 1.9550998157129946e-06, + "loss": 0.8626710176467896, + "step": 723, + "token_acc": 0.7604733578028098 + }, + { + "epoch": 1.09202338299076, + "grad_norm": 4.085690021514893, + "learning_rate": 1.9420014052409793e-06, + "loss": 0.8562154769897461, + "step": 724, + "token_acc": 0.7562053046075857 + }, + { + "epoch": 1.0935319630397888, + "grad_norm": 6.677030086517334, + "learning_rate": 1.928936436551661e-06, + "loss": 0.8358262777328491, + "step": 725, + "token_acc": 0.7659065315315315 + }, + { + "epoch": 1.0950405430888177, + "grad_norm": 3.143918752670288, + "learning_rate": 1.915905052521445e-06, + "loss": 0.8849726915359497, + "step": 726, + "token_acc": 0.7540137851335177 + }, + { + "epoch": 1.0965491231378466, + "grad_norm": 2.5295815467834473, + "learning_rate": 1.9029073956594607e-06, + "loss": 0.853080153465271, + "step": 727, + "token_acc": 0.7596467441093481 + }, + { + "epoch": 1.0980577031868755, + "grad_norm": 3.3259615898132324, + "learning_rate": 1.8899436081059974e-06, + "loss": 0.8066115975379944, + "step": 728, + "token_acc": 0.7691869645122046 + }, + { + "epoch": 1.099566283235904, + "grad_norm": 4.004860877990723, + "learning_rate": 1.877013831630961e-06, + "loss": 0.8355309963226318, + "step": 729, + "token_acc": 0.7663862125118741 + }, + { + "epoch": 1.101074863284933, + "grad_norm": 5.869095802307129, + "learning_rate": 1.864118207632315e-06, + "loss": 0.8594299554824829, + "step": 730, + "token_acc": 0.7633474385522644 + }, + { + "epoch": 1.1025834433339619, + "grad_norm": 2.7965147495269775, + "learning_rate": 1.851256877134538e-06, + "loss": 0.8016583323478699, + "step": 731, + "token_acc": 0.770342006787921 + }, + { + "epoch": 1.1040920233829907, + "grad_norm": 4.566360950469971, + "learning_rate": 1.838429980787081e-06, + "loss": 0.8022979497909546, + "step": 732, + "token_acc": 0.7788270499644337 + }, + { + "epoch": 1.1056006034320196, + "grad_norm": 4.957377910614014, + "learning_rate": 1.825637658862824e-06, + "loss": 0.8630087375640869, + "step": 733, + "token_acc": 0.753685370702041 + }, + { + "epoch": 1.1071091834810485, + "grad_norm": 6.9369587898254395, + "learning_rate": 1.8128800512565514e-06, + "loss": 0.8395781517028809, + "step": 734, + "token_acc": 0.7680890538033395 + }, + { + "epoch": 1.1086177635300773, + "grad_norm": 2.991605520248413, + "learning_rate": 1.8001572974834169e-06, + "loss": 0.8441255688667297, + "step": 735, + "token_acc": 0.7642646919206304 + }, + { + "epoch": 1.1101263435791062, + "grad_norm": 3.7541069984436035, + "learning_rate": 1.7874695366774191e-06, + "loss": 0.8397233486175537, + "step": 736, + "token_acc": 0.7682788762802454 + }, + { + "epoch": 1.111634923628135, + "grad_norm": 3.308645009994507, + "learning_rate": 1.774816907589873e-06, + "loss": 0.8228752017021179, + "step": 737, + "token_acc": 0.7684187916168033 + }, + { + "epoch": 1.113143503677164, + "grad_norm": 2.857769012451172, + "learning_rate": 1.7621995485879062e-06, + "loss": 0.8564232587814331, + "step": 738, + "token_acc": 0.7605115141120388 + }, + { + "epoch": 1.1146520837261926, + "grad_norm": 4.118873119354248, + "learning_rate": 1.749617597652934e-06, + "loss": 0.894856333732605, + "step": 739, + "token_acc": 0.7495407225964482 + }, + { + "epoch": 1.1161606637752215, + "grad_norm": 4.511308193206787, + "learning_rate": 1.7370711923791567e-06, + "loss": 0.8855926394462585, + "step": 740, + "token_acc": 0.7487325863758361 + }, + { + "epoch": 1.1176692438242504, + "grad_norm": 3.6292173862457275, + "learning_rate": 1.7245604699720536e-06, + "loss": 0.8854216933250427, + "step": 741, + "token_acc": 0.7518254301300882 + }, + { + "epoch": 1.1191778238732792, + "grad_norm": 3.734825372695923, + "learning_rate": 1.7120855672468779e-06, + "loss": 0.8717069029808044, + "step": 742, + "token_acc": 0.7563011113440973 + }, + { + "epoch": 1.1206864039223081, + "grad_norm": 2.893608808517456, + "learning_rate": 1.6996466206271679e-06, + "loss": 0.8433588147163391, + "step": 743, + "token_acc": 0.7628889054689248 + }, + { + "epoch": 1.122194983971337, + "grad_norm": 2.988121747970581, + "learning_rate": 1.6872437661432518e-06, + "loss": 0.8239574432373047, + "step": 744, + "token_acc": 0.7671623207145081 + }, + { + "epoch": 1.1237035640203659, + "grad_norm": 3.3282277584075928, + "learning_rate": 1.6748771394307584e-06, + "loss": 0.8222818374633789, + "step": 745, + "token_acc": 0.7659170248640433 + }, + { + "epoch": 1.1252121440693947, + "grad_norm": 4.647844314575195, + "learning_rate": 1.6625468757291379e-06, + "loss": 0.8683135509490967, + "step": 746, + "token_acc": 0.7619553087831034 + }, + { + "epoch": 1.1267207241184236, + "grad_norm": 13.227705955505371, + "learning_rate": 1.6502531098801756e-06, + "loss": 0.8556308746337891, + "step": 747, + "token_acc": 0.7582034968187966 + }, + { + "epoch": 1.1282293041674523, + "grad_norm": 4.089370250701904, + "learning_rate": 1.6379959763265268e-06, + "loss": 0.8338348865509033, + "step": 748, + "token_acc": 0.7636845792054439 + }, + { + "epoch": 1.1297378842164814, + "grad_norm": 6.115414142608643, + "learning_rate": 1.62577560911024e-06, + "loss": 0.8292636871337891, + "step": 749, + "token_acc": 0.7663865965244708 + }, + { + "epoch": 1.13124646426551, + "grad_norm": 4.723997592926025, + "learning_rate": 1.6135921418712959e-06, + "loss": 0.8633147478103638, + "step": 750, + "token_acc": 0.756055589439749 + }, + { + "epoch": 1.1327550443145389, + "grad_norm": 2.888791799545288, + "learning_rate": 1.6014457078461354e-06, + "loss": 0.7982436418533325, + "step": 751, + "token_acc": 0.7717343156225958 + }, + { + "epoch": 1.1342636243635678, + "grad_norm": 4.401319980621338, + "learning_rate": 1.5893364398662175e-06, + "loss": 0.8644959926605225, + "step": 752, + "token_acc": 0.7612230715774844 + }, + { + "epoch": 1.1357722044125966, + "grad_norm": 2.6145591735839844, + "learning_rate": 1.5772644703565564e-06, + "loss": 0.8384586572647095, + "step": 753, + "token_acc": 0.764098412447674 + }, + { + "epoch": 1.1372807844616255, + "grad_norm": 3.062788724899292, + "learning_rate": 1.5652299313342772e-06, + "loss": 0.8414710164070129, + "step": 754, + "token_acc": 0.76557809515146 + }, + { + "epoch": 1.1387893645106544, + "grad_norm": 2.900927782058716, + "learning_rate": 1.5532329544071712e-06, + "loss": 0.8486537933349609, + "step": 755, + "token_acc": 0.7619649007708709 + }, + { + "epoch": 1.1402979445596833, + "grad_norm": 4.382763862609863, + "learning_rate": 1.5412736707722537e-06, + "loss": 0.8625706434249878, + "step": 756, + "token_acc": 0.7596236099230111 + }, + { + "epoch": 1.1418065246087121, + "grad_norm": 3.585167169570923, + "learning_rate": 1.5293522112143371e-06, + "loss": 0.8746465444564819, + "step": 757, + "token_acc": 0.7556544754571703 + }, + { + "epoch": 1.143315104657741, + "grad_norm": 2.879413604736328, + "learning_rate": 1.517468706104589e-06, + "loss": 0.8591663837432861, + "step": 758, + "token_acc": 0.7588272785213512 + }, + { + "epoch": 1.1448236847067697, + "grad_norm": 4.238146781921387, + "learning_rate": 1.505623285399121e-06, + "loss": 0.8582541346549988, + "step": 759, + "token_acc": 0.7639502943100065 + }, + { + "epoch": 1.1463322647557985, + "grad_norm": 3.5048112869262695, + "learning_rate": 1.4938160786375571e-06, + "loss": 0.8248607516288757, + "step": 760, + "token_acc": 0.7667978428800466 + }, + { + "epoch": 1.1478408448048274, + "grad_norm": 3.1696360111236572, + "learning_rate": 1.4820472149416153e-06, + "loss": 0.8439779281616211, + "step": 761, + "token_acc": 0.7629146530286679 + }, + { + "epoch": 1.1493494248538563, + "grad_norm": 3.901721715927124, + "learning_rate": 1.4703168230137072e-06, + "loss": 0.8692623376846313, + "step": 762, + "token_acc": 0.7581974176010873 + }, + { + "epoch": 1.1508580049028851, + "grad_norm": 3.409456491470337, + "learning_rate": 1.4586250311355132e-06, + "loss": 0.8195787072181702, + "step": 763, + "token_acc": 0.7710513796384396 + }, + { + "epoch": 1.152366584951914, + "grad_norm": 2.8862102031707764, + "learning_rate": 1.4469719671666043e-06, + "loss": 0.8796359896659851, + "step": 764, + "token_acc": 0.7540420689113885 + }, + { + "epoch": 1.153875165000943, + "grad_norm": 5.084037780761719, + "learning_rate": 1.4353577585430152e-06, + "loss": 0.8486667275428772, + "step": 765, + "token_acc": 0.7582036420062982 + }, + { + "epoch": 1.1553837450499718, + "grad_norm": 3.567854166030884, + "learning_rate": 1.4237825322758735e-06, + "loss": 0.8368003368377686, + "step": 766, + "token_acc": 0.7696384629945107 + }, + { + "epoch": 1.1568923250990006, + "grad_norm": 3.267861843109131, + "learning_rate": 1.412246414949997e-06, + "loss": 0.8285214900970459, + "step": 767, + "token_acc": 0.7642321887931999 + }, + { + "epoch": 1.1584009051480295, + "grad_norm": 2.8887999057769775, + "learning_rate": 1.4007495327225162e-06, + "loss": 0.8385510444641113, + "step": 768, + "token_acc": 0.7664767000077537 + }, + { + "epoch": 1.1599094851970584, + "grad_norm": 3.019667863845825, + "learning_rate": 1.389292011321498e-06, + "loss": 0.8696415424346924, + "step": 769, + "token_acc": 0.755802391288531 + }, + { + "epoch": 1.161418065246087, + "grad_norm": 3.079453468322754, + "learning_rate": 1.3778739760445552e-06, + "loss": 0.8744579553604126, + "step": 770, + "token_acc": 0.7533591643441415 + }, + { + "epoch": 1.162926645295116, + "grad_norm": 4.008443355560303, + "learning_rate": 1.3664955517574967e-06, + "loss": 0.7590682506561279, + "step": 771, + "token_acc": 0.7804309010832043 + }, + { + "epoch": 1.1644352253441448, + "grad_norm": 3.7360076904296875, + "learning_rate": 1.3551568628929434e-06, + "loss": 0.8400685787200928, + "step": 772, + "token_acc": 0.7590038028484081 + }, + { + "epoch": 1.1659438053931737, + "grad_norm": 3.469125986099243, + "learning_rate": 1.343858033448982e-06, + "loss": 0.9019758701324463, + "step": 773, + "token_acc": 0.7517547510056873 + }, + { + "epoch": 1.1674523854422025, + "grad_norm": 2.4454150199890137, + "learning_rate": 1.3325991869878013e-06, + "loss": 0.8335192203521729, + "step": 774, + "token_acc": 0.7679858133740376 + }, + { + "epoch": 1.1689609654912314, + "grad_norm": 4.562094211578369, + "learning_rate": 1.321380446634342e-06, + "loss": 0.8234320878982544, + "step": 775, + "token_acc": 0.7657508578103327 + }, + { + "epoch": 1.1704695455402603, + "grad_norm": 3.483257532119751, + "learning_rate": 1.3102019350749528e-06, + "loss": 0.8180626630783081, + "step": 776, + "token_acc": 0.7681058798003906 + }, + { + "epoch": 1.1719781255892892, + "grad_norm": 2.529137134552002, + "learning_rate": 1.2990637745560418e-06, + "loss": 0.8504586219787598, + "step": 777, + "token_acc": 0.7623680738786279 + }, + { + "epoch": 1.173486705638318, + "grad_norm": 3.3163459300994873, + "learning_rate": 1.2879660868827508e-06, + "loss": 0.897262454032898, + "step": 778, + "token_acc": 0.7475658317276814 + }, + { + "epoch": 1.1749952856873467, + "grad_norm": 2.71750807762146, + "learning_rate": 1.2769089934176126e-06, + "loss": 0.8242877721786499, + "step": 779, + "token_acc": 0.767124183006536 + }, + { + "epoch": 1.1765038657363756, + "grad_norm": 4.410928726196289, + "learning_rate": 1.2658926150792321e-06, + "loss": 0.8718596696853638, + "step": 780, + "token_acc": 0.7596812063891002 + }, + { + "epoch": 1.1780124457854044, + "grad_norm": 4.910094261169434, + "learning_rate": 1.2549170723409548e-06, + "loss": 0.8207314014434814, + "step": 781, + "token_acc": 0.7684252671779236 + }, + { + "epoch": 1.1795210258344333, + "grad_norm": 3.5763494968414307, + "learning_rate": 1.243982485229559e-06, + "loss": 0.8600786924362183, + "step": 782, + "token_acc": 0.7603266090297791 + }, + { + "epoch": 1.1810296058834622, + "grad_norm": 2.4195189476013184, + "learning_rate": 1.233088973323937e-06, + "loss": 0.8472797274589539, + "step": 783, + "token_acc": 0.7602361985404713 + }, + { + "epoch": 1.182538185932491, + "grad_norm": 3.069864511489868, + "learning_rate": 1.2222366557537911e-06, + "loss": 0.8422991633415222, + "step": 784, + "token_acc": 0.7592582392362067 + }, + { + "epoch": 1.18404676598152, + "grad_norm": 5.273087501525879, + "learning_rate": 1.2114256511983274e-06, + "loss": 0.8604516983032227, + "step": 785, + "token_acc": 0.7597487814023247 + }, + { + "epoch": 1.1855553460305488, + "grad_norm": 4.4869537353515625, + "learning_rate": 1.200656077884958e-06, + "loss": 0.8384960889816284, + "step": 786, + "token_acc": 0.7569379741780913 + }, + { + "epoch": 1.1870639260795777, + "grad_norm": 3.703582525253296, + "learning_rate": 1.189928053588012e-06, + "loss": 0.7874414920806885, + "step": 787, + "token_acc": 0.7767498545827216 + }, + { + "epoch": 1.1885725061286065, + "grad_norm": 3.1036217212677, + "learning_rate": 1.1792416956274443e-06, + "loss": 0.8466930389404297, + "step": 788, + "token_acc": 0.7613376396178172 + }, + { + "epoch": 1.1900810861776354, + "grad_norm": 4.446225643157959, + "learning_rate": 1.1685971208675539e-06, + "loss": 0.8279223442077637, + "step": 789, + "token_acc": 0.7668148654810761 + }, + { + "epoch": 1.191589666226664, + "grad_norm": 6.295124053955078, + "learning_rate": 1.157994445715706e-06, + "loss": 0.870482325553894, + "step": 790, + "token_acc": 0.7551759539154737 + }, + { + "epoch": 1.193098246275693, + "grad_norm": 3.1516034603118896, + "learning_rate": 1.1474337861210543e-06, + "loss": 0.8273830413818359, + "step": 791, + "token_acc": 0.7713691868017667 + }, + { + "epoch": 1.1946068263247218, + "grad_norm": 2.9097278118133545, + "learning_rate": 1.1369152575732823e-06, + "loss": 0.8556360006332397, + "step": 792, + "token_acc": 0.7645148569970541 + }, + { + "epoch": 1.1961154063737507, + "grad_norm": 8.206477165222168, + "learning_rate": 1.1264389751013326e-06, + "loss": 0.8083620071411133, + "step": 793, + "token_acc": 0.7674322878076891 + }, + { + "epoch": 1.1976239864227796, + "grad_norm": 5.194050312042236, + "learning_rate": 1.1160050532721527e-06, + "loss": 0.8563533425331116, + "step": 794, + "token_acc": 0.7565877525945104 + }, + { + "epoch": 1.1991325664718084, + "grad_norm": 3.949861764907837, + "learning_rate": 1.1056136061894386e-06, + "loss": 0.8768431544303894, + "step": 795, + "token_acc": 0.7599239750203638 + }, + { + "epoch": 1.2006411465208373, + "grad_norm": 3.744086980819702, + "learning_rate": 1.095264747492391e-06, + "loss": 0.7995933890342712, + "step": 796, + "token_acc": 0.7738916397149317 + }, + { + "epoch": 1.2021497265698662, + "grad_norm": 4.417436122894287, + "learning_rate": 1.0849585903544707e-06, + "loss": 0.8150638937950134, + "step": 797, + "token_acc": 0.7627850319492232 + }, + { + "epoch": 1.203658306618895, + "grad_norm": 2.5313515663146973, + "learning_rate": 1.0746952474821615e-06, + "loss": 0.8520450592041016, + "step": 798, + "token_acc": 0.7582100974632543 + }, + { + "epoch": 1.2051668866679237, + "grad_norm": 3.427147388458252, + "learning_rate": 1.0644748311137377e-06, + "loss": 0.858730673789978, + "step": 799, + "token_acc": 0.7597467296628418 + }, + { + "epoch": 1.2066754667169526, + "grad_norm": 2.962146043777466, + "learning_rate": 1.0542974530180327e-06, + "loss": 0.8352794647216797, + "step": 800, + "token_acc": 0.7665958273348416 + }, + { + "epoch": 1.2081840467659815, + "grad_norm": 7.250308513641357, + "learning_rate": 1.0441632244932238e-06, + "loss": 0.874833345413208, + "step": 801, + "token_acc": 0.7518474606551786 + }, + { + "epoch": 1.2096926268150103, + "grad_norm": 7.001688480377197, + "learning_rate": 1.0340722563656109e-06, + "loss": 0.8818879127502441, + "step": 802, + "token_acc": 0.7557013173917857 + }, + { + "epoch": 1.2112012068640392, + "grad_norm": 3.1277167797088623, + "learning_rate": 1.0240246589884046e-06, + "loss": 0.8554705381393433, + "step": 803, + "token_acc": 0.7593875726419311 + }, + { + "epoch": 1.212709786913068, + "grad_norm": 4.8815388679504395, + "learning_rate": 1.0140205422405213e-06, + "loss": 0.8261976838111877, + "step": 804, + "token_acc": 0.7640841559477515 + }, + { + "epoch": 1.214218366962097, + "grad_norm": 4.54482889175415, + "learning_rate": 1.0040600155253766e-06, + "loss": 0.872818648815155, + "step": 805, + "token_acc": 0.7557447469206087 + }, + { + "epoch": 1.2157269470111258, + "grad_norm": 4.650764465332031, + "learning_rate": 9.941431877696955e-07, + "loss": 0.7799949645996094, + "step": 806, + "token_acc": 0.7843485818124358 + }, + { + "epoch": 1.2172355270601547, + "grad_norm": 2.976388931274414, + "learning_rate": 9.842701674223187e-07, + "loss": 0.8012617826461792, + "step": 807, + "token_acc": 0.771749455215958 + }, + { + "epoch": 1.2187441071091836, + "grad_norm": 2.73356032371521, + "learning_rate": 9.744410624530148e-07, + "loss": 0.905472993850708, + "step": 808, + "token_acc": 0.7487559413563029 + }, + { + "epoch": 1.2202526871582124, + "grad_norm": 4.403116226196289, + "learning_rate": 9.646559803512995e-07, + "loss": 0.8043153285980225, + "step": 809, + "token_acc": 0.7700889676092755 + }, + { + "epoch": 1.221761267207241, + "grad_norm": 10.302741050720215, + "learning_rate": 9.549150281252633e-07, + "loss": 0.7473083734512329, + "step": 810, + "token_acc": 0.7887751635097257 + }, + { + "epoch": 1.22326984725627, + "grad_norm": 4.096353054046631, + "learning_rate": 9.452183123003999e-07, + "loss": 0.8180208206176758, + "step": 811, + "token_acc": 0.7714132186996239 + }, + { + "epoch": 1.2247784273052988, + "grad_norm": 4.816980361938477, + "learning_rate": 9.355659389184396e-07, + "loss": 0.8434216380119324, + "step": 812, + "token_acc": 0.7582288052664353 + }, + { + "epoch": 1.2262870073543277, + "grad_norm": 3.019615650177002, + "learning_rate": 9.259580135361929e-07, + "loss": 0.8763598203659058, + "step": 813, + "token_acc": 0.749660208712479 + }, + { + "epoch": 1.2277955874033566, + "grad_norm": 3.6115078926086426, + "learning_rate": 9.163946412243896e-07, + "loss": 0.8023242950439453, + "step": 814, + "token_acc": 0.7728122129667754 + }, + { + "epoch": 1.2293041674523855, + "grad_norm": 5.804961681365967, + "learning_rate": 9.068759265665384e-07, + "loss": 0.8712891340255737, + "step": 815, + "token_acc": 0.7567506694361721 + }, + { + "epoch": 1.2308127475014143, + "grad_norm": 4.241905689239502, + "learning_rate": 8.974019736577777e-07, + "loss": 0.7705470323562622, + "step": 816, + "token_acc": 0.7807038192196787 + }, + { + "epoch": 1.2323213275504432, + "grad_norm": 5.071633815765381, + "learning_rate": 8.879728861037385e-07, + "loss": 0.8718618154525757, + "step": 817, + "token_acc": 0.7573382131111056 + }, + { + "epoch": 1.233829907599472, + "grad_norm": 2.425518274307251, + "learning_rate": 8.785887670194137e-07, + "loss": 0.8495047092437744, + "step": 818, + "token_acc": 0.761827406649161 + }, + { + "epoch": 1.2353384876485007, + "grad_norm": 3.667283058166504, + "learning_rate": 8.692497190280225e-07, + "loss": 0.8571314215660095, + "step": 819, + "token_acc": 0.7580032036722691 + }, + { + "epoch": 1.2368470676975296, + "grad_norm": 3.761533737182617, + "learning_rate": 8.599558442598998e-07, + "loss": 0.8726866841316223, + "step": 820, + "token_acc": 0.7479659046881054 + }, + { + "epoch": 1.2383556477465585, + "grad_norm": 3.687664031982422, + "learning_rate": 8.507072443513703e-07, + "loss": 0.8146553039550781, + "step": 821, + "token_acc": 0.7674731511607995 + }, + { + "epoch": 1.2398642277955874, + "grad_norm": 3.2007253170013428, + "learning_rate": 8.415040204436426e-07, + "loss": 0.814734697341919, + "step": 822, + "token_acc": 0.77012391853846 + }, + { + "epoch": 1.2413728078446162, + "grad_norm": 2.594027042388916, + "learning_rate": 8.323462731816962e-07, + "loss": 0.8661234378814697, + "step": 823, + "token_acc": 0.7542809455794321 + }, + { + "epoch": 1.242881387893645, + "grad_norm": 3.325680732727051, + "learning_rate": 8.232341027131885e-07, + "loss": 0.8681249618530273, + "step": 824, + "token_acc": 0.756111659722055 + }, + { + "epoch": 1.244389967942674, + "grad_norm": 8.760191917419434, + "learning_rate": 8.141676086873574e-07, + "loss": 0.8644903898239136, + "step": 825, + "token_acc": 0.76072918452483 + }, + { + "epoch": 1.2458985479917029, + "grad_norm": 2.878784418106079, + "learning_rate": 8.051468902539272e-07, + "loss": 0.8529533743858337, + "step": 826, + "token_acc": 0.7617086860507913 + }, + { + "epoch": 1.2474071280407317, + "grad_norm": 9.790353775024414, + "learning_rate": 7.961720460620321e-07, + "loss": 0.8460848331451416, + "step": 827, + "token_acc": 0.7607271025120924 + }, + { + "epoch": 1.2489157080897606, + "grad_norm": 2.7448785305023193, + "learning_rate": 7.872431742591268e-07, + "loss": 0.8798612952232361, + "step": 828, + "token_acc": 0.7529055168950427 + }, + { + "epoch": 1.2504242881387895, + "grad_norm": 4.78510046005249, + "learning_rate": 7.783603724899258e-07, + "loss": 0.8721102476119995, + "step": 829, + "token_acc": 0.7572503643772867 + }, + { + "epoch": 1.2519328681878181, + "grad_norm": 3.698948383331299, + "learning_rate": 7.695237378953224e-07, + "loss": 0.8097854852676392, + "step": 830, + "token_acc": 0.7746331642727167 + }, + { + "epoch": 1.253441448236847, + "grad_norm": 3.0980825424194336, + "learning_rate": 7.607333671113409e-07, + "loss": 0.8027788400650024, + "step": 831, + "token_acc": 0.7732448595068403 + }, + { + "epoch": 1.2549500282858759, + "grad_norm": 3.929448127746582, + "learning_rate": 7.519893562680663e-07, + "loss": 0.8290728330612183, + "step": 832, + "token_acc": 0.7658323494687131 + }, + { + "epoch": 1.2564586083349047, + "grad_norm": 3.649696111679077, + "learning_rate": 7.432918009885997e-07, + "loss": 0.8624221086502075, + "step": 833, + "token_acc": 0.7556334940815432 + }, + { + "epoch": 1.2579671883839336, + "grad_norm": 3.5921993255615234, + "learning_rate": 7.346407963880137e-07, + "loss": 0.8591498136520386, + "step": 834, + "token_acc": 0.7588964414234306 + }, + { + "epoch": 1.2594757684329625, + "grad_norm": 4.081330299377441, + "learning_rate": 7.260364370723044e-07, + "loss": 0.8541571497917175, + "step": 835, + "token_acc": 0.7597739048516251 + }, + { + "epoch": 1.2609843484819914, + "grad_norm": 3.7765064239501953, + "learning_rate": 7.174788171373731e-07, + "loss": 0.8358518481254578, + "step": 836, + "token_acc": 0.7637771751620168 + }, + { + "epoch": 1.2624929285310202, + "grad_norm": 3.018167018890381, + "learning_rate": 7.089680301679752e-07, + "loss": 0.8271275758743286, + "step": 837, + "token_acc": 0.770699422595054 + }, + { + "epoch": 1.2640015085800491, + "grad_norm": 4.056427955627441, + "learning_rate": 7.005041692367154e-07, + "loss": 0.7969532012939453, + "step": 838, + "token_acc": 0.7798653726582142 + }, + { + "epoch": 1.2655100886290778, + "grad_norm": 3.9504849910736084, + "learning_rate": 6.92087326903022e-07, + "loss": 0.8235559463500977, + "step": 839, + "token_acc": 0.7659294530417894 + }, + { + "epoch": 1.2670186686781069, + "grad_norm": 2.4411401748657227, + "learning_rate": 6.837175952121305e-07, + "loss": 0.8697791695594788, + "step": 840, + "token_acc": 0.7545074024006906 + }, + { + "epoch": 1.2685272487271355, + "grad_norm": 2.85186505317688, + "learning_rate": 6.753950656940905e-07, + "loss": 0.8693240880966187, + "step": 841, + "token_acc": 0.7602805765800511 + }, + { + "epoch": 1.2700358287761644, + "grad_norm": 6.329864025115967, + "learning_rate": 6.671198293627479e-07, + "loss": 0.8074023723602295, + "step": 842, + "token_acc": 0.7734566639411283 + }, + { + "epoch": 1.2715444088251933, + "grad_norm": 6.148924350738525, + "learning_rate": 6.58891976714764e-07, + "loss": 0.8580352067947388, + "step": 843, + "token_acc": 0.7640509131156613 + }, + { + "epoch": 1.2730529888742221, + "grad_norm": 4.2148895263671875, + "learning_rate": 6.507115977286144e-07, + "loss": 0.8754616975784302, + "step": 844, + "token_acc": 0.7552476334202223 + }, + { + "epoch": 1.274561568923251, + "grad_norm": 2.693108320236206, + "learning_rate": 6.425787818636131e-07, + "loss": 0.8185067176818848, + "step": 845, + "token_acc": 0.7702717130198046 + }, + { + "epoch": 1.2760701489722799, + "grad_norm": 5.9335246086120605, + "learning_rate": 6.34493618058935e-07, + "loss": 0.8918846845626831, + "step": 846, + "token_acc": 0.745826737660095 + }, + { + "epoch": 1.2775787290213088, + "grad_norm": 3.1100077629089355, + "learning_rate": 6.264561947326331e-07, + "loss": 0.8468343019485474, + "step": 847, + "token_acc": 0.764616156935434 + }, + { + "epoch": 1.2790873090703374, + "grad_norm": 5.955336093902588, + "learning_rate": 6.184665997806832e-07, + "loss": 0.7954850196838379, + "step": 848, + "token_acc": 0.7730308781662895 + }, + { + "epoch": 1.2805958891193665, + "grad_norm": 2.4171230792999268, + "learning_rate": 6.105249205760128e-07, + "loss": 0.8406091928482056, + "step": 849, + "token_acc": 0.7652835633334109 + }, + { + "epoch": 1.2821044691683952, + "grad_norm": 5.052217960357666, + "learning_rate": 6.026312439675553e-07, + "loss": 0.8464303612709045, + "step": 850, + "token_acc": 0.7590613539721466 + }, + { + "epoch": 1.283613049217424, + "grad_norm": 4.832734107971191, + "learning_rate": 5.947856562792926e-07, + "loss": 0.8748173117637634, + "step": 851, + "token_acc": 0.7504129857071034 + }, + { + "epoch": 1.285121629266453, + "grad_norm": 3.7959144115448, + "learning_rate": 5.869882433093154e-07, + "loss": 0.8418779373168945, + "step": 852, + "token_acc": 0.7641078398741263 + }, + { + "epoch": 1.2866302093154818, + "grad_norm": 4.00180721282959, + "learning_rate": 5.79239090328883e-07, + "loss": 0.8274734020233154, + "step": 853, + "token_acc": 0.7651564783550875 + }, + { + "epoch": 1.2881387893645107, + "grad_norm": 3.8572921752929688, + "learning_rate": 5.715382820814885e-07, + "loss": 0.771088719367981, + "step": 854, + "token_acc": 0.7782687566863575 + }, + { + "epoch": 1.2896473694135395, + "grad_norm": 5.561359882354736, + "learning_rate": 5.63885902781941e-07, + "loss": 0.8117709755897522, + "step": 855, + "token_acc": 0.7662643917325566 + }, + { + "epoch": 1.2911559494625684, + "grad_norm": 2.937364339828491, + "learning_rate": 5.562820361154315e-07, + "loss": 0.8516515493392944, + "step": 856, + "token_acc": 0.7598392521731534 + }, + { + "epoch": 1.2926645295115973, + "grad_norm": 17.711872100830078, + "learning_rate": 5.487267652366291e-07, + "loss": 0.8287796974182129, + "step": 857, + "token_acc": 0.7678439323780036 + }, + { + "epoch": 1.2941731095606261, + "grad_norm": 2.5553131103515625, + "learning_rate": 5.412201727687644e-07, + "loss": 0.807916522026062, + "step": 858, + "token_acc": 0.7689954318762835 + }, + { + "epoch": 1.2956816896096548, + "grad_norm": 3.8752667903900146, + "learning_rate": 5.337623408027293e-07, + "loss": 0.8370561599731445, + "step": 859, + "token_acc": 0.7621591417076442 + }, + { + "epoch": 1.297190269658684, + "grad_norm": 2.635545492172241, + "learning_rate": 5.263533508961827e-07, + "loss": 0.8436602354049683, + "step": 860, + "token_acc": 0.760495642129581 + }, + { + "epoch": 1.2986988497077125, + "grad_norm": 2.683295726776123, + "learning_rate": 5.189932840726486e-07, + "loss": 0.8545613288879395, + "step": 861, + "token_acc": 0.7611358574610245 + }, + { + "epoch": 1.3002074297567414, + "grad_norm": 3.9508755207061768, + "learning_rate": 5.116822208206396e-07, + "loss": 0.8280319571495056, + "step": 862, + "token_acc": 0.7689847082076355 + }, + { + "epoch": 1.3017160098057703, + "grad_norm": 2.514185667037964, + "learning_rate": 5.044202410927707e-07, + "loss": 0.8647287487983704, + "step": 863, + "token_acc": 0.7533407970272266 + }, + { + "epoch": 1.3032245898547992, + "grad_norm": 2.995802879333496, + "learning_rate": 4.972074243048896e-07, + "loss": 0.8936266899108887, + "step": 864, + "token_acc": 0.7542240995998222 + }, + { + "epoch": 1.304733169903828, + "grad_norm": 2.855926036834717, + "learning_rate": 4.900438493352056e-07, + "loss": 0.8403551578521729, + "step": 865, + "token_acc": 0.756351188824128 + }, + { + "epoch": 1.306241749952857, + "grad_norm": 3.368074893951416, + "learning_rate": 4.829295945234258e-07, + "loss": 0.8307728171348572, + "step": 866, + "token_acc": 0.7635684309315957 + }, + { + "epoch": 1.3077503300018858, + "grad_norm": 2.812657117843628, + "learning_rate": 4.758647376699033e-07, + "loss": 0.8736594915390015, + "step": 867, + "token_acc": 0.7541351398909026 + }, + { + "epoch": 1.3092589100509147, + "grad_norm": 4.58750057220459, + "learning_rate": 4.6884935603477733e-07, + "loss": 0.8768450617790222, + "step": 868, + "token_acc": 0.7514803898138559 + }, + { + "epoch": 1.3107674900999435, + "grad_norm": 3.9196205139160156, + "learning_rate": 4.6188352633713964e-07, + "loss": 0.7879647016525269, + "step": 869, + "token_acc": 0.7731294927609598 + }, + { + "epoch": 1.3122760701489722, + "grad_norm": 3.082662582397461, + "learning_rate": 4.549673247541875e-07, + "loss": 0.7827639579772949, + "step": 870, + "token_acc": 0.7759933951209119 + }, + { + "epoch": 1.313784650198001, + "grad_norm": 4.24887228012085, + "learning_rate": 4.48100826920394e-07, + "loss": 0.8440124988555908, + "step": 871, + "token_acc": 0.7589064561679668 + }, + { + "epoch": 1.31529323024703, + "grad_norm": 10.108877182006836, + "learning_rate": 4.412841079266778e-07, + "loss": 0.8269444108009338, + "step": 872, + "token_acc": 0.7706411723781952 + }, + { + "epoch": 1.3168018102960588, + "grad_norm": 2.904810667037964, + "learning_rate": 4.345172423195865e-07, + "loss": 0.8416088223457336, + "step": 873, + "token_acc": 0.7620063440241889 + }, + { + "epoch": 1.3183103903450877, + "grad_norm": 2.08392071723938, + "learning_rate": 4.27800304100478e-07, + "loss": 0.8817130327224731, + "step": 874, + "token_acc": 0.7527659911964151 + }, + { + "epoch": 1.3198189703941166, + "grad_norm": 3.8090550899505615, + "learning_rate": 4.211333667247125e-07, + "loss": 0.7908709645271301, + "step": 875, + "token_acc": 0.7730251701757164 + }, + { + "epoch": 1.3213275504431454, + "grad_norm": 3.8741047382354736, + "learning_rate": 4.1451650310085076e-07, + "loss": 0.8834118247032166, + "step": 876, + "token_acc": 0.7499632587076863 + }, + { + "epoch": 1.3228361304921743, + "grad_norm": 2.829456329345703, + "learning_rate": 4.079497855898501e-07, + "loss": 0.8954581618309021, + "step": 877, + "token_acc": 0.7525333549946185 + }, + { + "epoch": 1.3243447105412032, + "grad_norm": 2.7152304649353027, + "learning_rate": 4.01433286004283e-07, + "loss": 0.8786543607711792, + "step": 878, + "token_acc": 0.7550847257901063 + }, + { + "epoch": 1.3258532905902318, + "grad_norm": 2.7631213665008545, + "learning_rate": 3.949670756075447e-07, + "loss": 0.8465378880500793, + "step": 879, + "token_acc": 0.7597265746696722 + }, + { + "epoch": 1.327361870639261, + "grad_norm": 2.524502754211426, + "learning_rate": 3.885512251130763e-07, + "loss": 0.8189669847488403, + "step": 880, + "token_acc": 0.7640774492178621 + }, + { + "epoch": 1.3288704506882896, + "grad_norm": 3.4909603595733643, + "learning_rate": 3.8218580468359136e-07, + "loss": 0.8377684354782104, + "step": 881, + "token_acc": 0.7637885045603167 + }, + { + "epoch": 1.3303790307373184, + "grad_norm": 3.5993642807006836, + "learning_rate": 3.7587088393030604e-07, + "loss": 0.8345588445663452, + "step": 882, + "token_acc": 0.7684170249161584 + }, + { + "epoch": 1.3318876107863473, + "grad_norm": 5.203431129455566, + "learning_rate": 3.6960653191218333e-07, + "loss": 0.8536242246627808, + "step": 883, + "token_acc": 0.7607238208246812 + }, + { + "epoch": 1.3333961908353762, + "grad_norm": 3.583754539489746, + "learning_rate": 3.6339281713517304e-07, + "loss": 0.8044613599777222, + "step": 884, + "token_acc": 0.7735486755924981 + }, + { + "epoch": 1.334904770884405, + "grad_norm": 4.450930595397949, + "learning_rate": 3.572298075514652e-07, + "loss": 0.777694821357727, + "step": 885, + "token_acc": 0.7796333384686489 + }, + { + "epoch": 1.336413350933434, + "grad_norm": 4.322178840637207, + "learning_rate": 3.511175705587433e-07, + "loss": 0.7915257811546326, + "step": 886, + "token_acc": 0.7769786337646705 + }, + { + "epoch": 1.3379219309824628, + "grad_norm": 3.398308753967285, + "learning_rate": 3.450561729994534e-07, + "loss": 0.8078945875167847, + "step": 887, + "token_acc": 0.7683210894104999 + }, + { + "epoch": 1.3394305110314917, + "grad_norm": 5.275265216827393, + "learning_rate": 3.390456811600673e-07, + "loss": 0.8685917854309082, + "step": 888, + "token_acc": 0.7592981063612655 + }, + { + "epoch": 1.3409390910805206, + "grad_norm": 3.8297252655029297, + "learning_rate": 3.3308616077036113e-07, + "loss": 0.8269513845443726, + "step": 889, + "token_acc": 0.7657146715279184 + }, + { + "epoch": 1.3424476711295492, + "grad_norm": 3.2783734798431396, + "learning_rate": 3.271776770026963e-07, + "loss": 0.8315811157226562, + "step": 890, + "token_acc": 0.7701571692953623 + }, + { + "epoch": 1.343956251178578, + "grad_norm": 3.1422719955444336, + "learning_rate": 3.213202944713023e-07, + "loss": 0.8674135208129883, + "step": 891, + "token_acc": 0.7578109292693715 + }, + { + "epoch": 1.345464831227607, + "grad_norm": 3.3541738986968994, + "learning_rate": 3.1551407723157734e-07, + "loss": 0.851649284362793, + "step": 892, + "token_acc": 0.7599915811136316 + }, + { + "epoch": 1.3469734112766358, + "grad_norm": 2.806809425354004, + "learning_rate": 3.0975908877938277e-07, + "loss": 0.8486818075180054, + "step": 893, + "token_acc": 0.7669441141498217 + }, + { + "epoch": 1.3484819913256647, + "grad_norm": 2.431014060974121, + "learning_rate": 3.040553920503503e-07, + "loss": 0.7670931220054626, + "step": 894, + "token_acc": 0.7825619668166857 + }, + { + "epoch": 1.3499905713746936, + "grad_norm": 2.5104012489318848, + "learning_rate": 2.984030494191942e-07, + "loss": 0.8774336576461792, + "step": 895, + "token_acc": 0.7549957522254644 + }, + { + "epoch": 1.3514991514237225, + "grad_norm": 10.018961906433105, + "learning_rate": 2.928021226990263e-07, + "loss": 0.8030972480773926, + "step": 896, + "token_acc": 0.7725771935188016 + }, + { + "epoch": 1.3530077314727513, + "grad_norm": 3.1026413440704346, + "learning_rate": 2.8725267314068496e-07, + "loss": 0.8295409679412842, + "step": 897, + "token_acc": 0.7697932036910609 + }, + { + "epoch": 1.3545163115217802, + "grad_norm": 2.5442681312561035, + "learning_rate": 2.817547614320615e-07, + "loss": 0.8227270841598511, + "step": 898, + "token_acc": 0.7606663514349107 + }, + { + "epoch": 1.3560248915708089, + "grad_norm": 4.6583733558654785, + "learning_rate": 2.763084476974376e-07, + "loss": 0.8517241477966309, + "step": 899, + "token_acc": 0.7605916005378187 + }, + { + "epoch": 1.357533471619838, + "grad_norm": 4.3161091804504395, + "learning_rate": 2.7091379149682683e-07, + "loss": 0.8690881729125977, + "step": 900, + "token_acc": 0.7529733997927257 + }, + { + "epoch": 1.3590420516688666, + "grad_norm": 4.815860748291016, + "learning_rate": 2.655708518253258e-07, + "loss": 0.8184032440185547, + "step": 901, + "token_acc": 0.7689133425034388 + }, + { + "epoch": 1.3605506317178955, + "grad_norm": 2.5258986949920654, + "learning_rate": 2.602796871124663e-07, + "loss": 0.8093873262405396, + "step": 902, + "token_acc": 0.7734018144694466 + }, + { + "epoch": 1.3620592117669243, + "grad_norm": 3.851839780807495, + "learning_rate": 2.5504035522157853e-07, + "loss": 0.8221151828765869, + "step": 903, + "token_acc": 0.766597640562249 + }, + { + "epoch": 1.3635677918159532, + "grad_norm": 4.92925500869751, + "learning_rate": 2.4985291344915675e-07, + "loss": 0.8636224269866943, + "step": 904, + "token_acc": 0.7594956293030092 + }, + { + "epoch": 1.365076371864982, + "grad_norm": 3.1369941234588623, + "learning_rate": 2.447174185242324e-07, + "loss": 0.8100435733795166, + "step": 905, + "token_acc": 0.7732822599952118 + }, + { + "epoch": 1.366584951914011, + "grad_norm": 3.5626163482666016, + "learning_rate": 2.3963392660775576e-07, + "loss": 0.7843802571296692, + "step": 906, + "token_acc": 0.7725212623097583 + }, + { + "epoch": 1.3680935319630398, + "grad_norm": 2.8909037113189697, + "learning_rate": 2.3460249329197825e-07, + "loss": 0.7857382893562317, + "step": 907, + "token_acc": 0.7768698389426705 + }, + { + "epoch": 1.3696021120120687, + "grad_norm": 4.476231098175049, + "learning_rate": 2.296231735998511e-07, + "loss": 0.7972747087478638, + "step": 908, + "token_acc": 0.7760357361828851 + }, + { + "epoch": 1.3711106920610976, + "grad_norm": 15.696163177490234, + "learning_rate": 2.2469602198441575e-07, + "loss": 0.8219046592712402, + "step": 909, + "token_acc": 0.7695260026866244 + }, + { + "epoch": 1.3726192721101262, + "grad_norm": 2.2422847747802734, + "learning_rate": 2.198210923282118e-07, + "loss": 0.8584589958190918, + "step": 910, + "token_acc": 0.7560702338460553 + }, + { + "epoch": 1.3741278521591551, + "grad_norm": 11.26689338684082, + "learning_rate": 2.149984379426906e-07, + "loss": 0.8486065864562988, + "step": 911, + "token_acc": 0.7583096134422918 + }, + { + "epoch": 1.375636432208184, + "grad_norm": 2.494154930114746, + "learning_rate": 2.102281115676258e-07, + "loss": 0.8555142879486084, + "step": 912, + "token_acc": 0.7623042954636692 + }, + { + "epoch": 1.3771450122572129, + "grad_norm": 3.414808750152588, + "learning_rate": 2.0551016537054492e-07, + "loss": 0.9207150936126709, + "step": 913, + "token_acc": 0.7424716786817713 + }, + { + "epoch": 1.3786535923062417, + "grad_norm": 3.5631885528564453, + "learning_rate": 2.008446509461498e-07, + "loss": 0.858719527721405, + "step": 914, + "token_acc": 0.7597096427553236 + }, + { + "epoch": 1.3801621723552706, + "grad_norm": 2.443528175354004, + "learning_rate": 1.962316193157593e-07, + "loss": 0.8557185530662537, + "step": 915, + "token_acc": 0.7586619591885584 + }, + { + "epoch": 1.3816707524042995, + "grad_norm": 3.3436384201049805, + "learning_rate": 1.91671120926748e-07, + "loss": 0.9479650259017944, + "step": 916, + "token_acc": 0.7379957138916591 + }, + { + "epoch": 1.3831793324533284, + "grad_norm": 2.9120945930480957, + "learning_rate": 1.871632056519962e-07, + "loss": 0.8470411896705627, + "step": 917, + "token_acc": 0.7576958499623231 + }, + { + "epoch": 1.3846879125023572, + "grad_norm": 2.4719932079315186, + "learning_rate": 1.8270792278934302e-07, + "loss": 0.8297110795974731, + "step": 918, + "token_acc": 0.7656576480042862 + }, + { + "epoch": 1.3861964925513859, + "grad_norm": 3.310103178024292, + "learning_rate": 1.7830532106104747e-07, + "loss": 0.8285752534866333, + "step": 919, + "token_acc": 0.7657645647247033 + }, + { + "epoch": 1.387705072600415, + "grad_norm": 3.489499807357788, + "learning_rate": 1.7395544861325718e-07, + "loss": 0.7715452909469604, + "step": 920, + "token_acc": 0.7812409881764876 + }, + { + "epoch": 1.3892136526494436, + "grad_norm": 3.3134560585021973, + "learning_rate": 1.696583530154794e-07, + "loss": 0.8156480193138123, + "step": 921, + "token_acc": 0.7750934884549178 + }, + { + "epoch": 1.3907222326984725, + "grad_norm": 4.780351161956787, + "learning_rate": 1.6541408126006464e-07, + "loss": 0.8397789001464844, + "step": 922, + "token_acc": 0.7659860542196929 + }, + { + "epoch": 1.3922308127475014, + "grad_norm": 2.9415769577026367, + "learning_rate": 1.6122267976168783e-07, + "loss": 0.8513314723968506, + "step": 923, + "token_acc": 0.7588572045798229 + }, + { + "epoch": 1.3937393927965303, + "grad_norm": 3.065779685974121, + "learning_rate": 1.5708419435684463e-07, + "loss": 0.8995948433876038, + "step": 924, + "token_acc": 0.7465725912083689 + }, + { + "epoch": 1.3952479728455591, + "grad_norm": 1.9850691556930542, + "learning_rate": 1.5299867030334815e-07, + "loss": 0.7551571130752563, + "step": 925, + "token_acc": 0.7806480925568339 + }, + { + "epoch": 1.396756552894588, + "grad_norm": 2.341355562210083, + "learning_rate": 1.4896615227983468e-07, + "loss": 0.8150415420532227, + "step": 926, + "token_acc": 0.7664070518625853 + }, + { + "epoch": 1.3982651329436169, + "grad_norm": 8.531405448913574, + "learning_rate": 1.4498668438527597e-07, + "loss": 0.8344970941543579, + "step": 927, + "token_acc": 0.7640511964385086 + }, + { + "epoch": 1.3997737129926457, + "grad_norm": 3.261359930038452, + "learning_rate": 1.4106031013849498e-07, + "loss": 0.8661531209945679, + "step": 928, + "token_acc": 0.7587449901581805 + }, + { + "epoch": 1.4012822930416746, + "grad_norm": 8.634267807006836, + "learning_rate": 1.3718707247769137e-07, + "loss": 0.7901025414466858, + "step": 929, + "token_acc": 0.7815322427840586 + }, + { + "epoch": 1.4027908730907033, + "grad_norm": 8.976533889770508, + "learning_rate": 1.333670137599713e-07, + "loss": 0.8702472448348999, + "step": 930, + "token_acc": 0.7581097316780137 + }, + { + "epoch": 1.4042994531397321, + "grad_norm": 4.943149566650391, + "learning_rate": 1.2960017576088445e-07, + "loss": 0.8625393509864807, + "step": 931, + "token_acc": 0.7572367882574415 + }, + { + "epoch": 1.405808033188761, + "grad_norm": 2.5458521842956543, + "learning_rate": 1.2588659967396998e-07, + "loss": 0.8305867910385132, + "step": 932, + "token_acc": 0.7635211267605634 + }, + { + "epoch": 1.40731661323779, + "grad_norm": 2.928887367248535, + "learning_rate": 1.222263261102985e-07, + "loss": 0.842563271522522, + "step": 933, + "token_acc": 0.7656107185054729 + }, + { + "epoch": 1.4088251932868188, + "grad_norm": 2.913442850112915, + "learning_rate": 1.1861939509803688e-07, + "loss": 0.8048484921455383, + "step": 934, + "token_acc": 0.7720449328647748 + }, + { + "epoch": 1.4103337733358476, + "grad_norm": 3.2029519081115723, + "learning_rate": 1.1506584608200366e-07, + "loss": 0.8218167424201965, + "step": 935, + "token_acc": 0.76690415639378 + }, + { + "epoch": 1.4118423533848765, + "grad_norm": 2.4202632904052734, + "learning_rate": 1.1156571792324212e-07, + "loss": 0.843273937702179, + "step": 936, + "token_acc": 0.7680688785865362 + }, + { + "epoch": 1.4133509334339054, + "grad_norm": 2.484635829925537, + "learning_rate": 1.0811904889859337e-07, + "loss": 0.8005064725875854, + "step": 937, + "token_acc": 0.7738100012030317 + }, + { + "epoch": 1.4148595134829343, + "grad_norm": 14.440755844116211, + "learning_rate": 1.0472587670027678e-07, + "loss": 0.8771236538887024, + "step": 938, + "token_acc": 0.751779743285514 + }, + { + "epoch": 1.416368093531963, + "grad_norm": 2.2059826850891113, + "learning_rate": 1.0138623843548078e-07, + "loss": 0.8186941742897034, + "step": 939, + "token_acc": 0.7674638263665595 + }, + { + "epoch": 1.417876673580992, + "grad_norm": 4.5751752853393555, + "learning_rate": 9.810017062595322e-08, + "loss": 0.8284302949905396, + "step": 940, + "token_acc": 0.7646645690527025 + }, + { + "epoch": 1.4193852536300207, + "grad_norm": 2.81125545501709, + "learning_rate": 9.486770920760668e-08, + "loss": 0.8492136597633362, + "step": 941, + "token_acc": 0.7658085126374142 + }, + { + "epoch": 1.4208938336790495, + "grad_norm": 14.796782493591309, + "learning_rate": 9.16888895301199e-08, + "loss": 0.8663207292556763, + "step": 942, + "token_acc": 0.7557719160657242 + }, + { + "epoch": 1.4224024137280784, + "grad_norm": 3.0281903743743896, + "learning_rate": 8.856374635655696e-08, + "loss": 0.7829236388206482, + "step": 943, + "token_acc": 0.7745779026509535 + }, + { + "epoch": 1.4239109937771073, + "grad_norm": 9.952474594116211, + "learning_rate": 8.549231386298151e-08, + "loss": 0.8628077507019043, + "step": 944, + "token_acc": 0.756022709835814 + }, + { + "epoch": 1.4254195738261362, + "grad_norm": 3.9230077266693115, + "learning_rate": 8.247462563808816e-08, + "loss": 0.8170371055603027, + "step": 945, + "token_acc": 0.7700100050025013 + }, + { + "epoch": 1.426928153875165, + "grad_norm": 3.16102933883667, + "learning_rate": 7.951071468283166e-08, + "loss": 0.8196001648902893, + "step": 946, + "token_acc": 0.7661214005677978 + }, + { + "epoch": 1.428436733924194, + "grad_norm": 3.1428802013397217, + "learning_rate": 7.660061341006719e-08, + "loss": 0.8285311460494995, + "step": 947, + "token_acc": 0.7673301451897807 + }, + { + "epoch": 1.4299453139732228, + "grad_norm": 5.604135036468506, + "learning_rate": 7.374435364419675e-08, + "loss": 0.7732332348823547, + "step": 948, + "token_acc": 0.7785789658811115 + }, + { + "epoch": 1.4314538940222516, + "grad_norm": 3.621756076812744, + "learning_rate": 7.094196662081832e-08, + "loss": 0.815258264541626, + "step": 949, + "token_acc": 0.7698474605650892 + }, + { + "epoch": 1.4329624740712803, + "grad_norm": 3.9680275917053223, + "learning_rate": 6.819348298638839e-08, + "loss": 0.8352375030517578, + "step": 950, + "token_acc": 0.7614864864864865 + }, + { + "epoch": 1.4344710541203094, + "grad_norm": 4.738731861114502, + "learning_rate": 6.549893279788278e-08, + "loss": 0.85414719581604, + "step": 951, + "token_acc": 0.76351890832547 + }, + { + "epoch": 1.435979634169338, + "grad_norm": 5.8890380859375, + "learning_rate": 6.285834552247127e-08, + "loss": 0.8063274025917053, + "step": 952, + "token_acc": 0.7660111401614187 + }, + { + "epoch": 1.437488214218367, + "grad_norm": 4.071278095245361, + "learning_rate": 6.027175003719354e-08, + "loss": 0.8338813185691833, + "step": 953, + "token_acc": 0.764085623678647 + }, + { + "epoch": 1.4389967942673958, + "grad_norm": 3.590146541595459, + "learning_rate": 5.773917462864265e-08, + "loss": 0.8335298895835876, + "step": 954, + "token_acc": 0.7642306279838413 + }, + { + "epoch": 1.4405053743164247, + "grad_norm": 10.704903602600098, + "learning_rate": 5.526064699265754e-08, + "loss": 0.8400973677635193, + "step": 955, + "token_acc": 0.766653357290211 + }, + { + "epoch": 1.4420139543654535, + "grad_norm": 4.129668712615967, + "learning_rate": 5.2836194234019976e-08, + "loss": 0.880071759223938, + "step": 956, + "token_acc": 0.7548260675552452 + }, + { + "epoch": 1.4435225344144824, + "grad_norm": 4.481297016143799, + "learning_rate": 5.0465842866156965e-08, + "loss": 0.8310449123382568, + "step": 957, + "token_acc": 0.7720488874207981 + }, + { + "epoch": 1.4450311144635113, + "grad_norm": 5.460820198059082, + "learning_rate": 4.8149618810850454e-08, + "loss": 0.8569669723510742, + "step": 958, + "token_acc": 0.7667159426881965 + }, + { + "epoch": 1.44653969451254, + "grad_norm": 8.950236320495605, + "learning_rate": 4.588754739795587e-08, + "loss": 0.8569900989532471, + "step": 959, + "token_acc": 0.757593263270158 + }, + { + "epoch": 1.448048274561569, + "grad_norm": 3.055006504058838, + "learning_rate": 4.367965336512403e-08, + "loss": 0.885501503944397, + "step": 960, + "token_acc": 0.7566585039509227 + }, + { + "epoch": 1.4495568546105977, + "grad_norm": 2.483043670654297, + "learning_rate": 4.1525960857530244e-08, + "loss": 0.8487317562103271, + "step": 961, + "token_acc": 0.7577910027516478 + }, + { + "epoch": 1.4510654346596266, + "grad_norm": 3.392090082168579, + "learning_rate": 3.9426493427611177e-08, + "loss": 0.8799946904182434, + "step": 962, + "token_acc": 0.7589928935993553 + }, + { + "epoch": 1.4525740147086554, + "grad_norm": 3.358323812484741, + "learning_rate": 3.738127403480507e-08, + "loss": 0.8986912965774536, + "step": 963, + "token_acc": 0.7477453651053926 + }, + { + "epoch": 1.4540825947576843, + "grad_norm": 4.571354389190674, + "learning_rate": 3.5390325045304704e-08, + "loss": 0.7936071157455444, + "step": 964, + "token_acc": 0.7799789781555616 + }, + { + "epoch": 1.4555911748067132, + "grad_norm": 2.9029417037963867, + "learning_rate": 3.345366823180929e-08, + "loss": 0.7989702820777893, + "step": 965, + "token_acc": 0.7716986097101004 + }, + { + "epoch": 1.457099754855742, + "grad_norm": 4.157406330108643, + "learning_rate": 3.1571324773286284e-08, + "loss": 0.8275780081748962, + "step": 966, + "token_acc": 0.7638347002149852 + }, + { + "epoch": 1.458608334904771, + "grad_norm": 4.188841819763184, + "learning_rate": 2.9743315254743834e-08, + "loss": 0.8145856857299805, + "step": 967, + "token_acc": 0.7700161700161701 + }, + { + "epoch": 1.4601169149537998, + "grad_norm": 8.615144729614258, + "learning_rate": 2.7969659666999273e-08, + "loss": 0.8375093936920166, + "step": 968, + "token_acc": 0.7654456532588083 + }, + { + "epoch": 1.4616254950028287, + "grad_norm": 3.2567293643951416, + "learning_rate": 2.625037740646763e-08, + "loss": 0.866114616394043, + "step": 969, + "token_acc": 0.7610168026054126 + }, + { + "epoch": 1.4631340750518573, + "grad_norm": 7.408751010894775, + "learning_rate": 2.4585487274942922e-08, + "loss": 0.8234032392501831, + "step": 970, + "token_acc": 0.7666519108750185 + }, + { + "epoch": 1.4646426551008864, + "grad_norm": 2.693723201751709, + "learning_rate": 2.2975007479397736e-08, + "loss": 0.8300590515136719, + "step": 971, + "token_acc": 0.7684509416801615 + }, + { + "epoch": 1.466151235149915, + "grad_norm": 3.6981606483459473, + "learning_rate": 2.1418955631781203e-08, + "loss": 0.8962347507476807, + "step": 972, + "token_acc": 0.7447123065550707 + }, + { + "epoch": 1.467659815198944, + "grad_norm": 4.05355167388916, + "learning_rate": 1.9917348748826337e-08, + "loss": 0.8538215160369873, + "step": 973, + "token_acc": 0.7578759114045949 + }, + { + "epoch": 1.4691683952479728, + "grad_norm": 4.54546594619751, + "learning_rate": 1.847020325186577e-08, + "loss": 0.8844298720359802, + "step": 974, + "token_acc": 0.752757244408757 + }, + { + "epoch": 1.4706769752970017, + "grad_norm": 3.000606060028076, + "learning_rate": 1.7077534966650767e-08, + "loss": 0.818281888961792, + "step": 975, + "token_acc": 0.7663689741945033 + }, + { + "epoch": 1.4721855553460306, + "grad_norm": 3.2969634532928467, + "learning_rate": 1.5739359123178587e-08, + "loss": 0.8062618374824524, + "step": 976, + "token_acc": 0.7695586975149957 + }, + { + "epoch": 1.4736941353950594, + "grad_norm": 7.097475051879883, + "learning_rate": 1.4455690355525964e-08, + "loss": 0.8039494752883911, + "step": 977, + "token_acc": 0.7709600083388346 + }, + { + "epoch": 1.4752027154440883, + "grad_norm": 4.352001667022705, + "learning_rate": 1.3226542701689215e-08, + "loss": 0.8472201228141785, + "step": 978, + "token_acc": 0.7611474739210472 + }, + { + "epoch": 1.476711295493117, + "grad_norm": 3.6690895557403564, + "learning_rate": 1.2051929603428824e-08, + "loss": 0.8614902496337891, + "step": 979, + "token_acc": 0.7538421632838337 + }, + { + "epoch": 1.478219875542146, + "grad_norm": 2.307662010192871, + "learning_rate": 1.0931863906127327e-08, + "loss": 0.8532673120498657, + "step": 980, + "token_acc": 0.7596543610226801 + }, + { + "epoch": 1.4797284555911747, + "grad_norm": 4.878684997558594, + "learning_rate": 9.866357858642206e-09, + "loss": 0.8487564325332642, + "step": 981, + "token_acc": 0.7645466128689743 + }, + { + "epoch": 1.4812370356402036, + "grad_norm": 5.448169231414795, + "learning_rate": 8.855423113177664e-09, + "loss": 0.8541351556777954, + "step": 982, + "token_acc": 0.7630805909148963 + }, + { + "epoch": 1.4827456156892325, + "grad_norm": 3.5828020572662354, + "learning_rate": 7.899070725153612e-09, + "loss": 0.8462401032447815, + "step": 983, + "token_acc": 0.7620738265545868 + }, + { + "epoch": 1.4842541957382613, + "grad_norm": 3.930011034011841, + "learning_rate": 6.997311153086883e-09, + "loss": 0.7914619445800781, + "step": 984, + "token_acc": 0.7749241116600203 + }, + { + "epoch": 1.4857627757872902, + "grad_norm": 2.846635580062866, + "learning_rate": 6.150154258476315e-09, + "loss": 0.8032582998275757, + "step": 985, + "token_acc": 0.7732157619178837 + }, + { + "epoch": 1.487271355836319, + "grad_norm": 3.6364030838012695, + "learning_rate": 5.357609305692291e-09, + "loss": 0.8917088508605957, + "step": 986, + "token_acc": 0.7493186813186813 + }, + { + "epoch": 1.488779935885348, + "grad_norm": 2.6039035320281982, + "learning_rate": 4.619684961881255e-09, + "loss": 0.8190131187438965, + "step": 987, + "token_acc": 0.7705410821643287 + }, + { + "epoch": 1.4902885159343768, + "grad_norm": 4.62582540512085, + "learning_rate": 3.936389296864129e-09, + "loss": 0.8465819358825684, + "step": 988, + "token_acc": 0.7601476014760148 + }, + { + "epoch": 1.4917970959834057, + "grad_norm": 13.716168403625488, + "learning_rate": 3.307729783054159e-09, + "loss": 0.8190443515777588, + "step": 989, + "token_acc": 0.7658335603377826 + }, + { + "epoch": 1.4933056760324344, + "grad_norm": 2.91422700881958, + "learning_rate": 2.7337132953697555e-09, + "loss": 0.8826557993888855, + "step": 990, + "token_acc": 0.7508792342068382 + }, + { + "epoch": 1.4948142560814635, + "grad_norm": 2.7613391876220703, + "learning_rate": 2.214346111164556e-09, + "loss": 0.847295343875885, + "step": 991, + "token_acc": 0.7569343623443111 + }, + { + "epoch": 1.496322836130492, + "grad_norm": 4.2528977394104, + "learning_rate": 1.749633910153592e-09, + "loss": 0.7847687005996704, + "step": 992, + "token_acc": 0.7772279949419979 + }, + { + "epoch": 1.497831416179521, + "grad_norm": 3.5503010749816895, + "learning_rate": 1.3395817743561135e-09, + "loss": 0.8109261989593506, + "step": 993, + "token_acc": 0.7656372165754496 + }, + { + "epoch": 1.4993399962285499, + "grad_norm": 2.5933306217193604, + "learning_rate": 9.841941880361917e-10, + "loss": 0.8115319013595581, + "step": 994, + "token_acc": 0.7727254115142085 + }, + { + "epoch": 1.5008485762775787, + "grad_norm": 3.881438970565796, + "learning_rate": 6.834750376549793e-10, + "loss": 0.8471834659576416, + "step": 995, + "token_acc": 0.7675790414478829 + }, + { + "epoch": 1.5023571563266076, + "grad_norm": 2.716616630554199, + "learning_rate": 4.374276118301879e-10, + "loss": 0.7983828783035278, + "step": 996, + "token_acc": 0.7698837464356219 + }, + { + "epoch": 1.5038657363756365, + "grad_norm": 3.5377018451690674, + "learning_rate": 2.4605460129556446e-10, + "loss": 0.8160542249679565, + "step": 997, + "token_acc": 0.7730439375267222 + }, + { + "epoch": 1.5053743164246653, + "grad_norm": 2.696662664413452, + "learning_rate": 1.0935809887702154e-10, + "loss": 0.8060298562049866, + "step": 998, + "token_acc": 0.773387288740776 + }, + { + "epoch": 1.506882896473694, + "grad_norm": 2.9544644355773926, + "learning_rate": 2.733959946432663e-11, + "loss": 0.8253732919692993, + "step": 999, + "token_acc": 0.767141668679387 + }, + { + "epoch": 1.508391476522723, + "grad_norm": 7.276932239532471, + "learning_rate": 0.0, + "loss": 0.8830540180206299, + "step": 1000, + "token_acc": 0.7554379776601999 + } + ], + "logging_steps": 1, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.4963572226947285e+19, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}