diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8365 @@ +{ + "best_global_step": 1038, + "best_metric": 0.5740059, + "best_model_checkpoint": "/mnt/gpfs/shenyujiong/output/qwen3-vl-8b-int-sft-merged-nv5592-third3000-full-3epoch/v0-20251226-140741/checkpoint-1038", + "epoch": 3.0, + "eval_steps": 500, + "global_step": 1038, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.002890173410404624, + "grad_norm": 6.073309605336921, + "learning_rate": 1.923076923076923e-08, + "loss": 0.8852723240852356, + "step": 1, + "token_acc": 0.7513407750453963 + }, + { + "epoch": 0.005780346820809248, + "grad_norm": 5.632770918536085, + "learning_rate": 3.846153846153846e-08, + "loss": 0.8229959607124329, + "step": 2, + "token_acc": 0.7648557050426209 + }, + { + "epoch": 0.008670520231213872, + "grad_norm": 5.550843708913173, + "learning_rate": 5.7692307692307695e-08, + "loss": 0.8395601511001587, + "step": 3, + "token_acc": 0.7611515500814708 + }, + { + "epoch": 0.011560693641618497, + "grad_norm": 5.463688271600264, + "learning_rate": 7.692307692307692e-08, + "loss": 0.8262450695037842, + "step": 4, + "token_acc": 0.7617775757231346 + }, + { + "epoch": 0.014450867052023121, + "grad_norm": 5.208733348546384, + "learning_rate": 9.615384615384616e-08, + "loss": 0.7870609760284424, + "step": 5, + "token_acc": 0.7738227378472486 + }, + { + "epoch": 0.017341040462427744, + "grad_norm": 6.094089600000965, + "learning_rate": 1.1538461538461539e-07, + "loss": 0.890167236328125, + "step": 6, + "token_acc": 0.7463134620800402 + }, + { + "epoch": 0.02023121387283237, + "grad_norm": 5.511558073866942, + "learning_rate": 1.346153846153846e-07, + "loss": 0.8200665712356567, + "step": 7, + "token_acc": 0.7655801718674399 + }, + { + "epoch": 0.023121387283236993, + "grad_norm": 5.840135867020467, + "learning_rate": 1.5384615384615385e-07, + "loss": 0.8561823964118958, + "step": 8, + "token_acc": 0.7551989061787877 + }, + { + "epoch": 0.02601156069364162, + "grad_norm": 4.93074237263625, + "learning_rate": 1.7307692307692305e-07, + "loss": 0.7908620834350586, + "step": 9, + "token_acc": 0.7736331966727492 + }, + { + "epoch": 0.028901734104046242, + "grad_norm": 5.513250434452228, + "learning_rate": 1.9230769230769231e-07, + "loss": 0.8536443710327148, + "step": 10, + "token_acc": 0.7537275655775426 + }, + { + "epoch": 0.031791907514450865, + "grad_norm": 5.6890026898261254, + "learning_rate": 2.1153846153846152e-07, + "loss": 0.8860396146774292, + "step": 11, + "token_acc": 0.7444433233394834 + }, + { + "epoch": 0.03468208092485549, + "grad_norm": 5.204460891865508, + "learning_rate": 2.3076923076923078e-07, + "loss": 0.8523805141448975, + "step": 12, + "token_acc": 0.7516541745600307 + }, + { + "epoch": 0.03757225433526012, + "grad_norm": 5.727537830602335, + "learning_rate": 2.5e-07, + "loss": 0.8715107440948486, + "step": 13, + "token_acc": 0.7483992966857977 + }, + { + "epoch": 0.04046242774566474, + "grad_norm": 5.573759954820184, + "learning_rate": 2.692307692307692e-07, + "loss": 0.8587294220924377, + "step": 14, + "token_acc": 0.752293881658215 + }, + { + "epoch": 0.04335260115606936, + "grad_norm": 5.626217493866761, + "learning_rate": 2.884615384615384e-07, + "loss": 0.8353704810142517, + "step": 15, + "token_acc": 0.7603716874100415 + }, + { + "epoch": 0.046242774566473986, + "grad_norm": 5.780174641621012, + "learning_rate": 3.076923076923077e-07, + "loss": 0.8726707100868225, + "step": 16, + "token_acc": 0.750940308255944 + }, + { + "epoch": 0.049132947976878616, + "grad_norm": 4.3328681597964875, + "learning_rate": 3.269230769230769e-07, + "loss": 0.718013346195221, + "step": 17, + "token_acc": 0.7931623195891079 + }, + { + "epoch": 0.05202312138728324, + "grad_norm": 5.47302287757926, + "learning_rate": 3.461538461538461e-07, + "loss": 0.8578764200210571, + "step": 18, + "token_acc": 0.7521628365412952 + }, + { + "epoch": 0.05491329479768786, + "grad_norm": 5.003969625540578, + "learning_rate": 3.6538461538461534e-07, + "loss": 0.8133180737495422, + "step": 19, + "token_acc": 0.7619723575896223 + }, + { + "epoch": 0.057803468208092484, + "grad_norm": 5.6946171227062115, + "learning_rate": 3.8461538461538463e-07, + "loss": 0.8691498041152954, + "step": 20, + "token_acc": 0.7492446732183174 + }, + { + "epoch": 0.06069364161849711, + "grad_norm": 5.520197357593707, + "learning_rate": 4.0384615384615386e-07, + "loss": 0.907565712928772, + "step": 21, + "token_acc": 0.739601049536876 + }, + { + "epoch": 0.06358381502890173, + "grad_norm": 4.583439446754697, + "learning_rate": 4.2307692307692304e-07, + "loss": 0.8114128708839417, + "step": 22, + "token_acc": 0.7639052404881551 + }, + { + "epoch": 0.06647398843930635, + "grad_norm": 4.920313367321747, + "learning_rate": 4.423076923076923e-07, + "loss": 0.8422179222106934, + "step": 23, + "token_acc": 0.7567072154640894 + }, + { + "epoch": 0.06936416184971098, + "grad_norm": 5.263032949222765, + "learning_rate": 4.6153846153846156e-07, + "loss": 0.8715439438819885, + "step": 24, + "token_acc": 0.7464963254144591 + }, + { + "epoch": 0.07225433526011561, + "grad_norm": 4.870068302475069, + "learning_rate": 4.807692307692307e-07, + "loss": 0.8316457271575928, + "step": 25, + "token_acc": 0.7582423573346417 + }, + { + "epoch": 0.07514450867052024, + "grad_norm": 4.199216776916685, + "learning_rate": 5e-07, + "loss": 0.7344825267791748, + "step": 26, + "token_acc": 0.7839733369517283 + }, + { + "epoch": 0.07803468208092486, + "grad_norm": 4.588333481721223, + "learning_rate": 5.192307692307692e-07, + "loss": 0.8012775182723999, + "step": 27, + "token_acc": 0.767028959599571 + }, + { + "epoch": 0.08092485549132948, + "grad_norm": 4.194674553902997, + "learning_rate": 5.384615384615384e-07, + "loss": 0.712963879108429, + "step": 28, + "token_acc": 0.7923740483107238 + }, + { + "epoch": 0.0838150289017341, + "grad_norm": 4.053747357354017, + "learning_rate": 5.576923076923077e-07, + "loss": 0.7496437430381775, + "step": 29, + "token_acc": 0.7814042116577906 + }, + { + "epoch": 0.08670520231213873, + "grad_norm": 3.718069447981091, + "learning_rate": 5.769230769230768e-07, + "loss": 0.7818017601966858, + "step": 30, + "token_acc": 0.7700440596977877 + }, + { + "epoch": 0.08959537572254335, + "grad_norm": 3.420080175405301, + "learning_rate": 5.961538461538461e-07, + "loss": 0.7861907482147217, + "step": 31, + "token_acc": 0.7627405151738911 + }, + { + "epoch": 0.09248554913294797, + "grad_norm": 2.639800184791621, + "learning_rate": 6.153846153846154e-07, + "loss": 0.6684123277664185, + "step": 32, + "token_acc": 0.7977043354655295 + }, + { + "epoch": 0.0953757225433526, + "grad_norm": 2.9502697501210413, + "learning_rate": 6.346153846153845e-07, + "loss": 0.7446445226669312, + "step": 33, + "token_acc": 0.771793289625916 + }, + { + "epoch": 0.09826589595375723, + "grad_norm": 2.8110101894954345, + "learning_rate": 6.538461538461538e-07, + "loss": 0.7382901906967163, + "step": 34, + "token_acc": 0.7770551133606955 + }, + { + "epoch": 0.10115606936416185, + "grad_norm": 2.9797000830123226, + "learning_rate": 6.730769230769231e-07, + "loss": 0.7384837865829468, + "step": 35, + "token_acc": 0.7742859974561853 + }, + { + "epoch": 0.10404624277456648, + "grad_norm": 2.7709890477908177, + "learning_rate": 6.923076923076922e-07, + "loss": 0.7289628982543945, + "step": 36, + "token_acc": 0.7765123239561783 + }, + { + "epoch": 0.1069364161849711, + "grad_norm": 2.59015685758215, + "learning_rate": 7.115384615384616e-07, + "loss": 0.7290064096450806, + "step": 37, + "token_acc": 0.7784733624454149 + }, + { + "epoch": 0.10982658959537572, + "grad_norm": 2.8646835764259233, + "learning_rate": 7.307692307692307e-07, + "loss": 0.7594764828681946, + "step": 38, + "token_acc": 0.7671359481427088 + }, + { + "epoch": 0.11271676300578035, + "grad_norm": 2.349168631790223, + "learning_rate": 7.5e-07, + "loss": 0.72218257188797, + "step": 39, + "token_acc": 0.7804759091596026 + }, + { + "epoch": 0.11560693641618497, + "grad_norm": 2.511985129172397, + "learning_rate": 7.692307692307693e-07, + "loss": 0.7277328968048096, + "step": 40, + "token_acc": 0.7780124249072232 + }, + { + "epoch": 0.11849710982658959, + "grad_norm": 2.5792884120122235, + "learning_rate": 7.884615384615384e-07, + "loss": 0.7460165619850159, + "step": 41, + "token_acc": 0.7733394615523893 + }, + { + "epoch": 0.12138728323699421, + "grad_norm": 1.5451971118538999, + "learning_rate": 8.076923076923077e-07, + "loss": 0.7386133670806885, + "step": 42, + "token_acc": 0.772020024353944 + }, + { + "epoch": 0.12427745664739884, + "grad_norm": 1.3982437840218045, + "learning_rate": 8.269230769230768e-07, + "loss": 0.7192668914794922, + "step": 43, + "token_acc": 0.7744656594039339 + }, + { + "epoch": 0.12716763005780346, + "grad_norm": 1.4772019806138394, + "learning_rate": 8.461538461538461e-07, + "loss": 0.6977580189704895, + "step": 44, + "token_acc": 0.7803848372212253 + }, + { + "epoch": 0.13005780346820808, + "grad_norm": 1.426662829341362, + "learning_rate": 8.653846153846154e-07, + "loss": 0.6999402642250061, + "step": 45, + "token_acc": 0.778780228821366 + }, + { + "epoch": 0.1329479768786127, + "grad_norm": 1.4168889938692493, + "learning_rate": 8.846153846153846e-07, + "loss": 0.7392410635948181, + "step": 46, + "token_acc": 0.7691270558007607 + }, + { + "epoch": 0.13583815028901733, + "grad_norm": 1.4711907038839338, + "learning_rate": 9.038461538461538e-07, + "loss": 0.7351399660110474, + "step": 47, + "token_acc": 0.7670848343481196 + }, + { + "epoch": 0.13872832369942195, + "grad_norm": 1.2965845227191142, + "learning_rate": 9.230769230769231e-07, + "loss": 0.7003874778747559, + "step": 48, + "token_acc": 0.7787950748052811 + }, + { + "epoch": 0.1416184971098266, + "grad_norm": 1.292104981035939, + "learning_rate": 9.423076923076923e-07, + "loss": 0.7326341867446899, + "step": 49, + "token_acc": 0.7685059219819624 + }, + { + "epoch": 0.14450867052023122, + "grad_norm": 1.2291132980421766, + "learning_rate": 9.615384615384615e-07, + "loss": 0.6871765851974487, + "step": 50, + "token_acc": 0.7841781074662453 + }, + { + "epoch": 0.14739884393063585, + "grad_norm": 1.123170268506369, + "learning_rate": 9.807692307692306e-07, + "loss": 0.6960352659225464, + "step": 51, + "token_acc": 0.7801758979708864 + }, + { + "epoch": 0.15028901734104047, + "grad_norm": 1.00691295990528, + "learning_rate": 1e-06, + "loss": 0.6956222653388977, + "step": 52, + "token_acc": 0.7829201628190622 + }, + { + "epoch": 0.1531791907514451, + "grad_norm": 0.9370942178938112, + "learning_rate": 9.999974620354198e-07, + "loss": 0.6958713531494141, + "step": 53, + "token_acc": 0.7809317408675194 + }, + { + "epoch": 0.15606936416184972, + "grad_norm": 1.1057401423493767, + "learning_rate": 9.999898481674446e-07, + "loss": 0.7062472105026245, + "step": 54, + "token_acc": 0.7756643140884724 + }, + { + "epoch": 0.15895953757225434, + "grad_norm": 0.8619542832761329, + "learning_rate": 9.999771584733693e-07, + "loss": 0.6577130556106567, + "step": 55, + "token_acc": 0.7922278867707445 + }, + { + "epoch": 0.16184971098265896, + "grad_norm": 0.9166807116221914, + "learning_rate": 9.999593930820181e-07, + "loss": 0.6945655941963196, + "step": 56, + "token_acc": 0.77725851438142 + }, + { + "epoch": 0.16473988439306358, + "grad_norm": 0.939862155697591, + "learning_rate": 9.999365521737421e-07, + "loss": 0.6921431422233582, + "step": 57, + "token_acc": 0.7773106126184057 + }, + { + "epoch": 0.1676300578034682, + "grad_norm": 0.9756834016584089, + "learning_rate": 9.999086359804195e-07, + "loss": 0.7256878018379211, + "step": 58, + "token_acc": 0.7686141412007078 + }, + { + "epoch": 0.17052023121387283, + "grad_norm": 0.8557348808489443, + "learning_rate": 9.99875644785451e-07, + "loss": 0.6813135147094727, + "step": 59, + "token_acc": 0.7843321803650282 + }, + { + "epoch": 0.17341040462427745, + "grad_norm": 0.8266352802865822, + "learning_rate": 9.998375789237592e-07, + "loss": 0.6513127088546753, + "step": 60, + "token_acc": 0.7914724403689247 + }, + { + "epoch": 0.17630057803468208, + "grad_norm": 0.8497866635296994, + "learning_rate": 9.99794438781783e-07, + "loss": 0.6605720520019531, + "step": 61, + "token_acc": 0.78915683493063 + }, + { + "epoch": 0.1791907514450867, + "grad_norm": 0.8351298607584619, + "learning_rate": 9.99746224797475e-07, + "loss": 0.6266233325004578, + "step": 62, + "token_acc": 0.7963586246917163 + }, + { + "epoch": 0.18208092485549132, + "grad_norm": 0.9019491097127296, + "learning_rate": 9.996929374602968e-07, + "loss": 0.6673212647438049, + "step": 63, + "token_acc": 0.7844323603274962 + }, + { + "epoch": 0.18497109826589594, + "grad_norm": 0.8813921264261143, + "learning_rate": 9.996345773112138e-07, + "loss": 0.7036587595939636, + "step": 64, + "token_acc": 0.7740703997187025 + }, + { + "epoch": 0.18786127167630057, + "grad_norm": 0.8869002415681166, + "learning_rate": 9.995711449426901e-07, + "loss": 0.6981368064880371, + "step": 65, + "token_acc": 0.7753412151954072 + }, + { + "epoch": 0.1907514450867052, + "grad_norm": 0.7752119383387671, + "learning_rate": 9.99502640998682e-07, + "loss": 0.6600744724273682, + "step": 66, + "token_acc": 0.788013646851561 + }, + { + "epoch": 0.1936416184971098, + "grad_norm": 0.8616071421748983, + "learning_rate": 9.99429066174632e-07, + "loss": 0.6547806262969971, + "step": 67, + "token_acc": 0.7894853017554794 + }, + { + "epoch": 0.19653179190751446, + "grad_norm": 0.8018562843868764, + "learning_rate": 9.993504212174613e-07, + "loss": 0.6278072595596313, + "step": 68, + "token_acc": 0.7972202882855006 + }, + { + "epoch": 0.1994219653179191, + "grad_norm": 0.7473736558335493, + "learning_rate": 9.992667069255618e-07, + "loss": 0.6237850785255432, + "step": 69, + "token_acc": 0.7982735792533637 + }, + { + "epoch": 0.2023121387283237, + "grad_norm": 0.6999587458869299, + "learning_rate": 9.991779241487899e-07, + "loss": 0.6401976346969604, + "step": 70, + "token_acc": 0.7928364264997928 + }, + { + "epoch": 0.20520231213872833, + "grad_norm": 0.6924984079683673, + "learning_rate": 9.990840737884554e-07, + "loss": 0.6805769205093384, + "step": 71, + "token_acc": 0.7801177818172763 + }, + { + "epoch": 0.20809248554913296, + "grad_norm": 0.7111004746445246, + "learning_rate": 9.989851567973138e-07, + "loss": 0.697790801525116, + "step": 72, + "token_acc": 0.7760267430754537 + }, + { + "epoch": 0.21098265895953758, + "grad_norm": 0.6869871346194354, + "learning_rate": 9.988811741795566e-07, + "loss": 0.6186888217926025, + "step": 73, + "token_acc": 0.7994626021789282 + }, + { + "epoch": 0.2138728323699422, + "grad_norm": 0.6177183453130074, + "learning_rate": 9.987721269908005e-07, + "loss": 0.5868158340454102, + "step": 74, + "token_acc": 0.8114196656276566 + }, + { + "epoch": 0.21676300578034682, + "grad_norm": 0.6307801092890282, + "learning_rate": 9.98658016338077e-07, + "loss": 0.6723257303237915, + "step": 75, + "token_acc": 0.7827200467097494 + }, + { + "epoch": 0.21965317919075145, + "grad_norm": 0.6150476355618669, + "learning_rate": 9.985388433798215e-07, + "loss": 0.6530448198318481, + "step": 76, + "token_acc": 0.7907922080887895 + }, + { + "epoch": 0.22254335260115607, + "grad_norm": 0.5940300278296939, + "learning_rate": 9.984146093258608e-07, + "loss": 0.6855973601341248, + "step": 77, + "token_acc": 0.7784828714678302 + }, + { + "epoch": 0.2254335260115607, + "grad_norm": 0.9497443806056196, + "learning_rate": 9.982853154374013e-07, + "loss": 0.6745576858520508, + "step": 78, + "token_acc": 0.7854156213413614 + }, + { + "epoch": 0.22832369942196531, + "grad_norm": 0.6791196750467849, + "learning_rate": 9.981509630270167e-07, + "loss": 0.6383039951324463, + "step": 79, + "token_acc": 0.7940166430627679 + }, + { + "epoch": 0.23121387283236994, + "grad_norm": 0.6194193913683183, + "learning_rate": 9.980115534586333e-07, + "loss": 0.6046701669692993, + "step": 80, + "token_acc": 0.8031263032947291 + }, + { + "epoch": 0.23410404624277456, + "grad_norm": 0.584941512318404, + "learning_rate": 9.978670881475172e-07, + "loss": 0.6113057136535645, + "step": 81, + "token_acc": 0.8002890249696458 + }, + { + "epoch": 0.23699421965317918, + "grad_norm": 0.576070429321087, + "learning_rate": 9.9771756856026e-07, + "loss": 0.6508547067642212, + "step": 82, + "token_acc": 0.7917054316809551 + }, + { + "epoch": 0.2398843930635838, + "grad_norm": 0.5782915674069733, + "learning_rate": 9.975629962147633e-07, + "loss": 0.6592724323272705, + "step": 83, + "token_acc": 0.7841273280945267 + }, + { + "epoch": 0.24277456647398843, + "grad_norm": 0.5894596908351907, + "learning_rate": 9.974033726802235e-07, + "loss": 0.5925013422966003, + "step": 84, + "token_acc": 0.8060771521769962 + }, + { + "epoch": 0.24566473988439305, + "grad_norm": 0.5279159216055382, + "learning_rate": 9.972386995771164e-07, + "loss": 0.6444322466850281, + "step": 85, + "token_acc": 0.7914691943127962 + }, + { + "epoch": 0.24855491329479767, + "grad_norm": 0.5809453095781784, + "learning_rate": 9.970689785771798e-07, + "loss": 0.6508707404136658, + "step": 86, + "token_acc": 0.7889469472867465 + }, + { + "epoch": 0.2514450867052023, + "grad_norm": 0.6715617527059413, + "learning_rate": 9.968942114033973e-07, + "loss": 0.5962953567504883, + "step": 87, + "token_acc": 0.8063397578524576 + }, + { + "epoch": 0.2543352601156069, + "grad_norm": 0.6155392081496504, + "learning_rate": 9.967143998299802e-07, + "loss": 0.6590582132339478, + "step": 88, + "token_acc": 0.786015653473848 + }, + { + "epoch": 0.25722543352601157, + "grad_norm": 0.6351340196244468, + "learning_rate": 9.965295456823507e-07, + "loss": 0.6178431510925293, + "step": 89, + "token_acc": 0.799615789600598 + }, + { + "epoch": 0.26011560693641617, + "grad_norm": 0.6389337976646079, + "learning_rate": 9.963396508371217e-07, + "loss": 0.6065088510513306, + "step": 90, + "token_acc": 0.8027006050850137 + }, + { + "epoch": 0.2630057803468208, + "grad_norm": 0.5682640638544528, + "learning_rate": 9.961447172220785e-07, + "loss": 0.6684330105781555, + "step": 91, + "token_acc": 0.7839487407338119 + }, + { + "epoch": 0.2658959537572254, + "grad_norm": 0.6029647051880634, + "learning_rate": 9.959447468161596e-07, + "loss": 0.6358112096786499, + "step": 92, + "token_acc": 0.7908192833685276 + }, + { + "epoch": 0.26878612716763006, + "grad_norm": 0.5632656008285092, + "learning_rate": 9.957397416494366e-07, + "loss": 0.6601473093032837, + "step": 93, + "token_acc": 0.7853722190438847 + }, + { + "epoch": 0.27167630057803466, + "grad_norm": 0.6013944385740286, + "learning_rate": 9.955297038030926e-07, + "loss": 0.668410062789917, + "step": 94, + "token_acc": 0.7828623747800797 + }, + { + "epoch": 0.2745664739884393, + "grad_norm": 0.5541440784608198, + "learning_rate": 9.95314635409402e-07, + "loss": 0.6117832660675049, + "step": 95, + "token_acc": 0.7995787198241185 + }, + { + "epoch": 0.2774566473988439, + "grad_norm": 0.6314740935897156, + "learning_rate": 9.95094538651709e-07, + "loss": 0.6261177062988281, + "step": 96, + "token_acc": 0.7962018726778723 + }, + { + "epoch": 0.28034682080924855, + "grad_norm": 0.7158918907846333, + "learning_rate": 9.948694157644042e-07, + "loss": 0.6556503772735596, + "step": 97, + "token_acc": 0.7869902468442614 + }, + { + "epoch": 0.2832369942196532, + "grad_norm": 0.5701552977234003, + "learning_rate": 9.946392690329036e-07, + "loss": 0.6187049746513367, + "step": 98, + "token_acc": 0.8010530865652874 + }, + { + "epoch": 0.2861271676300578, + "grad_norm": 0.5860362253461248, + "learning_rate": 9.944041007936244e-07, + "loss": 0.5410789847373962, + "step": 99, + "token_acc": 0.8207894360088595 + }, + { + "epoch": 0.28901734104046245, + "grad_norm": 0.6303808407906236, + "learning_rate": 9.941639134339606e-07, + "loss": 0.5768465399742126, + "step": 100, + "token_acc": 0.8087328873195813 + }, + { + "epoch": 0.29190751445086704, + "grad_norm": 0.616425173315349, + "learning_rate": 9.939187093922609e-07, + "loss": 0.6295806169509888, + "step": 101, + "token_acc": 0.7958193257384945 + }, + { + "epoch": 0.2947976878612717, + "grad_norm": 0.5753993917901922, + "learning_rate": 9.936684911578017e-07, + "loss": 0.5983704328536987, + "step": 102, + "token_acc": 0.8031383517086323 + }, + { + "epoch": 0.2976878612716763, + "grad_norm": 0.6140080800303133, + "learning_rate": 9.93413261270763e-07, + "loss": 0.5729444026947021, + "step": 103, + "token_acc": 0.816418031517547 + }, + { + "epoch": 0.30057803468208094, + "grad_norm": 0.5607455073068854, + "learning_rate": 9.931530223222026e-07, + "loss": 0.5967170596122742, + "step": 104, + "token_acc": 0.803475704051983 + }, + { + "epoch": 0.30346820809248554, + "grad_norm": 0.5675327028480304, + "learning_rate": 9.928877769540293e-07, + "loss": 0.6241474151611328, + "step": 105, + "token_acc": 0.7967706129971308 + }, + { + "epoch": 0.3063583815028902, + "grad_norm": 0.6046538978438704, + "learning_rate": 9.926175278589767e-07, + "loss": 0.6553393602371216, + "step": 106, + "token_acc": 0.7874527013411549 + }, + { + "epoch": 0.3092485549132948, + "grad_norm": 0.5734166676914433, + "learning_rate": 9.923422777805751e-07, + "loss": 0.6570492386817932, + "step": 107, + "token_acc": 0.7870601190355553 + }, + { + "epoch": 0.31213872832369943, + "grad_norm": 0.6001726322335739, + "learning_rate": 9.920620295131245e-07, + "loss": 0.6794227361679077, + "step": 108, + "token_acc": 0.7787853169709925 + }, + { + "epoch": 0.315028901734104, + "grad_norm": 0.6099760009068769, + "learning_rate": 9.917767859016654e-07, + "loss": 0.615708589553833, + "step": 109, + "token_acc": 0.7985643236886592 + }, + { + "epoch": 0.3179190751445087, + "grad_norm": 0.5778662206360861, + "learning_rate": 9.91486549841951e-07, + "loss": 0.5809392929077148, + "step": 110, + "token_acc": 0.8094654316503208 + }, + { + "epoch": 0.3208092485549133, + "grad_norm": 0.5704401870141648, + "learning_rate": 9.911913242804158e-07, + "loss": 0.6263046264648438, + "step": 111, + "token_acc": 0.7955055464485222 + }, + { + "epoch": 0.3236994219653179, + "grad_norm": 0.613652119648305, + "learning_rate": 9.908911122141486e-07, + "loss": 0.5810531377792358, + "step": 112, + "token_acc": 0.8122967000471536 + }, + { + "epoch": 0.3265895953757225, + "grad_norm": 0.5754148794590288, + "learning_rate": 9.905859166908594e-07, + "loss": 0.6450198888778687, + "step": 113, + "token_acc": 0.787714712471994 + }, + { + "epoch": 0.32947976878612717, + "grad_norm": 0.8102498152797749, + "learning_rate": 9.902757408088501e-07, + "loss": 0.6492223739624023, + "step": 114, + "token_acc": 0.7880358603802299 + }, + { + "epoch": 0.33236994219653176, + "grad_norm": 0.525946407195948, + "learning_rate": 9.899605877169824e-07, + "loss": 0.5984295606613159, + "step": 115, + "token_acc": 0.8024764689756009 + }, + { + "epoch": 0.3352601156069364, + "grad_norm": 0.5751169418426346, + "learning_rate": 9.896404606146455e-07, + "loss": 0.6295244097709656, + "step": 116, + "token_acc": 0.7922646493276646 + }, + { + "epoch": 0.33815028901734107, + "grad_norm": 0.5079153092397871, + "learning_rate": 9.893153627517248e-07, + "loss": 0.5976470112800598, + "step": 117, + "token_acc": 0.8038826857227929 + }, + { + "epoch": 0.34104046242774566, + "grad_norm": 0.5841459704013869, + "learning_rate": 9.889852974285672e-07, + "loss": 0.6472890973091125, + "step": 118, + "token_acc": 0.789158388689134 + }, + { + "epoch": 0.3439306358381503, + "grad_norm": 0.6150844233030651, + "learning_rate": 9.886502679959497e-07, + "loss": 0.5413444638252258, + "step": 119, + "token_acc": 0.8222654666342334 + }, + { + "epoch": 0.3468208092485549, + "grad_norm": 0.5935208615034318, + "learning_rate": 9.883102778550434e-07, + "loss": 0.663335919380188, + "step": 120, + "token_acc": 0.7862711064419373 + }, + { + "epoch": 0.34971098265895956, + "grad_norm": 0.6268736075123943, + "learning_rate": 9.879653304573797e-07, + "loss": 0.6072404384613037, + "step": 121, + "token_acc": 0.8010549723328334 + }, + { + "epoch": 0.35260115606936415, + "grad_norm": 0.5583642618257684, + "learning_rate": 9.876154293048163e-07, + "loss": 0.6144070029258728, + "step": 122, + "token_acc": 0.796381277924315 + }, + { + "epoch": 0.3554913294797688, + "grad_norm": 0.5410450297039057, + "learning_rate": 9.872605779494997e-07, + "loss": 0.5954463481903076, + "step": 123, + "token_acc": 0.8055216585201416 + }, + { + "epoch": 0.3583815028901734, + "grad_norm": 0.6425891449290073, + "learning_rate": 9.869007799938305e-07, + "loss": 0.6611199378967285, + "step": 124, + "token_acc": 0.786190934231093 + }, + { + "epoch": 0.36127167630057805, + "grad_norm": 0.5146021782369569, + "learning_rate": 9.865360390904269e-07, + "loss": 0.6081857085227966, + "step": 125, + "token_acc": 0.8017568952922327 + }, + { + "epoch": 0.36416184971098264, + "grad_norm": 0.5766433781688939, + "learning_rate": 9.86166358942087e-07, + "loss": 0.609286904335022, + "step": 126, + "token_acc": 0.8002619382070126 + }, + { + "epoch": 0.3670520231213873, + "grad_norm": 0.5450128204125277, + "learning_rate": 9.857917433017508e-07, + "loss": 0.5991868376731873, + "step": 127, + "token_acc": 0.8008499444919779 + }, + { + "epoch": 0.3699421965317919, + "grad_norm": 0.5810734133360594, + "learning_rate": 9.854121959724635e-07, + "loss": 0.607757568359375, + "step": 128, + "token_acc": 0.7998384333607254 + }, + { + "epoch": 0.37283236994219654, + "grad_norm": 0.5770182474218292, + "learning_rate": 9.85027720807336e-07, + "loss": 0.5918303728103638, + "step": 129, + "token_acc": 0.8040288846142103 + }, + { + "epoch": 0.37572254335260113, + "grad_norm": 0.5360179518405197, + "learning_rate": 9.846383217095051e-07, + "loss": 0.646679162979126, + "step": 130, + "token_acc": 0.7929178624953734 + }, + { + "epoch": 0.3786127167630058, + "grad_norm": 0.5278251178995469, + "learning_rate": 9.842440026320958e-07, + "loss": 0.6081724166870117, + "step": 131, + "token_acc": 0.7979095393804223 + }, + { + "epoch": 0.3815028901734104, + "grad_norm": 0.5857831669587502, + "learning_rate": 9.838447675781793e-07, + "loss": 0.5776185989379883, + "step": 132, + "token_acc": 0.8089180214756997 + }, + { + "epoch": 0.38439306358381503, + "grad_norm": 0.49786698791997097, + "learning_rate": 9.834406206007335e-07, + "loss": 0.6665687561035156, + "step": 133, + "token_acc": 0.7817376207568673 + }, + { + "epoch": 0.3872832369942196, + "grad_norm": 0.5272403389699103, + "learning_rate": 9.83031565802601e-07, + "loss": 0.607385516166687, + "step": 134, + "token_acc": 0.8027202321406094 + }, + { + "epoch": 0.3901734104046243, + "grad_norm": 0.5881996711071641, + "learning_rate": 9.826176073364482e-07, + "loss": 0.6304242014884949, + "step": 135, + "token_acc": 0.7967265117890893 + }, + { + "epoch": 0.3930635838150289, + "grad_norm": 0.5540108888142588, + "learning_rate": 9.821987494047228e-07, + "loss": 0.6314468383789062, + "step": 136, + "token_acc": 0.7919692387557874 + }, + { + "epoch": 0.3959537572254335, + "grad_norm": 0.5722154073047628, + "learning_rate": 9.817749962596114e-07, + "loss": 0.602054238319397, + "step": 137, + "token_acc": 0.802066245506265 + }, + { + "epoch": 0.3988439306358382, + "grad_norm": 0.5596376441219622, + "learning_rate": 9.813463522029957e-07, + "loss": 0.640647292137146, + "step": 138, + "token_acc": 0.7918518615352437 + }, + { + "epoch": 0.40173410404624277, + "grad_norm": 0.5545182797573466, + "learning_rate": 9.809128215864096e-07, + "loss": 0.6066859364509583, + "step": 139, + "token_acc": 0.801196721208976 + }, + { + "epoch": 0.4046242774566474, + "grad_norm": 0.5784484895204948, + "learning_rate": 9.804744088109941e-07, + "loss": 0.5408949851989746, + "step": 140, + "token_acc": 0.8248328121430766 + }, + { + "epoch": 0.407514450867052, + "grad_norm": 0.5637555298781167, + "learning_rate": 9.80031118327454e-07, + "loss": 0.6107698678970337, + "step": 141, + "token_acc": 0.7982127620772081 + }, + { + "epoch": 0.41040462427745666, + "grad_norm": 0.603110232763829, + "learning_rate": 9.795829546360113e-07, + "loss": 0.5912826061248779, + "step": 142, + "token_acc": 0.8041540066906055 + }, + { + "epoch": 0.41329479768786126, + "grad_norm": 0.5873555056914542, + "learning_rate": 9.791299222863602e-07, + "loss": 0.6161830425262451, + "step": 143, + "token_acc": 0.799708864508567 + }, + { + "epoch": 0.4161849710982659, + "grad_norm": 0.6843944560990027, + "learning_rate": 9.786720258776213e-07, + "loss": 0.5474255681037903, + "step": 144, + "token_acc": 0.8186930860033726 + }, + { + "epoch": 0.4190751445086705, + "grad_norm": 0.51545250769897, + "learning_rate": 9.782092700582936e-07, + "loss": 0.6216602325439453, + "step": 145, + "token_acc": 0.7965911940150556 + }, + { + "epoch": 0.42196531791907516, + "grad_norm": 0.5937549088482647, + "learning_rate": 9.77741659526209e-07, + "loss": 0.6248494386672974, + "step": 146, + "token_acc": 0.7956684720442111 + }, + { + "epoch": 0.42485549132947975, + "grad_norm": 0.5399979093459059, + "learning_rate": 9.77269199028483e-07, + "loss": 0.6089432239532471, + "step": 147, + "token_acc": 0.796826403459652 + }, + { + "epoch": 0.4277456647398844, + "grad_norm": 0.5564248028198713, + "learning_rate": 9.76791893361468e-07, + "loss": 0.6312023401260376, + "step": 148, + "token_acc": 0.7918012705466769 + }, + { + "epoch": 0.430635838150289, + "grad_norm": 0.559936805840691, + "learning_rate": 9.763097473707035e-07, + "loss": 0.619454026222229, + "step": 149, + "token_acc": 0.7984878886834271 + }, + { + "epoch": 0.43352601156069365, + "grad_norm": 0.6044059322614584, + "learning_rate": 9.758227659508668e-07, + "loss": 0.5221510529518127, + "step": 150, + "token_acc": 0.8266117865021535 + }, + { + "epoch": 0.43641618497109824, + "grad_norm": 0.5692770162596946, + "learning_rate": 9.753309540457248e-07, + "loss": 0.6139217615127563, + "step": 151, + "token_acc": 0.7982664696096701 + }, + { + "epoch": 0.4393063583815029, + "grad_norm": 0.5330985388783729, + "learning_rate": 9.748343166480822e-07, + "loss": 0.6154735088348389, + "step": 152, + "token_acc": 0.7984871546515382 + }, + { + "epoch": 0.4421965317919075, + "grad_norm": 0.6065632918781179, + "learning_rate": 9.743328587997314e-07, + "loss": 0.5449005365371704, + "step": 153, + "token_acc": 0.8221805561096261 + }, + { + "epoch": 0.44508670520231214, + "grad_norm": 0.6274255114547471, + "learning_rate": 9.738265855914012e-07, + "loss": 0.6112866401672363, + "step": 154, + "token_acc": 0.7997394616484714 + }, + { + "epoch": 0.4479768786127168, + "grad_norm": 0.6000527996102515, + "learning_rate": 9.733155021627057e-07, + "loss": 0.6302502155303955, + "step": 155, + "token_acc": 0.7939255615270142 + }, + { + "epoch": 0.4508670520231214, + "grad_norm": 0.5716424963426585, + "learning_rate": 9.727996137020916e-07, + "loss": 0.5589959621429443, + "step": 156, + "token_acc": 0.8167590708119868 + }, + { + "epoch": 0.45375722543352603, + "grad_norm": 0.5793130145184638, + "learning_rate": 9.722789254467854e-07, + "loss": 0.5811511874198914, + "step": 157, + "token_acc": 0.8068220017796527 + }, + { + "epoch": 0.45664739884393063, + "grad_norm": 0.6447386736666927, + "learning_rate": 9.717534426827404e-07, + "loss": 0.6125731468200684, + "step": 158, + "token_acc": 0.7982601354147698 + }, + { + "epoch": 0.4595375722543353, + "grad_norm": 0.5583551050757221, + "learning_rate": 9.712231707445831e-07, + "loss": 0.5681207180023193, + "step": 159, + "token_acc": 0.812138891502776 + }, + { + "epoch": 0.4624277456647399, + "grad_norm": 0.6227411154474924, + "learning_rate": 9.70688115015559e-07, + "loss": 0.5606650114059448, + "step": 160, + "token_acc": 0.8128119485280195 + }, + { + "epoch": 0.4653179190751445, + "grad_norm": 0.5637826519102942, + "learning_rate": 9.701482809274787e-07, + "loss": 0.584591269493103, + "step": 161, + "token_acc": 0.809975090499813 + }, + { + "epoch": 0.4682080924855491, + "grad_norm": 0.5527836562945804, + "learning_rate": 9.696036739606606e-07, + "loss": 0.6178029775619507, + "step": 162, + "token_acc": 0.7982424352237725 + }, + { + "epoch": 0.47109826589595377, + "grad_norm": 0.5261451706415371, + "learning_rate": 9.690542996438777e-07, + "loss": 0.5772680640220642, + "step": 163, + "token_acc": 0.8055154702213526 + }, + { + "epoch": 0.47398843930635837, + "grad_norm": 0.598598068991984, + "learning_rate": 9.685001635543005e-07, + "loss": 0.5761500597000122, + "step": 164, + "token_acc": 0.8095295422689632 + }, + { + "epoch": 0.476878612716763, + "grad_norm": 0.5603114991623558, + "learning_rate": 9.679412713174398e-07, + "loss": 0.6070771217346191, + "step": 165, + "token_acc": 0.7988323213451658 + }, + { + "epoch": 0.4797687861271676, + "grad_norm": 0.5909619017551228, + "learning_rate": 9.673776286070905e-07, + "loss": 0.5829952955245972, + "step": 166, + "token_acc": 0.8056856359399237 + }, + { + "epoch": 0.48265895953757226, + "grad_norm": 0.7664205949048083, + "learning_rate": 9.668092411452735e-07, + "loss": 0.591526985168457, + "step": 167, + "token_acc": 0.805959940764539 + }, + { + "epoch": 0.48554913294797686, + "grad_norm": 0.5816382553386844, + "learning_rate": 9.66236114702178e-07, + "loss": 0.6718764901161194, + "step": 168, + "token_acc": 0.7819054715177417 + }, + { + "epoch": 0.4884393063583815, + "grad_norm": 0.5443192837285905, + "learning_rate": 9.656582550961018e-07, + "loss": 0.5771794319152832, + "step": 169, + "token_acc": 0.8120637180483624 + }, + { + "epoch": 0.4913294797687861, + "grad_norm": 0.5439506241087468, + "learning_rate": 9.650756681933947e-07, + "loss": 0.5797525644302368, + "step": 170, + "token_acc": 0.8072481275670452 + }, + { + "epoch": 0.49421965317919075, + "grad_norm": 0.5750701292908912, + "learning_rate": 9.644883599083957e-07, + "loss": 0.616324782371521, + "step": 171, + "token_acc": 0.7961593487416124 + }, + { + "epoch": 0.49710982658959535, + "grad_norm": 0.5292422990653295, + "learning_rate": 9.638963362033756e-07, + "loss": 0.6252388954162598, + "step": 172, + "token_acc": 0.7945571248522018 + }, + { + "epoch": 0.5, + "grad_norm": 0.519900156438812, + "learning_rate": 9.632996030884748e-07, + "loss": 0.6072378158569336, + "step": 173, + "token_acc": 0.7983872825711323 + }, + { + "epoch": 0.5028901734104047, + "grad_norm": 2.014285868322542, + "learning_rate": 9.626981666216439e-07, + "loss": 0.5167373418807983, + "step": 174, + "token_acc": 0.8304752994472689 + }, + { + "epoch": 0.5057803468208093, + "grad_norm": 0.6229356072638176, + "learning_rate": 9.620920329085802e-07, + "loss": 0.5613738894462585, + "step": 175, + "token_acc": 0.8164609282841512 + }, + { + "epoch": 0.5086705202312138, + "grad_norm": 0.6427491754173409, + "learning_rate": 9.614812081026678e-07, + "loss": 0.6089553236961365, + "step": 176, + "token_acc": 0.8013446815125724 + }, + { + "epoch": 0.5115606936416185, + "grad_norm": 0.4795382180524186, + "learning_rate": 9.608656984049132e-07, + "loss": 0.579177737236023, + "step": 177, + "token_acc": 0.806047379906923 + }, + { + "epoch": 0.5144508670520231, + "grad_norm": 0.5089663171794683, + "learning_rate": 9.602455100638835e-07, + "loss": 0.5813893675804138, + "step": 178, + "token_acc": 0.8087914556082915 + }, + { + "epoch": 0.5173410404624278, + "grad_norm": 0.6116010486180593, + "learning_rate": 9.596206493756432e-07, + "loss": 0.5549554824829102, + "step": 179, + "token_acc": 0.8173080502386111 + }, + { + "epoch": 0.5202312138728323, + "grad_norm": 0.4852226717563288, + "learning_rate": 9.589911226836895e-07, + "loss": 0.5808215737342834, + "step": 180, + "token_acc": 0.8052112098427888 + }, + { + "epoch": 0.523121387283237, + "grad_norm": 0.5270020853161572, + "learning_rate": 9.583569363788879e-07, + "loss": 0.6398844122886658, + "step": 181, + "token_acc": 0.7898708976833977 + }, + { + "epoch": 0.5260115606936416, + "grad_norm": 0.5073350335042175, + "learning_rate": 9.577180968994081e-07, + "loss": 0.6154753565788269, + "step": 182, + "token_acc": 0.7993068610377478 + }, + { + "epoch": 0.5289017341040463, + "grad_norm": 0.5631567506627345, + "learning_rate": 9.57074610730658e-07, + "loss": 0.5920361876487732, + "step": 183, + "token_acc": 0.8048126355828951 + }, + { + "epoch": 0.5317919075144508, + "grad_norm": 0.4995115799741094, + "learning_rate": 9.56426484405218e-07, + "loss": 0.5912809371948242, + "step": 184, + "token_acc": 0.8075411124942672 + }, + { + "epoch": 0.5346820809248555, + "grad_norm": 0.560250197890468, + "learning_rate": 9.557737245027746e-07, + "loss": 0.6125437021255493, + "step": 185, + "token_acc": 0.7972027972027972 + }, + { + "epoch": 0.5375722543352601, + "grad_norm": 0.5819218618969146, + "learning_rate": 9.551163376500542e-07, + "loss": 0.5732159614562988, + "step": 186, + "token_acc": 0.8115202124085258 + }, + { + "epoch": 0.5404624277456648, + "grad_norm": 0.6129732835255256, + "learning_rate": 9.544543305207546e-07, + "loss": 0.6079097986221313, + "step": 187, + "token_acc": 0.7997229197333102 + }, + { + "epoch": 0.5433526011560693, + "grad_norm": 0.5263001528585832, + "learning_rate": 9.537877098354784e-07, + "loss": 0.5925722718238831, + "step": 188, + "token_acc": 0.8029342210305924 + }, + { + "epoch": 0.546242774566474, + "grad_norm": 0.583594997315983, + "learning_rate": 9.531164823616646e-07, + "loss": 0.5865395069122314, + "step": 189, + "token_acc": 0.8063752604903651 + }, + { + "epoch": 0.5491329479768786, + "grad_norm": 0.5781895560822031, + "learning_rate": 9.524406549135193e-07, + "loss": 0.6117700338363647, + "step": 190, + "token_acc": 0.7980149336253496 + }, + { + "epoch": 0.5520231213872833, + "grad_norm": 0.4893230139872087, + "learning_rate": 9.517602343519471e-07, + "loss": 0.5652576684951782, + "step": 191, + "token_acc": 0.8107140229095636 + }, + { + "epoch": 0.5549132947976878, + "grad_norm": 0.5760419810427979, + "learning_rate": 9.510752275844809e-07, + "loss": 0.579891562461853, + "step": 192, + "token_acc": 0.805735200834105 + }, + { + "epoch": 0.5578034682080925, + "grad_norm": 0.5102671355626198, + "learning_rate": 9.503856415652125e-07, + "loss": 0.5964775681495667, + "step": 193, + "token_acc": 0.8034283288223744 + }, + { + "epoch": 0.5606936416184971, + "grad_norm": 0.4894002019430091, + "learning_rate": 9.496914832947214e-07, + "loss": 0.6064220666885376, + "step": 194, + "token_acc": 0.799232275930387 + }, + { + "epoch": 0.5635838150289018, + "grad_norm": 0.5939844831348525, + "learning_rate": 9.489927598200043e-07, + "loss": 0.6116449236869812, + "step": 195, + "token_acc": 0.797429447731885 + }, + { + "epoch": 0.5664739884393064, + "grad_norm": 0.4783949579372596, + "learning_rate": 9.482894782344024e-07, + "loss": 0.6082786321640015, + "step": 196, + "token_acc": 0.796939850416096 + }, + { + "epoch": 0.569364161849711, + "grad_norm": 0.5532830089434996, + "learning_rate": 9.475816456775312e-07, + "loss": 0.5998172760009766, + "step": 197, + "token_acc": 0.8034065270191963 + }, + { + "epoch": 0.5722543352601156, + "grad_norm": 0.5660410481873773, + "learning_rate": 9.468692693352062e-07, + "loss": 0.5715000629425049, + "step": 198, + "token_acc": 0.8105325892615268 + }, + { + "epoch": 0.5751445086705202, + "grad_norm": 0.5454360730485784, + "learning_rate": 9.461523564393714e-07, + "loss": 0.5121803283691406, + "step": 199, + "token_acc": 0.8285392705145792 + }, + { + "epoch": 0.5780346820809249, + "grad_norm": 0.5378535866046305, + "learning_rate": 9.454309142680246e-07, + "loss": 0.5945334434509277, + "step": 200, + "token_acc": 0.8058855053489177 + }, + { + "epoch": 0.5809248554913294, + "grad_norm": 0.569376306217556, + "learning_rate": 9.447049501451447e-07, + "loss": 0.5850614905357361, + "step": 201, + "token_acc": 0.8075420015918657 + }, + { + "epoch": 0.5838150289017341, + "grad_norm": 0.5596293780541032, + "learning_rate": 9.439744714406166e-07, + "loss": 0.5594047904014587, + "step": 202, + "token_acc": 0.8121667287250859 + }, + { + "epoch": 0.5867052023121387, + "grad_norm": 0.5138636330605458, + "learning_rate": 9.432394855701568e-07, + "loss": 0.5849941372871399, + "step": 203, + "token_acc": 0.8073615179939259 + }, + { + "epoch": 0.5895953757225434, + "grad_norm": 0.5804821715876541, + "learning_rate": 9.424999999952374e-07, + "loss": 0.5801274180412292, + "step": 204, + "token_acc": 0.8069783212978903 + }, + { + "epoch": 0.5924855491329479, + "grad_norm": 0.5724417549737069, + "learning_rate": 9.417560222230114e-07, + "loss": 0.549828827381134, + "step": 205, + "token_acc": 0.8177920383625401 + }, + { + "epoch": 0.5953757225433526, + "grad_norm": 0.5635873362301451, + "learning_rate": 9.410075598062357e-07, + "loss": 0.6004040241241455, + "step": 206, + "token_acc": 0.8004078427231751 + }, + { + "epoch": 0.5982658959537572, + "grad_norm": 0.5235901257461258, + "learning_rate": 9.402546203431947e-07, + "loss": 0.5270985960960388, + "step": 207, + "token_acc": 0.8231543624161074 + }, + { + "epoch": 0.6011560693641619, + "grad_norm": 0.5532559810628388, + "learning_rate": 9.394972114776229e-07, + "loss": 0.574277937412262, + "step": 208, + "token_acc": 0.8074010315538029 + }, + { + "epoch": 0.6040462427745664, + "grad_norm": 0.5812311718782175, + "learning_rate": 9.387353408986282e-07, + "loss": 0.595463216304779, + "step": 209, + "token_acc": 0.8024861291665605 + }, + { + "epoch": 0.6069364161849711, + "grad_norm": 0.5142938651985898, + "learning_rate": 9.379690163406128e-07, + "loss": 0.5852739214897156, + "step": 210, + "token_acc": 0.8058286827885552 + }, + { + "epoch": 0.6098265895953757, + "grad_norm": 0.5954842210532877, + "learning_rate": 9.371982455831946e-07, + "loss": 0.5914256572723389, + "step": 211, + "token_acc": 0.8022748583309552 + }, + { + "epoch": 0.6127167630057804, + "grad_norm": 0.5993748062356747, + "learning_rate": 9.364230364511295e-07, + "loss": 0.5815471410751343, + "step": 212, + "token_acc": 0.8078214734227942 + }, + { + "epoch": 0.615606936416185, + "grad_norm": 0.5946619701512068, + "learning_rate": 9.356433968142305e-07, + "loss": 0.5513661503791809, + "step": 213, + "token_acc": 0.8162251537633719 + }, + { + "epoch": 0.6184971098265896, + "grad_norm": 0.6203774782127278, + "learning_rate": 9.34859334587289e-07, + "loss": 0.5972813367843628, + "step": 214, + "token_acc": 0.8014712230836974 + }, + { + "epoch": 0.6213872832369942, + "grad_norm": 0.551145459721042, + "learning_rate": 9.340708577299936e-07, + "loss": 0.6008709669113159, + "step": 215, + "token_acc": 0.8010602678571429 + }, + { + "epoch": 0.6242774566473989, + "grad_norm": 0.5965436915708601, + "learning_rate": 9.332779742468495e-07, + "loss": 0.6075496673583984, + "step": 216, + "token_acc": 0.7974854091642866 + }, + { + "epoch": 0.6271676300578035, + "grad_norm": 0.5460165665763135, + "learning_rate": 9.324806921870975e-07, + "loss": 0.5693843364715576, + "step": 217, + "token_acc": 0.8103969870963759 + }, + { + "epoch": 0.630057803468208, + "grad_norm": 0.5966690969554563, + "learning_rate": 9.316790196446323e-07, + "loss": 0.5560802221298218, + "step": 218, + "token_acc": 0.8236988940183998 + }, + { + "epoch": 0.6329479768786127, + "grad_norm": 0.6560441235449157, + "learning_rate": 9.308729647579199e-07, + "loss": 0.5824184417724609, + "step": 219, + "token_acc": 0.8070714583452526 + }, + { + "epoch": 0.6358381502890174, + "grad_norm": 0.6006127755099283, + "learning_rate": 9.30062535709915e-07, + "loss": 0.6167861819267273, + "step": 220, + "token_acc": 0.796514221545372 + }, + { + "epoch": 0.638728323699422, + "grad_norm": 0.5570520813344141, + "learning_rate": 9.292477407279789e-07, + "loss": 0.6107242703437805, + "step": 221, + "token_acc": 0.7990834404515732 + }, + { + "epoch": 0.6416184971098265, + "grad_norm": 0.5419716560460497, + "learning_rate": 9.284285880837946e-07, + "loss": 0.5959486365318298, + "step": 222, + "token_acc": 0.8022954328356064 + }, + { + "epoch": 0.6445086705202312, + "grad_norm": 0.6657313771062484, + "learning_rate": 9.276050860932837e-07, + "loss": 0.5727354884147644, + "step": 223, + "token_acc": 0.8082750530162884 + }, + { + "epoch": 0.6473988439306358, + "grad_norm": 0.512607896262416, + "learning_rate": 9.267772431165218e-07, + "loss": 0.5810614228248596, + "step": 224, + "token_acc": 0.8100355584987692 + }, + { + "epoch": 0.6502890173410405, + "grad_norm": 0.5208342958049974, + "learning_rate": 9.259450675576535e-07, + "loss": 0.5924381017684937, + "step": 225, + "token_acc": 0.8029396939581946 + }, + { + "epoch": 0.653179190751445, + "grad_norm": 0.6880250488481687, + "learning_rate": 9.251085678648071e-07, + "loss": 0.6493653059005737, + "step": 226, + "token_acc": 0.7886282137800538 + }, + { + "epoch": 0.6560693641618497, + "grad_norm": 0.548308907840708, + "learning_rate": 9.242677525300088e-07, + "loss": 0.570950448513031, + "step": 227, + "token_acc": 0.810275809890639 + }, + { + "epoch": 0.6589595375722543, + "grad_norm": 0.5340467208226745, + "learning_rate": 9.234226300890972e-07, + "loss": 0.565179169178009, + "step": 228, + "token_acc": 0.8106098958194559 + }, + { + "epoch": 0.661849710982659, + "grad_norm": 0.5609587429682379, + "learning_rate": 9.225732091216354e-07, + "loss": 0.6229733824729919, + "step": 229, + "token_acc": 0.7947594792619757 + }, + { + "epoch": 0.6647398843930635, + "grad_norm": 0.640345970021987, + "learning_rate": 9.217194982508247e-07, + "loss": 0.556702196598053, + "step": 230, + "token_acc": 0.8141483516483516 + }, + { + "epoch": 0.6676300578034682, + "grad_norm": 0.551511374308891, + "learning_rate": 9.208615061434166e-07, + "loss": 0.6125736236572266, + "step": 231, + "token_acc": 0.7977603246777648 + }, + { + "epoch": 0.6705202312138728, + "grad_norm": 0.5163364555056573, + "learning_rate": 9.199992415096259e-07, + "loss": 0.5473246574401855, + "step": 232, + "token_acc": 0.8160722450845908 + }, + { + "epoch": 0.6734104046242775, + "grad_norm": 0.5669711665664704, + "learning_rate": 9.191327131030406e-07, + "loss": 0.543914794921875, + "step": 233, + "token_acc": 0.8196051836235239 + }, + { + "epoch": 0.6763005780346821, + "grad_norm": 0.5406802703932962, + "learning_rate": 9.182619297205347e-07, + "loss": 0.5660564303398132, + "step": 234, + "token_acc": 0.8103913761289696 + }, + { + "epoch": 0.6791907514450867, + "grad_norm": 0.556661118525528, + "learning_rate": 9.173869002021775e-07, + "loss": 0.6406779289245605, + "step": 235, + "token_acc": 0.7926350563544501 + }, + { + "epoch": 0.6820809248554913, + "grad_norm": 0.5201140983806046, + "learning_rate": 9.165076334311445e-07, + "loss": 0.6177135109901428, + "step": 236, + "token_acc": 0.7982128177119112 + }, + { + "epoch": 0.684971098265896, + "grad_norm": 0.5850116831250167, + "learning_rate": 9.156241383336278e-07, + "loss": 0.5401256680488586, + "step": 237, + "token_acc": 0.8215590591627244 + }, + { + "epoch": 0.6878612716763006, + "grad_norm": 0.6403194474900529, + "learning_rate": 9.147364238787443e-07, + "loss": 0.581301212310791, + "step": 238, + "token_acc": 0.8056872398548133 + }, + { + "epoch": 0.6907514450867052, + "grad_norm": 0.5674551611529516, + "learning_rate": 9.138444990784453e-07, + "loss": 0.6117105484008789, + "step": 239, + "token_acc": 0.7969433519630166 + }, + { + "epoch": 0.6936416184971098, + "grad_norm": 0.5668476584273359, + "learning_rate": 9.12948372987425e-07, + "loss": 0.6042872071266174, + "step": 240, + "token_acc": 0.8012008915710148 + }, + { + "epoch": 0.6965317919075145, + "grad_norm": 0.5372423597194518, + "learning_rate": 9.120480547030285e-07, + "loss": 0.5781703591346741, + "step": 241, + "token_acc": 0.8076352705410822 + }, + { + "epoch": 0.6994219653179191, + "grad_norm": 0.582884431687299, + "learning_rate": 9.111435533651595e-07, + "loss": 0.594234824180603, + "step": 242, + "token_acc": 0.8027408303103587 + }, + { + "epoch": 0.7023121387283237, + "grad_norm": 0.5468197379764062, + "learning_rate": 9.102348781561875e-07, + "loss": 0.537114143371582, + "step": 243, + "token_acc": 0.8224276312689462 + }, + { + "epoch": 0.7052023121387283, + "grad_norm": 0.5799094186562964, + "learning_rate": 9.093220383008544e-07, + "loss": 0.5844765901565552, + "step": 244, + "token_acc": 0.8037892679887568 + }, + { + "epoch": 0.708092485549133, + "grad_norm": 0.5735743433347377, + "learning_rate": 9.084050430661813e-07, + "loss": 0.6163278818130493, + "step": 245, + "token_acc": 0.7963933546643635 + }, + { + "epoch": 0.7109826589595376, + "grad_norm": 0.5675339701772788, + "learning_rate": 9.074839017613736e-07, + "loss": 0.5186026692390442, + "step": 246, + "token_acc": 0.8264138256627419 + }, + { + "epoch": 0.7138728323699421, + "grad_norm": 0.5682213760378196, + "learning_rate": 9.065586237377274e-07, + "loss": 0.5759379267692566, + "step": 247, + "token_acc": 0.8082834141978154 + }, + { + "epoch": 0.7167630057803468, + "grad_norm": 0.5222160620275426, + "learning_rate": 9.056292183885341e-07, + "loss": 0.5911962985992432, + "step": 248, + "token_acc": 0.803399969606123 + }, + { + "epoch": 0.7196531791907514, + "grad_norm": 0.5098026312902073, + "learning_rate": 9.046956951489852e-07, + "loss": 0.5775253772735596, + "step": 249, + "token_acc": 0.8074704886249294 + }, + { + "epoch": 0.7225433526011561, + "grad_norm": 0.524303335092293, + "learning_rate": 9.037580634960763e-07, + "loss": 0.5572794675827026, + "step": 250, + "token_acc": 0.8146691719232317 + }, + { + "epoch": 0.7254335260115607, + "grad_norm": 0.6033497475819745, + "learning_rate": 9.028163329485112e-07, + "loss": 0.5832095742225647, + "step": 251, + "token_acc": 0.8073202656110331 + }, + { + "epoch": 0.7283236994219653, + "grad_norm": 0.5556496694710653, + "learning_rate": 9.018705130666049e-07, + "loss": 0.5459315776824951, + "step": 252, + "token_acc": 0.8191452178897479 + }, + { + "epoch": 0.7312138728323699, + "grad_norm": 0.7747218495040153, + "learning_rate": 9.009206134521868e-07, + "loss": 0.5795873999595642, + "step": 253, + "token_acc": 0.8071730383987341 + }, + { + "epoch": 0.7341040462427746, + "grad_norm": 0.5652371374587928, + "learning_rate": 8.999666437485034e-07, + "loss": 0.5758365392684937, + "step": 254, + "token_acc": 0.811742473608758 + }, + { + "epoch": 0.7369942196531792, + "grad_norm": 0.5206182140440342, + "learning_rate": 8.990086136401198e-07, + "loss": 0.5303860306739807, + "step": 255, + "token_acc": 0.823020148188528 + }, + { + "epoch": 0.7398843930635838, + "grad_norm": 0.6450852115537637, + "learning_rate": 8.980465328528218e-07, + "loss": 0.5547192096710205, + "step": 256, + "token_acc": 0.8162106882834197 + }, + { + "epoch": 0.7427745664739884, + "grad_norm": 0.5196181500327283, + "learning_rate": 8.970804111535175e-07, + "loss": 0.5457019209861755, + "step": 257, + "token_acc": 0.8167301624082492 + }, + { + "epoch": 0.7456647398843931, + "grad_norm": 0.6356725122188899, + "learning_rate": 8.961102583501375e-07, + "loss": 0.5676227807998657, + "step": 258, + "token_acc": 0.8146457172245137 + }, + { + "epoch": 0.7485549132947977, + "grad_norm": 0.5766749980898508, + "learning_rate": 8.951360842915355e-07, + "loss": 0.5487492084503174, + "step": 259, + "token_acc": 0.8176302961517421 + }, + { + "epoch": 0.7514450867052023, + "grad_norm": 0.561193367543964, + "learning_rate": 8.941578988673885e-07, + "loss": 0.5508721470832825, + "step": 260, + "token_acc": 0.8148807459638577 + }, + { + "epoch": 0.7543352601156069, + "grad_norm": 1.1616614497713094, + "learning_rate": 8.931757120080965e-07, + "loss": 0.5649725794792175, + "step": 261, + "token_acc": 0.8123450235984954 + }, + { + "epoch": 0.7572254335260116, + "grad_norm": 0.6269083895254, + "learning_rate": 8.921895336846812e-07, + "loss": 0.5234044790267944, + "step": 262, + "token_acc": 0.826336871809926 + }, + { + "epoch": 0.7601156069364162, + "grad_norm": 0.5491932745407809, + "learning_rate": 8.911993739086852e-07, + "loss": 0.5335085391998291, + "step": 263, + "token_acc": 0.8243787856172078 + }, + { + "epoch": 0.7630057803468208, + "grad_norm": 0.6001894076535953, + "learning_rate": 8.902052427320703e-07, + "loss": 0.6009457111358643, + "step": 264, + "token_acc": 0.8005332320797702 + }, + { + "epoch": 0.7658959537572254, + "grad_norm": 0.6105633418239023, + "learning_rate": 8.892071502471154e-07, + "loss": 0.512947678565979, + "step": 265, + "token_acc": 0.8283333333333334 + }, + { + "epoch": 0.7687861271676301, + "grad_norm": 0.530310690982596, + "learning_rate": 8.882051065863139e-07, + "loss": 0.5578915476799011, + "step": 266, + "token_acc": 0.8134685584406639 + }, + { + "epoch": 0.7716763005780347, + "grad_norm": 0.6053842724913201, + "learning_rate": 8.871991219222712e-07, + "loss": 0.5307576656341553, + "step": 267, + "token_acc": 0.8237498632235475 + }, + { + "epoch": 0.7745664739884393, + "grad_norm": 0.5839374903786066, + "learning_rate": 8.861892064676008e-07, + "loss": 0.4724132716655731, + "step": 268, + "token_acc": 0.8406308417366578 + }, + { + "epoch": 0.7774566473988439, + "grad_norm": 0.5382380436884167, + "learning_rate": 8.851753704748219e-07, + "loss": 0.5864905118942261, + "step": 269, + "token_acc": 0.805320596148614 + }, + { + "epoch": 0.7803468208092486, + "grad_norm": 0.536612826265518, + "learning_rate": 8.841576242362533e-07, + "loss": 0.5369473695755005, + "step": 270, + "token_acc": 0.8202307927330842 + }, + { + "epoch": 0.7832369942196532, + "grad_norm": 0.48433135594375987, + "learning_rate": 8.831359780839107e-07, + "loss": 0.5745148062705994, + "step": 271, + "token_acc": 0.8114247865236928 + }, + { + "epoch": 0.7861271676300579, + "grad_norm": 0.565668286608129, + "learning_rate": 8.821104423894014e-07, + "loss": 0.5306930541992188, + "step": 272, + "token_acc": 0.8240810142731839 + }, + { + "epoch": 0.7890173410404624, + "grad_norm": 0.5347471169063638, + "learning_rate": 8.810810275638182e-07, + "loss": 0.5508551597595215, + "step": 273, + "token_acc": 0.8150747430289043 + }, + { + "epoch": 0.791907514450867, + "grad_norm": 0.5872611855148089, + "learning_rate": 8.800477440576346e-07, + "loss": 0.5582222938537598, + "step": 274, + "token_acc": 0.8141057178356111 + }, + { + "epoch": 0.7947976878612717, + "grad_norm": 0.5930933510081743, + "learning_rate": 8.790106023605985e-07, + "loss": 0.5265220403671265, + "step": 275, + "token_acc": 0.8236343698306786 + }, + { + "epoch": 0.7976878612716763, + "grad_norm": 0.5326943859900286, + "learning_rate": 8.779696130016252e-07, + "loss": 0.589282751083374, + "step": 276, + "token_acc": 0.8041843462366995 + }, + { + "epoch": 0.8005780346820809, + "grad_norm": 0.682574668475925, + "learning_rate": 8.769247865486915e-07, + "loss": 0.5634682178497314, + "step": 277, + "token_acc": 0.8131609072741031 + }, + { + "epoch": 0.8034682080924855, + "grad_norm": 0.6170926445265313, + "learning_rate": 8.758761336087273e-07, + "loss": 0.5282115340232849, + "step": 278, + "token_acc": 0.8240009668063165 + }, + { + "epoch": 0.8063583815028902, + "grad_norm": 0.5931538447313858, + "learning_rate": 8.748236648275087e-07, + "loss": 0.4907287061214447, + "step": 279, + "token_acc": 0.838809946714032 + }, + { + "epoch": 0.8092485549132948, + "grad_norm": 0.567206538957563, + "learning_rate": 8.737673908895497e-07, + "loss": 0.6097589731216431, + "step": 280, + "token_acc": 0.7990020422972478 + }, + { + "epoch": 0.8121387283236994, + "grad_norm": 0.5887119791348107, + "learning_rate": 8.727073225179937e-07, + "loss": 0.5625665187835693, + "step": 281, + "token_acc": 0.8113687537033379 + }, + { + "epoch": 0.815028901734104, + "grad_norm": 0.5836331757411469, + "learning_rate": 8.716434704745046e-07, + "loss": 0.513110339641571, + "step": 282, + "token_acc": 0.8275925912738822 + }, + { + "epoch": 0.8179190751445087, + "grad_norm": 0.6054924912257345, + "learning_rate": 8.705758455591576e-07, + "loss": 0.602730393409729, + "step": 283, + "token_acc": 0.8022713898227125 + }, + { + "epoch": 0.8208092485549133, + "grad_norm": 0.6236226833744741, + "learning_rate": 8.695044586103295e-07, + "loss": 0.5747796893119812, + "step": 284, + "token_acc": 0.8079837217906031 + }, + { + "epoch": 0.8236994219653179, + "grad_norm": 0.5865612629064065, + "learning_rate": 8.684293205045889e-07, + "loss": 0.6070411205291748, + "step": 285, + "token_acc": 0.7988344760774713 + }, + { + "epoch": 0.8265895953757225, + "grad_norm": 0.5503455006576133, + "learning_rate": 8.673504421565856e-07, + "loss": 0.5685064792633057, + "step": 286, + "token_acc": 0.8102210757057314 + }, + { + "epoch": 0.8294797687861272, + "grad_norm": 0.5972785565939337, + "learning_rate": 8.662678345189396e-07, + "loss": 0.46608567237854004, + "step": 287, + "token_acc": 0.8438823801959227 + }, + { + "epoch": 0.8323699421965318, + "grad_norm": 0.5201509566608107, + "learning_rate": 8.651815085821302e-07, + "loss": 0.5298614501953125, + "step": 288, + "token_acc": 0.8236416811984237 + }, + { + "epoch": 0.8352601156069365, + "grad_norm": 0.49819051940062725, + "learning_rate": 8.640914753743847e-07, + "loss": 0.5882748365402222, + "step": 289, + "token_acc": 0.8065492356638473 + }, + { + "epoch": 0.838150289017341, + "grad_norm": 0.6397626208223341, + "learning_rate": 8.629977459615654e-07, + "loss": 0.604642927646637, + "step": 290, + "token_acc": 0.798697597059869 + }, + { + "epoch": 0.8410404624277457, + "grad_norm": 0.5735121088769557, + "learning_rate": 8.619003314470586e-07, + "loss": 0.5657530426979065, + "step": 291, + "token_acc": 0.8134929241446619 + }, + { + "epoch": 0.8439306358381503, + "grad_norm": 0.6029592728755434, + "learning_rate": 8.607992429716608e-07, + "loss": 0.5807414054870605, + "step": 292, + "token_acc": 0.8062111084672681 + }, + { + "epoch": 0.846820809248555, + "grad_norm": 0.5204268288621456, + "learning_rate": 8.596944917134666e-07, + "loss": 0.5696761608123779, + "step": 293, + "token_acc": 0.8102849975611456 + }, + { + "epoch": 0.8497109826589595, + "grad_norm": 0.570216087116967, + "learning_rate": 8.585860888877536e-07, + "loss": 0.6144391298294067, + "step": 294, + "token_acc": 0.7976966055615415 + }, + { + "epoch": 0.8526011560693642, + "grad_norm": 0.525009085518107, + "learning_rate": 8.574740457468708e-07, + "loss": 0.5926086902618408, + "step": 295, + "token_acc": 0.8030848268880814 + }, + { + "epoch": 0.8554913294797688, + "grad_norm": 0.5397367841143723, + "learning_rate": 8.563583735801223e-07, + "loss": 0.5647125244140625, + "step": 296, + "token_acc": 0.8113542939673369 + }, + { + "epoch": 0.8583815028901735, + "grad_norm": 0.5453044997059636, + "learning_rate": 8.55239083713654e-07, + "loss": 0.5306450128555298, + "step": 297, + "token_acc": 0.8242952898276619 + }, + { + "epoch": 0.861271676300578, + "grad_norm": 0.49382426600759494, + "learning_rate": 8.541161875103379e-07, + "loss": 0.5655560493469238, + "step": 298, + "token_acc": 0.81170671232068 + }, + { + "epoch": 0.8641618497109826, + "grad_norm": 0.5609985492228051, + "learning_rate": 8.529896963696576e-07, + "loss": 0.5431415438652039, + "step": 299, + "token_acc": 0.8162933876284661 + }, + { + "epoch": 0.8670520231213873, + "grad_norm": 0.5476351474370762, + "learning_rate": 8.51859621727591e-07, + "loss": 0.5872442126274109, + "step": 300, + "token_acc": 0.8065929411453266 + }, + { + "epoch": 0.869942196531792, + "grad_norm": 0.5282221087597836, + "learning_rate": 8.507259750564961e-07, + "loss": 0.5451909899711609, + "step": 301, + "token_acc": 0.8188552557155108 + }, + { + "epoch": 0.8728323699421965, + "grad_norm": 0.503389270767867, + "learning_rate": 8.495887678649932e-07, + "loss": 0.5154858231544495, + "step": 302, + "token_acc": 0.8274329950559459 + }, + { + "epoch": 0.8757225433526011, + "grad_norm": 0.518940089504941, + "learning_rate": 8.484480116978486e-07, + "loss": 0.5244746208190918, + "step": 303, + "token_acc": 0.8264815952633637 + }, + { + "epoch": 0.8786127167630058, + "grad_norm": 0.573024895950047, + "learning_rate": 8.473037181358573e-07, + "loss": 0.592721700668335, + "step": 304, + "token_acc": 0.8035201013934049 + }, + { + "epoch": 0.8815028901734104, + "grad_norm": 0.5039735997055694, + "learning_rate": 8.461558987957252e-07, + "loss": 0.5656961798667908, + "step": 305, + "token_acc": 0.8130110070213994 + }, + { + "epoch": 0.884393063583815, + "grad_norm": 0.5476756827664239, + "learning_rate": 8.45004565329952e-07, + "loss": 0.5374190807342529, + "step": 306, + "token_acc": 0.820976424170279 + }, + { + "epoch": 0.8872832369942196, + "grad_norm": 0.5275746578408953, + "learning_rate": 8.438497294267116e-07, + "loss": 0.5982400178909302, + "step": 307, + "token_acc": 0.7999831918648626 + }, + { + "epoch": 0.8901734104046243, + "grad_norm": 0.532750300928086, + "learning_rate": 8.426914028097347e-07, + "loss": 0.584047794342041, + "step": 308, + "token_acc": 0.8066207177537092 + }, + { + "epoch": 0.8930635838150289, + "grad_norm": 0.5003914631256399, + "learning_rate": 8.415295972381889e-07, + "loss": 0.6089476346969604, + "step": 309, + "token_acc": 0.7978914509526754 + }, + { + "epoch": 0.8959537572254336, + "grad_norm": 0.6278624794022574, + "learning_rate": 8.403643245065597e-07, + "loss": 0.5697731375694275, + "step": 310, + "token_acc": 0.8108995234993658 + }, + { + "epoch": 0.8988439306358381, + "grad_norm": 0.6052633593556834, + "learning_rate": 8.391955964445309e-07, + "loss": 0.5913630723953247, + "step": 311, + "token_acc": 0.8023921969586315 + }, + { + "epoch": 0.9017341040462428, + "grad_norm": 0.5312386556419646, + "learning_rate": 8.38023424916864e-07, + "loss": 0.5818167924880981, + "step": 312, + "token_acc": 0.8053130715134147 + }, + { + "epoch": 0.9046242774566474, + "grad_norm": 0.5377630147019918, + "learning_rate": 8.368478218232787e-07, + "loss": 0.5994030237197876, + "step": 313, + "token_acc": 0.8010770419994847 + }, + { + "epoch": 0.9075144508670521, + "grad_norm": 0.6387143665462728, + "learning_rate": 8.356687990983305e-07, + "loss": 0.5747004747390747, + "step": 314, + "token_acc": 0.8103654791154791 + }, + { + "epoch": 0.9104046242774566, + "grad_norm": 0.5539012149779035, + "learning_rate": 8.344863687112913e-07, + "loss": 0.5109165906906128, + "step": 315, + "token_acc": 0.8275082819675849 + }, + { + "epoch": 0.9132947976878613, + "grad_norm": 0.5431996662851367, + "learning_rate": 8.333005426660271e-07, + "loss": 0.4984626770019531, + "step": 316, + "token_acc": 0.8326753471796506 + }, + { + "epoch": 0.9161849710982659, + "grad_norm": 0.5476844147731238, + "learning_rate": 8.321113330008756e-07, + "loss": 0.5582059025764465, + "step": 317, + "token_acc": 0.8131992060627932 + }, + { + "epoch": 0.9190751445086706, + "grad_norm": 0.5288904758826702, + "learning_rate": 8.309187517885249e-07, + "loss": 0.5965433120727539, + "step": 318, + "token_acc": 0.8015113167980331 + }, + { + "epoch": 0.9219653179190751, + "grad_norm": 0.5061439317002303, + "learning_rate": 8.297228111358906e-07, + "loss": 0.50608229637146, + "step": 319, + "token_acc": 0.8302445369795833 + }, + { + "epoch": 0.9248554913294798, + "grad_norm": 0.49043399117893216, + "learning_rate": 8.285235231839927e-07, + "loss": 0.5492719411849976, + "step": 320, + "token_acc": 0.8174581468830556 + }, + { + "epoch": 0.9277456647398844, + "grad_norm": 0.6174249587001943, + "learning_rate": 8.273209001078324e-07, + "loss": 0.553361177444458, + "step": 321, + "token_acc": 0.8119886458507264 + }, + { + "epoch": 0.930635838150289, + "grad_norm": 0.5616150428871276, + "learning_rate": 8.261149541162691e-07, + "loss": 0.6025636196136475, + "step": 322, + "token_acc": 0.8005087935801005 + }, + { + "epoch": 0.9335260115606936, + "grad_norm": 0.6478516612944865, + "learning_rate": 8.249056974518954e-07, + "loss": 0.5491775274276733, + "step": 323, + "token_acc": 0.8185532095041541 + }, + { + "epoch": 0.9364161849710982, + "grad_norm": 0.5031858383227522, + "learning_rate": 8.236931423909138e-07, + "loss": 0.6022853255271912, + "step": 324, + "token_acc": 0.8037384243419552 + }, + { + "epoch": 0.9393063583815029, + "grad_norm": 0.5752991697267287, + "learning_rate": 8.224773012430114e-07, + "loss": 0.5954960584640503, + "step": 325, + "token_acc": 0.8036680189317106 + }, + { + "epoch": 0.9421965317919075, + "grad_norm": 0.5295029516066992, + "learning_rate": 8.212581863512353e-07, + "loss": 0.5488483309745789, + "step": 326, + "token_acc": 0.8157750324575375 + }, + { + "epoch": 0.9450867052023122, + "grad_norm": 0.5368502799479243, + "learning_rate": 8.20035810091867e-07, + "loss": 0.5652696490287781, + "step": 327, + "token_acc": 0.8106361614705574 + }, + { + "epoch": 0.9479768786127167, + "grad_norm": 0.5847097314866032, + "learning_rate": 8.188101848742974e-07, + "loss": 0.544079065322876, + "step": 328, + "token_acc": 0.819971546427805 + }, + { + "epoch": 0.9508670520231214, + "grad_norm": 0.5255181020508993, + "learning_rate": 8.175813231408999e-07, + "loss": 0.4978986382484436, + "step": 329, + "token_acc": 0.8333199723062348 + }, + { + "epoch": 0.953757225433526, + "grad_norm": 0.5127048703010287, + "learning_rate": 8.163492373669047e-07, + "loss": 0.5805110931396484, + "step": 330, + "token_acc": 0.8056335113743647 + }, + { + "epoch": 0.9566473988439307, + "grad_norm": 0.652335019028349, + "learning_rate": 8.15113940060272e-07, + "loss": 0.5597442388534546, + "step": 331, + "token_acc": 0.8161630076551519 + }, + { + "epoch": 0.9595375722543352, + "grad_norm": 0.5947335075670345, + "learning_rate": 8.13875443761565e-07, + "loss": 0.5277099609375, + "step": 332, + "token_acc": 0.8274886297575488 + }, + { + "epoch": 0.9624277456647399, + "grad_norm": 0.5459606580402216, + "learning_rate": 8.126337610438229e-07, + "loss": 0.5635240077972412, + "step": 333, + "token_acc": 0.8108978939573075 + }, + { + "epoch": 0.9653179190751445, + "grad_norm": 0.5488564858287155, + "learning_rate": 8.113889045124323e-07, + "loss": 0.49523666501045227, + "step": 334, + "token_acc": 0.8329320341089853 + }, + { + "epoch": 0.9682080924855492, + "grad_norm": 0.5694023522198697, + "learning_rate": 8.101408868050008e-07, + "loss": 0.5316784381866455, + "step": 335, + "token_acc": 0.8213875427499967 + }, + { + "epoch": 0.9710982658959537, + "grad_norm": 0.5290670622343212, + "learning_rate": 8.088897205912271e-07, + "loss": 0.5768337249755859, + "step": 336, + "token_acc": 0.808409267610014 + }, + { + "epoch": 0.9739884393063584, + "grad_norm": 0.5630882737173935, + "learning_rate": 8.076354185727734e-07, + "loss": 0.5607028007507324, + "step": 337, + "token_acc": 0.8111738071422572 + }, + { + "epoch": 0.976878612716763, + "grad_norm": 0.5389758264031266, + "learning_rate": 8.06377993483136e-07, + "loss": 0.5800102949142456, + "step": 338, + "token_acc": 0.8064102564102564 + }, + { + "epoch": 0.9797687861271677, + "grad_norm": 0.6483925804091112, + "learning_rate": 8.051174580875163e-07, + "loss": 0.5936282873153687, + "step": 339, + "token_acc": 0.8033736003463585 + }, + { + "epoch": 0.9826589595375722, + "grad_norm": 0.5683588968241811, + "learning_rate": 8.038538251826912e-07, + "loss": 0.5602604150772095, + "step": 340, + "token_acc": 0.8103426182505487 + }, + { + "epoch": 0.9855491329479769, + "grad_norm": 0.4984007019353715, + "learning_rate": 8.025871075968826e-07, + "loss": 0.559136152267456, + "step": 341, + "token_acc": 0.8140824580290378 + }, + { + "epoch": 0.9884393063583815, + "grad_norm": 1.1899348194485317, + "learning_rate": 8.013173181896282e-07, + "loss": 0.5955883860588074, + "step": 342, + "token_acc": 0.8027926447988978 + }, + { + "epoch": 0.9913294797687862, + "grad_norm": 0.5388156404908695, + "learning_rate": 8.0004446985165e-07, + "loss": 0.5661012530326843, + "step": 343, + "token_acc": 0.8099668055056346 + }, + { + "epoch": 0.9942196531791907, + "grad_norm": 0.5412535831553995, + "learning_rate": 7.987685755047242e-07, + "loss": 0.6086287498474121, + "step": 344, + "token_acc": 0.7963722407145177 + }, + { + "epoch": 0.9971098265895953, + "grad_norm": 0.696761929081249, + "learning_rate": 7.974896481015494e-07, + "loss": 0.5823131799697876, + "step": 345, + "token_acc": 0.8073882514689755 + }, + { + "epoch": 1.0, + "grad_norm": 0.4953947640304795, + "learning_rate": 7.962077006256153e-07, + "loss": 0.5682995319366455, + "step": 346, + "token_acc": 0.8121095151492658 + }, + { + "epoch": 1.0028901734104045, + "grad_norm": 0.7111654355632505, + "learning_rate": 7.94922746091071e-07, + "loss": 0.6060156226158142, + "step": 347, + "token_acc": 0.8014354938608955 + }, + { + "epoch": 1.0057803468208093, + "grad_norm": 0.5507935056779134, + "learning_rate": 7.93634797542593e-07, + "loss": 0.5295247435569763, + "step": 348, + "token_acc": 0.8211228506318624 + }, + { + "epoch": 1.0086705202312138, + "grad_norm": 0.6189562361784823, + "learning_rate": 7.923438680552525e-07, + "loss": 0.5647916197776794, + "step": 349, + "token_acc": 0.8137873547100433 + }, + { + "epoch": 1.0115606936416186, + "grad_norm": 0.6801159002216328, + "learning_rate": 7.910499707343828e-07, + "loss": 0.590101420879364, + "step": 350, + "token_acc": 0.803803399890662 + }, + { + "epoch": 1.0144508670520231, + "grad_norm": 0.6049076830653918, + "learning_rate": 7.897531187154458e-07, + "loss": 0.5088500380516052, + "step": 351, + "token_acc": 0.8279876049759735 + }, + { + "epoch": 1.0173410404624277, + "grad_norm": 0.5654302790773965, + "learning_rate": 7.884533251638999e-07, + "loss": 0.5929542779922485, + "step": 352, + "token_acc": 0.8047063731856507 + }, + { + "epoch": 1.0202312138728324, + "grad_norm": 0.5880451344105353, + "learning_rate": 7.87150603275065e-07, + "loss": 0.5749261379241943, + "step": 353, + "token_acc": 0.8056116433808085 + }, + { + "epoch": 1.023121387283237, + "grad_norm": 0.5426830225682386, + "learning_rate": 7.85844966273989e-07, + "loss": 0.5945314168930054, + "step": 354, + "token_acc": 0.800486217737808 + }, + { + "epoch": 1.0260115606936415, + "grad_norm": 0.49678361176775165, + "learning_rate": 7.845364274153139e-07, + "loss": 0.4898013472557068, + "step": 355, + "token_acc": 0.8352619622320034 + }, + { + "epoch": 1.0289017341040463, + "grad_norm": 0.6954304853085829, + "learning_rate": 7.832249999831406e-07, + "loss": 0.5588274598121643, + "step": 356, + "token_acc": 0.8166684201080533 + }, + { + "epoch": 1.0317919075144508, + "grad_norm": 0.5310648615446059, + "learning_rate": 7.819106972908949e-07, + "loss": 0.5819897651672363, + "step": 357, + "token_acc": 0.8045070775826193 + }, + { + "epoch": 1.0346820809248556, + "grad_norm": 0.5923922817451516, + "learning_rate": 7.805935326811912e-07, + "loss": 0.5737313032150269, + "step": 358, + "token_acc": 0.8051378103467133 + }, + { + "epoch": 1.0375722543352601, + "grad_norm": 0.5178307979556245, + "learning_rate": 7.79273519525698e-07, + "loss": 0.5936248302459717, + "step": 359, + "token_acc": 0.8025767773866199 + }, + { + "epoch": 1.0404624277456647, + "grad_norm": 0.5286013733045867, + "learning_rate": 7.779506712250022e-07, + "loss": 0.5494135618209839, + "step": 360, + "token_acc": 0.8171926851655723 + }, + { + "epoch": 1.0433526011560694, + "grad_norm": 0.49585832807282065, + "learning_rate": 7.766250012084722e-07, + "loss": 0.5698336958885193, + "step": 361, + "token_acc": 0.8116101814090845 + }, + { + "epoch": 1.046242774566474, + "grad_norm": 0.6962712390013456, + "learning_rate": 7.752965229341219e-07, + "loss": 0.535956621170044, + "step": 362, + "token_acc": 0.822281059722762 + }, + { + "epoch": 1.0491329479768785, + "grad_norm": 0.5694059644679526, + "learning_rate": 7.739652498884747e-07, + "loss": 0.5675574541091919, + "step": 363, + "token_acc": 0.8093009931245225 + }, + { + "epoch": 1.0520231213872833, + "grad_norm": 0.5547323442483891, + "learning_rate": 7.726311955864261e-07, + "loss": 0.5611029863357544, + "step": 364, + "token_acc": 0.8125364888148433 + }, + { + "epoch": 1.0549132947976878, + "grad_norm": 0.5476729662614271, + "learning_rate": 7.712943735711062e-07, + "loss": 0.5374180674552917, + "step": 365, + "token_acc": 0.8212820320132261 + }, + { + "epoch": 1.0578034682080926, + "grad_norm": 0.5180731484879565, + "learning_rate": 7.699547974137426e-07, + "loss": 0.5433316230773926, + "step": 366, + "token_acc": 0.8200906177478174 + }, + { + "epoch": 1.060693641618497, + "grad_norm": 0.5798685069888638, + "learning_rate": 7.686124807135228e-07, + "loss": 0.5966153740882874, + "step": 367, + "token_acc": 0.8028633971139337 + }, + { + "epoch": 1.0635838150289016, + "grad_norm": 0.5594356403434023, + "learning_rate": 7.672674370974558e-07, + "loss": 0.5133764743804932, + "step": 368, + "token_acc": 0.8287475052817048 + }, + { + "epoch": 1.0664739884393064, + "grad_norm": 0.5414940672989453, + "learning_rate": 7.659196802202338e-07, + "loss": 0.5794786214828491, + "step": 369, + "token_acc": 0.8080960204454181 + }, + { + "epoch": 1.069364161849711, + "grad_norm": 0.5596146246622683, + "learning_rate": 7.645692237640937e-07, + "loss": 0.6179242134094238, + "step": 370, + "token_acc": 0.7978232829012561 + }, + { + "epoch": 1.0722543352601157, + "grad_norm": 0.5658616759599563, + "learning_rate": 7.632160814386779e-07, + "loss": 0.5489234924316406, + "step": 371, + "token_acc": 0.818960201793722 + }, + { + "epoch": 1.0751445086705202, + "grad_norm": 0.5583854062469837, + "learning_rate": 7.618602669808957e-07, + "loss": 0.5576378703117371, + "step": 372, + "token_acc": 0.8134194149383499 + }, + { + "epoch": 1.0780346820809248, + "grad_norm": 0.5709606663652054, + "learning_rate": 7.605017941547835e-07, + "loss": 0.5531469583511353, + "step": 373, + "token_acc": 0.8139197537682152 + }, + { + "epoch": 1.0809248554913296, + "grad_norm": 0.5401961587153568, + "learning_rate": 7.591406767513648e-07, + "loss": 0.5335639715194702, + "step": 374, + "token_acc": 0.8189074796640434 + }, + { + "epoch": 1.083815028901734, + "grad_norm": 0.5776452597256104, + "learning_rate": 7.577769285885108e-07, + "loss": 0.5792023539543152, + "step": 375, + "token_acc": 0.8059631052038535 + }, + { + "epoch": 1.0867052023121386, + "grad_norm": 0.6631103343737483, + "learning_rate": 7.564105635107996e-07, + "loss": 0.5358845591545105, + "step": 376, + "token_acc": 0.8186349045446866 + }, + { + "epoch": 1.0895953757225434, + "grad_norm": 0.49688934026931153, + "learning_rate": 7.550415953893756e-07, + "loss": 0.5017120242118835, + "step": 377, + "token_acc": 0.8296466328279073 + }, + { + "epoch": 1.092485549132948, + "grad_norm": 0.5499825048622536, + "learning_rate": 7.536700381218097e-07, + "loss": 0.5757490396499634, + "step": 378, + "token_acc": 0.8071212248675023 + }, + { + "epoch": 1.0953757225433527, + "grad_norm": 0.5724354451620394, + "learning_rate": 7.522959056319564e-07, + "loss": 0.5289810299873352, + "step": 379, + "token_acc": 0.8224057244166174 + }, + { + "epoch": 1.0982658959537572, + "grad_norm": 0.5295598164095123, + "learning_rate": 7.509192118698145e-07, + "loss": 0.5217394828796387, + "step": 380, + "token_acc": 0.8247749871572029 + }, + { + "epoch": 1.1011560693641618, + "grad_norm": 0.6732543146745934, + "learning_rate": 7.49539970811384e-07, + "loss": 0.5446665287017822, + "step": 381, + "token_acc": 0.8187780645617508 + }, + { + "epoch": 1.1040462427745665, + "grad_norm": 0.593141398734888, + "learning_rate": 7.481581964585244e-07, + "loss": 0.6174026131629944, + "step": 382, + "token_acc": 0.7958839535507607 + }, + { + "epoch": 1.106936416184971, + "grad_norm": 0.5915717748635032, + "learning_rate": 7.467739028388133e-07, + "loss": 0.5956196784973145, + "step": 383, + "token_acc": 0.8005577327975455 + }, + { + "epoch": 1.1098265895953756, + "grad_norm": 0.5486121690897104, + "learning_rate": 7.453871040054037e-07, + "loss": 0.602386474609375, + "step": 384, + "token_acc": 0.7985531236588805 + }, + { + "epoch": 1.1127167630057804, + "grad_norm": 0.6468015023115512, + "learning_rate": 7.439978140368803e-07, + "loss": 0.5264239311218262, + "step": 385, + "token_acc": 0.8247053516043534 + }, + { + "epoch": 1.115606936416185, + "grad_norm": 0.5396942599943407, + "learning_rate": 7.426060470371185e-07, + "loss": 0.5322436094284058, + "step": 386, + "token_acc": 0.8225644386194845 + }, + { + "epoch": 1.1184971098265897, + "grad_norm": 0.546318443194639, + "learning_rate": 7.412118171351395e-07, + "loss": 0.5636791586875916, + "step": 387, + "token_acc": 0.8132001591389744 + }, + { + "epoch": 1.1213872832369942, + "grad_norm": 0.5681580355518231, + "learning_rate": 7.398151384849679e-07, + "loss": 0.5519202351570129, + "step": 388, + "token_acc": 0.8136924046076314 + }, + { + "epoch": 1.1242774566473988, + "grad_norm": 0.5949989948835427, + "learning_rate": 7.384160252654873e-07, + "loss": 0.5511115789413452, + "step": 389, + "token_acc": 0.8144513354081949 + }, + { + "epoch": 1.1271676300578035, + "grad_norm": 0.4837423293992909, + "learning_rate": 7.370144916802969e-07, + "loss": 0.5643985867500305, + "step": 390, + "token_acc": 0.8112824957599688 + }, + { + "epoch": 1.130057803468208, + "grad_norm": 0.5611205998910804, + "learning_rate": 7.356105519575671e-07, + "loss": 0.5409538745880127, + "step": 391, + "token_acc": 0.8188429729320618 + }, + { + "epoch": 1.1329479768786128, + "grad_norm": 0.5181274015479428, + "learning_rate": 7.342042203498951e-07, + "loss": 0.5411881804466248, + "step": 392, + "token_acc": 0.8171947300974061 + }, + { + "epoch": 1.1358381502890174, + "grad_norm": 0.5497633972492808, + "learning_rate": 7.327955111341601e-07, + "loss": 0.5626124143600464, + "step": 393, + "token_acc": 0.8131716531422224 + }, + { + "epoch": 1.138728323699422, + "grad_norm": 0.569806645978514, + "learning_rate": 7.313844386113783e-07, + "loss": 0.533359169960022, + "step": 394, + "token_acc": 0.8227007051547947 + }, + { + "epoch": 1.1416184971098267, + "grad_norm": 0.5809695758427657, + "learning_rate": 7.299710171065584e-07, + "loss": 0.5428122282028198, + "step": 395, + "token_acc": 0.8167381946213591 + }, + { + "epoch": 1.1445086705202312, + "grad_norm": 0.5685994639717983, + "learning_rate": 7.28555260968555e-07, + "loss": 0.5661939382553101, + "step": 396, + "token_acc": 0.8107361575857062 + }, + { + "epoch": 1.147398843930636, + "grad_norm": 0.5687294924284086, + "learning_rate": 7.271371845699241e-07, + "loss": 0.4796743392944336, + "step": 397, + "token_acc": 0.8378044059980814 + }, + { + "epoch": 1.1502890173410405, + "grad_norm": 0.5570998116553988, + "learning_rate": 7.257168023067759e-07, + "loss": 0.5698948502540588, + "step": 398, + "token_acc": 0.8108394509164174 + }, + { + "epoch": 1.153179190751445, + "grad_norm": 0.5764653559793665, + "learning_rate": 7.242941285986303e-07, + "loss": 0.5216134190559387, + "step": 399, + "token_acc": 0.8264347873981053 + }, + { + "epoch": 1.1560693641618498, + "grad_norm": 0.5519714242613649, + "learning_rate": 7.228691778882692e-07, + "loss": 0.5965580940246582, + "step": 400, + "token_acc": 0.8008848328263255 + }, + { + "epoch": 1.1589595375722543, + "grad_norm": 0.5713833806622776, + "learning_rate": 7.2144196464159e-07, + "loss": 0.530504584312439, + "step": 401, + "token_acc": 0.8193537207392506 + }, + { + "epoch": 1.1618497109826589, + "grad_norm": 0.5112285942897958, + "learning_rate": 7.200125033474598e-07, + "loss": 0.5425513982772827, + "step": 402, + "token_acc": 0.8176038122905598 + }, + { + "epoch": 1.1647398843930636, + "grad_norm": 0.5891524284010872, + "learning_rate": 7.185808085175668e-07, + "loss": 0.5737115740776062, + "step": 403, + "token_acc": 0.811070949924867 + }, + { + "epoch": 1.1676300578034682, + "grad_norm": 0.8927491774092401, + "learning_rate": 7.171468946862743e-07, + "loss": 0.5100395083427429, + "step": 404, + "token_acc": 0.8297666772416578 + }, + { + "epoch": 1.1705202312138727, + "grad_norm": 0.6290027028336996, + "learning_rate": 7.157107764104723e-07, + "loss": 0.5254942178726196, + "step": 405, + "token_acc": 0.8239488461275081 + }, + { + "epoch": 1.1734104046242775, + "grad_norm": 0.5413566372730959, + "learning_rate": 7.142724682694299e-07, + "loss": 0.5764940977096558, + "step": 406, + "token_acc": 0.8086516073191842 + }, + { + "epoch": 1.176300578034682, + "grad_norm": 0.5581695811593094, + "learning_rate": 7.128319848646477e-07, + "loss": 0.5500423312187195, + "step": 407, + "token_acc": 0.8153743413040916 + }, + { + "epoch": 1.1791907514450868, + "grad_norm": 0.4681952163328979, + "learning_rate": 7.113893408197091e-07, + "loss": 0.5582858324050903, + "step": 408, + "token_acc": 0.8114563586911728 + }, + { + "epoch": 1.1820809248554913, + "grad_norm": 0.6826359609914151, + "learning_rate": 7.099445507801323e-07, + "loss": 0.49809369444847107, + "step": 409, + "token_acc": 0.8353448588307781 + }, + { + "epoch": 1.1849710982658959, + "grad_norm": 0.5090205197384219, + "learning_rate": 7.084976294132207e-07, + "loss": 0.6029922962188721, + "step": 410, + "token_acc": 0.7973656093105548 + }, + { + "epoch": 1.1878612716763006, + "grad_norm": 0.5269042882225241, + "learning_rate": 7.070485914079151e-07, + "loss": 0.5927149057388306, + "step": 411, + "token_acc": 0.8014037282759605 + }, + { + "epoch": 1.1907514450867052, + "grad_norm": 0.49950817881103576, + "learning_rate": 7.055974514746445e-07, + "loss": 0.5837708711624146, + "step": 412, + "token_acc": 0.8074309042384765 + }, + { + "epoch": 1.19364161849711, + "grad_norm": 0.5860116475494397, + "learning_rate": 7.041442243451752e-07, + "loss": 0.5210489630699158, + "step": 413, + "token_acc": 0.8244094424028096 + }, + { + "epoch": 1.1965317919075145, + "grad_norm": 0.5718657608384051, + "learning_rate": 7.026889247724635e-07, + "loss": 0.5820956230163574, + "step": 414, + "token_acc": 0.8042295599535557 + }, + { + "epoch": 1.199421965317919, + "grad_norm": 0.5054409513703455, + "learning_rate": 7.012315675305045e-07, + "loss": 0.5862281918525696, + "step": 415, + "token_acc": 0.8023793187527289 + }, + { + "epoch": 1.2023121387283238, + "grad_norm": 0.5766487774658408, + "learning_rate": 6.997721674141822e-07, + "loss": 0.520296037197113, + "step": 416, + "token_acc": 0.8252748600155311 + }, + { + "epoch": 1.2052023121387283, + "grad_norm": 0.537979220335716, + "learning_rate": 6.983107392391202e-07, + "loss": 0.5797343850135803, + "step": 417, + "token_acc": 0.80571660344046 + }, + { + "epoch": 1.208092485549133, + "grad_norm": 0.5396946740305607, + "learning_rate": 6.9684729784153e-07, + "loss": 0.6153110265731812, + "step": 418, + "token_acc": 0.7969049998485812 + }, + { + "epoch": 1.2109826589595376, + "grad_norm": 0.5642823581815699, + "learning_rate": 6.953818580780613e-07, + "loss": 0.5325438976287842, + "step": 419, + "token_acc": 0.8222246858832225 + }, + { + "epoch": 1.2138728323699421, + "grad_norm": 0.5535087521581403, + "learning_rate": 6.939144348256511e-07, + "loss": 0.5709867477416992, + "step": 420, + "token_acc": 0.8069591256176074 + }, + { + "epoch": 1.216763005780347, + "grad_norm": 0.572340555748076, + "learning_rate": 6.924450429813723e-07, + "loss": 0.5548975467681885, + "step": 421, + "token_acc": 0.8185377583894686 + }, + { + "epoch": 1.2196531791907514, + "grad_norm": 0.5155912490897337, + "learning_rate": 6.909736974622826e-07, + "loss": 0.5856627225875854, + "step": 422, + "token_acc": 0.8058833037013092 + }, + { + "epoch": 1.222543352601156, + "grad_norm": 0.5287358182605065, + "learning_rate": 6.895004132052735e-07, + "loss": 0.530200719833374, + "step": 423, + "token_acc": 0.822671307855992 + }, + { + "epoch": 1.2254335260115607, + "grad_norm": 0.5377464968526829, + "learning_rate": 6.88025205166918e-07, + "loss": 0.6028895974159241, + "step": 424, + "token_acc": 0.8013212984612038 + }, + { + "epoch": 1.2283236994219653, + "grad_norm": 0.5204405657753005, + "learning_rate": 6.865480883233189e-07, + "loss": 0.5590497851371765, + "step": 425, + "token_acc": 0.8117163218535146 + }, + { + "epoch": 1.2312138728323698, + "grad_norm": 0.45493496853760634, + "learning_rate": 6.850690776699573e-07, + "loss": 0.5726251602172852, + "step": 426, + "token_acc": 0.8084424978300127 + }, + { + "epoch": 1.2341040462427746, + "grad_norm": 0.6240376452291253, + "learning_rate": 6.835881882215395e-07, + "loss": 0.5343113541603088, + "step": 427, + "token_acc": 0.8196929353326794 + }, + { + "epoch": 1.2369942196531791, + "grad_norm": 0.5773298029457239, + "learning_rate": 6.821054350118458e-07, + "loss": 0.5317709445953369, + "step": 428, + "token_acc": 0.8196335435275461 + }, + { + "epoch": 1.239884393063584, + "grad_norm": 0.5477278016005382, + "learning_rate": 6.806208330935766e-07, + "loss": 0.5721542835235596, + "step": 429, + "token_acc": 0.8069397675429067 + }, + { + "epoch": 1.2427745664739884, + "grad_norm": 0.5954432022727356, + "learning_rate": 6.791343975381999e-07, + "loss": 0.59670090675354, + "step": 430, + "token_acc": 0.8028038691690053 + }, + { + "epoch": 1.245664739884393, + "grad_norm": 0.6299231511446614, + "learning_rate": 6.776461434357993e-07, + "loss": 0.5712985396385193, + "step": 431, + "token_acc": 0.8093430920755399 + }, + { + "epoch": 1.2485549132947977, + "grad_norm": 0.5405979300580379, + "learning_rate": 6.761560858949192e-07, + "loss": 0.5809611082077026, + "step": 432, + "token_acc": 0.8070006162733515 + }, + { + "epoch": 1.2514450867052023, + "grad_norm": 0.5516822339033575, + "learning_rate": 6.746642400424131e-07, + "loss": 0.5620344281196594, + "step": 433, + "token_acc": 0.8121798185065721 + }, + { + "epoch": 1.254335260115607, + "grad_norm": 0.5284837836987685, + "learning_rate": 6.731706210232882e-07, + "loss": 0.5855224132537842, + "step": 434, + "token_acc": 0.8044497743554139 + }, + { + "epoch": 1.2572254335260116, + "grad_norm": 0.5627730241670859, + "learning_rate": 6.716752440005537e-07, + "loss": 0.5670550465583801, + "step": 435, + "token_acc": 0.8096381386958137 + }, + { + "epoch": 1.260115606936416, + "grad_norm": 0.538509679886266, + "learning_rate": 6.701781241550648e-07, + "loss": 0.5526491403579712, + "step": 436, + "token_acc": 0.8155125315340866 + }, + { + "epoch": 1.2630057803468209, + "grad_norm": 0.4771561540026018, + "learning_rate": 6.686792766853705e-07, + "loss": 0.5505247712135315, + "step": 437, + "token_acc": 0.8138159537283621 + }, + { + "epoch": 1.2658959537572254, + "grad_norm": 0.5223829257694631, + "learning_rate": 6.671787168075575e-07, + "loss": 0.5447695255279541, + "step": 438, + "token_acc": 0.8178192464935741 + }, + { + "epoch": 1.2687861271676302, + "grad_norm": 0.5159364504277794, + "learning_rate": 6.656764597550975e-07, + "loss": 0.5982085466384888, + "step": 439, + "token_acc": 0.8001320834327017 + }, + { + "epoch": 1.2716763005780347, + "grad_norm": 0.5310637224775283, + "learning_rate": 6.641725207786909e-07, + "loss": 0.5778173208236694, + "step": 440, + "token_acc": 0.8066611125837846 + }, + { + "epoch": 1.2745664739884393, + "grad_norm": 0.56776340532874, + "learning_rate": 6.626669151461133e-07, + "loss": 0.5481947660446167, + "step": 441, + "token_acc": 0.8165455226676658 + }, + { + "epoch": 1.2774566473988438, + "grad_norm": 0.5289033874903101, + "learning_rate": 6.611596581420599e-07, + "loss": 0.5178524255752563, + "step": 442, + "token_acc": 0.8276837132314907 + }, + { + "epoch": 1.2803468208092486, + "grad_norm": 0.6054263277819003, + "learning_rate": 6.596507650679899e-07, + "loss": 0.5819660425186157, + "step": 443, + "token_acc": 0.8038088791803834 + }, + { + "epoch": 1.2832369942196533, + "grad_norm": 0.5487293303925478, + "learning_rate": 6.581402512419723e-07, + "loss": 0.5847280621528625, + "step": 444, + "token_acc": 0.80743134495099 + }, + { + "epoch": 1.2861271676300579, + "grad_norm": 0.5388475336099026, + "learning_rate": 6.566281319985295e-07, + "loss": 0.5863124132156372, + "step": 445, + "token_acc": 0.8067254504627854 + }, + { + "epoch": 1.2890173410404624, + "grad_norm": 0.5538452871257553, + "learning_rate": 6.551144226884815e-07, + "loss": 0.5669398307800293, + "step": 446, + "token_acc": 0.8087953975429001 + }, + { + "epoch": 1.291907514450867, + "grad_norm": 0.557772227891473, + "learning_rate": 6.53599138678791e-07, + "loss": 0.5209745764732361, + "step": 447, + "token_acc": 0.8239799595072235 + }, + { + "epoch": 1.2947976878612717, + "grad_norm": 0.6127169435529054, + "learning_rate": 6.520822953524065e-07, + "loss": 0.5106294751167297, + "step": 448, + "token_acc": 0.8277936680145971 + }, + { + "epoch": 1.2976878612716762, + "grad_norm": 0.5375147488907324, + "learning_rate": 6.505639081081066e-07, + "loss": 0.5071303844451904, + "step": 449, + "token_acc": 0.8268003446613994 + }, + { + "epoch": 1.300578034682081, + "grad_norm": 0.5553311529997369, + "learning_rate": 6.490439923603435e-07, + "loss": 0.5532734394073486, + "step": 450, + "token_acc": 0.8134406172882417 + }, + { + "epoch": 1.3034682080924855, + "grad_norm": 0.5998759397432016, + "learning_rate": 6.475225635390863e-07, + "loss": 0.5865392088890076, + "step": 451, + "token_acc": 0.8023424626486245 + }, + { + "epoch": 1.30635838150289, + "grad_norm": 0.5417420736704273, + "learning_rate": 6.459996370896652e-07, + "loss": 0.546296238899231, + "step": 452, + "token_acc": 0.8187062949013282 + }, + { + "epoch": 1.3092485549132948, + "grad_norm": 0.5655148261341275, + "learning_rate": 6.444752284726135e-07, + "loss": 0.5877007246017456, + "step": 453, + "token_acc": 0.8039364919354839 + }, + { + "epoch": 1.3121387283236994, + "grad_norm": 0.6144864679165839, + "learning_rate": 6.429493531635114e-07, + "loss": 0.5454727411270142, + "step": 454, + "token_acc": 0.8179015382597002 + }, + { + "epoch": 1.3150289017341041, + "grad_norm": 0.5513024274913209, + "learning_rate": 6.414220266528291e-07, + "loss": 0.553301215171814, + "step": 455, + "token_acc": 0.8119396930565884 + }, + { + "epoch": 1.3179190751445087, + "grad_norm": 0.5291432658218749, + "learning_rate": 6.398932644457689e-07, + "loss": 0.5474492311477661, + "step": 456, + "token_acc": 0.8148487159928808 + }, + { + "epoch": 1.3208092485549132, + "grad_norm": 0.5239384490420579, + "learning_rate": 6.383630820621081e-07, + "loss": 0.5769109725952148, + "step": 457, + "token_acc": 0.8075285980313913 + }, + { + "epoch": 1.323699421965318, + "grad_norm": 0.5372997474035569, + "learning_rate": 6.368314950360415e-07, + "loss": 0.5458542108535767, + "step": 458, + "token_acc": 0.818262614678899 + }, + { + "epoch": 1.3265895953757225, + "grad_norm": 0.5222784886904625, + "learning_rate": 6.352985189160234e-07, + "loss": 0.543486475944519, + "step": 459, + "token_acc": 0.8140883445049911 + }, + { + "epoch": 1.3294797687861273, + "grad_norm": 0.5656149822293426, + "learning_rate": 6.337641692646106e-07, + "loss": 0.5165099501609802, + "step": 460, + "token_acc": 0.8232782145649256 + }, + { + "epoch": 1.3323699421965318, + "grad_norm": 0.5339208409670375, + "learning_rate": 6.322284616583026e-07, + "loss": 0.568447470664978, + "step": 461, + "token_acc": 0.8107062348801407 + }, + { + "epoch": 1.3352601156069364, + "grad_norm": 0.534789315369846, + "learning_rate": 6.306914116873862e-07, + "loss": 0.5637167692184448, + "step": 462, + "token_acc": 0.8118799414154401 + }, + { + "epoch": 1.3381502890173411, + "grad_norm": 0.5013992587561265, + "learning_rate": 6.291530349557749e-07, + "loss": 0.6041359305381775, + "step": 463, + "token_acc": 0.8002847429734529 + }, + { + "epoch": 1.3410404624277457, + "grad_norm": 0.6327002649058038, + "learning_rate": 6.27613347080851e-07, + "loss": 0.5996913909912109, + "step": 464, + "token_acc": 0.8028000921266601 + }, + { + "epoch": 1.3439306358381504, + "grad_norm": 0.47925020942862323, + "learning_rate": 6.260723636933076e-07, + "loss": 0.5272285342216492, + "step": 465, + "token_acc": 0.8219443104776792 + }, + { + "epoch": 1.346820809248555, + "grad_norm": 0.5418997127974843, + "learning_rate": 6.2453010043699e-07, + "loss": 0.5982799530029297, + "step": 466, + "token_acc": 0.8018455748733745 + }, + { + "epoch": 1.3497109826589595, + "grad_norm": 0.511563505395346, + "learning_rate": 6.22986572968736e-07, + "loss": 0.5489825010299683, + "step": 467, + "token_acc": 0.8149126753184632 + }, + { + "epoch": 1.352601156069364, + "grad_norm": 0.6199984691110088, + "learning_rate": 6.214417969582181e-07, + "loss": 0.5509693622589111, + "step": 468, + "token_acc": 0.8135395589697864 + }, + { + "epoch": 1.3554913294797688, + "grad_norm": 0.9112236282410355, + "learning_rate": 6.198957880877833e-07, + "loss": 0.5764250755310059, + "step": 469, + "token_acc": 0.8059208967249633 + }, + { + "epoch": 1.3583815028901733, + "grad_norm": 0.5989342589849401, + "learning_rate": 6.183485620522946e-07, + "loss": 0.5593207478523254, + "step": 470, + "token_acc": 0.8130887081520711 + }, + { + "epoch": 1.361271676300578, + "grad_norm": 0.539630418011966, + "learning_rate": 6.168001345589715e-07, + "loss": 0.5798720121383667, + "step": 471, + "token_acc": 0.8067868478007105 + }, + { + "epoch": 1.3641618497109826, + "grad_norm": 0.5728505086100849, + "learning_rate": 6.152505213272307e-07, + "loss": 0.5105577707290649, + "step": 472, + "token_acc": 0.8268291947926711 + }, + { + "epoch": 1.3670520231213872, + "grad_norm": 0.5731864783632108, + "learning_rate": 6.136997380885259e-07, + "loss": 0.505968451499939, + "step": 473, + "token_acc": 0.8303852677489701 + }, + { + "epoch": 1.369942196531792, + "grad_norm": 0.5786843206230191, + "learning_rate": 6.12147800586189e-07, + "loss": 0.570541501045227, + "step": 474, + "token_acc": 0.8074693848475233 + }, + { + "epoch": 1.3728323699421965, + "grad_norm": 0.5300325283027945, + "learning_rate": 6.105947245752696e-07, + "loss": 0.5622447729110718, + "step": 475, + "token_acc": 0.8132824737156444 + }, + { + "epoch": 1.3757225433526012, + "grad_norm": 0.5128915878177316, + "learning_rate": 6.090405258223756e-07, + "loss": 0.5856798887252808, + "step": 476, + "token_acc": 0.8047204813663714 + }, + { + "epoch": 1.3786127167630058, + "grad_norm": 0.6515506714427548, + "learning_rate": 6.074852201055121e-07, + "loss": 0.5826733112335205, + "step": 477, + "token_acc": 0.8034700052323068 + }, + { + "epoch": 1.3815028901734103, + "grad_norm": 0.5411318320511171, + "learning_rate": 6.059288232139225e-07, + "loss": 0.5210794806480408, + "step": 478, + "token_acc": 0.8267671925390047 + }, + { + "epoch": 1.384393063583815, + "grad_norm": 0.5057150421228545, + "learning_rate": 6.043713509479277e-07, + "loss": 0.5771398544311523, + "step": 479, + "token_acc": 0.805000332629771 + }, + { + "epoch": 1.3872832369942196, + "grad_norm": 0.5683244072025584, + "learning_rate": 6.028128191187653e-07, + "loss": 0.5385507941246033, + "step": 480, + "token_acc": 0.8176837578528416 + }, + { + "epoch": 1.3901734104046244, + "grad_norm": 0.5074682411792649, + "learning_rate": 6.012532435484297e-07, + "loss": 0.5577852725982666, + "step": 481, + "token_acc": 0.8137183546223177 + }, + { + "epoch": 1.393063583815029, + "grad_norm": 0.5087445776495183, + "learning_rate": 5.996926400695113e-07, + "loss": 0.5707537531852722, + "step": 482, + "token_acc": 0.8081652461733929 + }, + { + "epoch": 1.3959537572254335, + "grad_norm": 0.6070774288583548, + "learning_rate": 5.981310245250351e-07, + "loss": 0.5291765928268433, + "step": 483, + "token_acc": 0.8198564644248993 + }, + { + "epoch": 1.3988439306358382, + "grad_norm": 0.6426245006677934, + "learning_rate": 5.965684127683012e-07, + "loss": 0.5093721151351929, + "step": 484, + "token_acc": 0.8283427901813247 + }, + { + "epoch": 1.4017341040462428, + "grad_norm": 0.5235719939982498, + "learning_rate": 5.950048206627228e-07, + "loss": 0.5404484272003174, + "step": 485, + "token_acc": 0.8198352412538783 + }, + { + "epoch": 1.4046242774566475, + "grad_norm": 0.5330826415435456, + "learning_rate": 5.934402640816651e-07, + "loss": 0.6019877195358276, + "step": 486, + "token_acc": 0.7991196347629723 + }, + { + "epoch": 1.407514450867052, + "grad_norm": 0.5402624006228682, + "learning_rate": 5.918747589082852e-07, + "loss": 0.512151300907135, + "step": 487, + "token_acc": 0.8255600510667488 + }, + { + "epoch": 1.4104046242774566, + "grad_norm": 0.6034074325578554, + "learning_rate": 5.903083210353695e-07, + "loss": 0.5242146253585815, + "step": 488, + "token_acc": 0.821293480679374 + }, + { + "epoch": 1.4132947976878611, + "grad_norm": 0.6270290133131012, + "learning_rate": 5.887409663651736e-07, + "loss": 0.5783629417419434, + "step": 489, + "token_acc": 0.8050973979809469 + }, + { + "epoch": 1.416184971098266, + "grad_norm": 0.5697754520754279, + "learning_rate": 5.8717271080926e-07, + "loss": 0.5560973882675171, + "step": 490, + "token_acc": 0.8151623266302166 + }, + { + "epoch": 1.4190751445086704, + "grad_norm": 0.5157305125572653, + "learning_rate": 5.856035702883368e-07, + "loss": 0.5741870999336243, + "step": 491, + "token_acc": 0.8082165363392618 + }, + { + "epoch": 1.4219653179190752, + "grad_norm": 0.552078767595136, + "learning_rate": 5.840335607320963e-07, + "loss": 0.5855275392532349, + "step": 492, + "token_acc": 0.8052095872614805 + }, + { + "epoch": 1.4248554913294798, + "grad_norm": 0.503224099727086, + "learning_rate": 5.824626980790532e-07, + "loss": 0.5036199688911438, + "step": 493, + "token_acc": 0.8295647769617597 + }, + { + "epoch": 1.4277456647398843, + "grad_norm": 0.535330314229148, + "learning_rate": 5.808909982763825e-07, + "loss": 0.5614448189735413, + "step": 494, + "token_acc": 0.8112195584194068 + }, + { + "epoch": 1.430635838150289, + "grad_norm": 0.5656493275743161, + "learning_rate": 5.793184772797577e-07, + "loss": 0.5648437142372131, + "step": 495, + "token_acc": 0.809333342296497 + }, + { + "epoch": 1.4335260115606936, + "grad_norm": 0.591964902056671, + "learning_rate": 5.777451510531894e-07, + "loss": 0.4516139626502991, + "step": 496, + "token_acc": 0.8457953488372093 + }, + { + "epoch": 1.4364161849710984, + "grad_norm": 0.5299075126510611, + "learning_rate": 5.761710355688627e-07, + "loss": 0.4779651165008545, + "step": 497, + "token_acc": 0.8387296285988187 + }, + { + "epoch": 1.439306358381503, + "grad_norm": 0.5231792243250346, + "learning_rate": 5.745961468069749e-07, + "loss": 0.5104596614837646, + "step": 498, + "token_acc": 0.8271942849713633 + }, + { + "epoch": 1.4421965317919074, + "grad_norm": 0.6000529888737813, + "learning_rate": 5.730205007555733e-07, + "loss": 0.6098222136497498, + "step": 499, + "token_acc": 0.797237394529817 + }, + { + "epoch": 1.4450867052023122, + "grad_norm": 0.5852668345047015, + "learning_rate": 5.714441134103936e-07, + "loss": 0.5637513995170593, + "step": 500, + "token_acc": 0.8103524746275665 + }, + { + "epoch": 1.4450867052023122, + "eval_loss": 0.5809597969055176, + "eval_runtime": 69.4729, + "eval_samples_per_second": 1.583, + "eval_steps_per_second": 0.202, + "eval_token_acc": 0.8065338513984092, + "step": 500 + }, + { + "epoch": 1.4479768786127167, + "grad_norm": 0.4857410434739766, + "learning_rate": 5.698670007746966e-07, + "loss": 0.5209301710128784, + "step": 501, + "token_acc": 0.8231232032245636 + }, + { + "epoch": 1.4508670520231215, + "grad_norm": 0.4784985253670375, + "learning_rate": 5.682891788591065e-07, + "loss": 0.5571726560592651, + "step": 502, + "token_acc": 0.8108515538539766 + }, + { + "epoch": 1.453757225433526, + "grad_norm": 0.5367888787021339, + "learning_rate": 5.66710663681448e-07, + "loss": 0.49731090664863586, + "step": 503, + "token_acc": 0.8312890657633916 + }, + { + "epoch": 1.4566473988439306, + "grad_norm": 0.610496198886357, + "learning_rate": 5.651314712665832e-07, + "loss": 0.5665647387504578, + "step": 504, + "token_acc": 0.8079466209795078 + }, + { + "epoch": 1.4595375722543353, + "grad_norm": 1.0378355029945652, + "learning_rate": 5.635516176462501e-07, + "loss": 0.5903141498565674, + "step": 505, + "token_acc": 0.8006371133060007 + }, + { + "epoch": 1.4624277456647399, + "grad_norm": 0.5540207350664488, + "learning_rate": 5.619711188588986e-07, + "loss": 0.5362493991851807, + "step": 506, + "token_acc": 0.820343725019984 + }, + { + "epoch": 1.4653179190751446, + "grad_norm": 0.5144874632858891, + "learning_rate": 5.603899909495283e-07, + "loss": 0.5462620258331299, + "step": 507, + "token_acc": 0.8145612480715733 + }, + { + "epoch": 1.4682080924855492, + "grad_norm": 0.5592031195717259, + "learning_rate": 5.58808249969526e-07, + "loss": 0.5476292371749878, + "step": 508, + "token_acc": 0.8147672146736102 + }, + { + "epoch": 1.4710982658959537, + "grad_norm": 0.8823564949169135, + "learning_rate": 5.57225911976502e-07, + "loss": 0.5868964195251465, + "step": 509, + "token_acc": 0.804788821591468 + }, + { + "epoch": 1.4739884393063583, + "grad_norm": 0.5547092232918307, + "learning_rate": 5.556429930341273e-07, + "loss": 0.5038424134254456, + "step": 510, + "token_acc": 0.8334411359013724 + }, + { + "epoch": 1.476878612716763, + "grad_norm": 0.5799136969979296, + "learning_rate": 5.540595092119708e-07, + "loss": 0.5707584619522095, + "step": 511, + "token_acc": 0.810527226273487 + }, + { + "epoch": 1.4797687861271676, + "grad_norm": 0.5034147261225864, + "learning_rate": 5.52475476585336e-07, + "loss": 0.5583351850509644, + "step": 512, + "token_acc": 0.8103952305319614 + }, + { + "epoch": 1.4826589595375723, + "grad_norm": 0.5160609299204681, + "learning_rate": 5.508909112350976e-07, + "loss": 0.5299844145774841, + "step": 513, + "token_acc": 0.8211946274807083 + }, + { + "epoch": 1.4855491329479769, + "grad_norm": 0.48690505381618093, + "learning_rate": 5.493058292475387e-07, + "loss": 0.5815989375114441, + "step": 514, + "token_acc": 0.8052997388378583 + }, + { + "epoch": 1.4884393063583814, + "grad_norm": 0.5497798749732475, + "learning_rate": 5.477202467141864e-07, + "loss": 0.5317429900169373, + "step": 515, + "token_acc": 0.8238470637503765 + }, + { + "epoch": 1.4913294797687862, + "grad_norm": 0.6297718557593524, + "learning_rate": 5.46134179731651e-07, + "loss": 0.5170228481292725, + "step": 516, + "token_acc": 0.8249339191625676 + }, + { + "epoch": 1.4942196531791907, + "grad_norm": 0.5879194826209626, + "learning_rate": 5.445476444014591e-07, + "loss": 0.5530685186386108, + "step": 517, + "token_acc": 0.8124287116369134 + }, + { + "epoch": 1.4971098265895955, + "grad_norm": 0.7172467911918745, + "learning_rate": 5.429606568298925e-07, + "loss": 0.5767130851745605, + "step": 518, + "token_acc": 0.8076758697324558 + }, + { + "epoch": 1.5, + "grad_norm": 0.47225756227931015, + "learning_rate": 5.413732331278248e-07, + "loss": 0.5357682704925537, + "step": 519, + "token_acc": 0.818332255376673 + }, + { + "epoch": 1.5028901734104045, + "grad_norm": 0.5315785549808126, + "learning_rate": 5.397853894105559e-07, + "loss": 0.5103631019592285, + "step": 520, + "token_acc": 0.8272772712126261 + }, + { + "epoch": 1.5057803468208093, + "grad_norm": 0.5411201442197484, + "learning_rate": 5.381971417976505e-07, + "loss": 0.6071707606315613, + "step": 521, + "token_acc": 0.7980659432441779 + }, + { + "epoch": 1.5086705202312138, + "grad_norm": 0.48884895821181845, + "learning_rate": 5.366085064127734e-07, + "loss": 0.5692754983901978, + "step": 522, + "token_acc": 0.8067921134275868 + }, + { + "epoch": 1.5115606936416186, + "grad_norm": 0.5118700142105465, + "learning_rate": 5.350194993835257e-07, + "loss": 0.5697520971298218, + "step": 523, + "token_acc": 0.8085179483452373 + }, + { + "epoch": 1.5144508670520231, + "grad_norm": 0.5936738113687722, + "learning_rate": 5.33430136841282e-07, + "loss": 0.5466612577438354, + "step": 524, + "token_acc": 0.8143604233276328 + }, + { + "epoch": 1.5173410404624277, + "grad_norm": 0.48377394646569144, + "learning_rate": 5.318404349210255e-07, + "loss": 0.5685998201370239, + "step": 525, + "token_acc": 0.808461779914424 + }, + { + "epoch": 1.5202312138728322, + "grad_norm": 0.5190453952524928, + "learning_rate": 5.302504097611846e-07, + "loss": 0.5479923486709595, + "step": 526, + "token_acc": 0.8150076205934166 + }, + { + "epoch": 1.523121387283237, + "grad_norm": 0.6234920552697755, + "learning_rate": 5.286600775034699e-07, + "loss": 0.5165071487426758, + "step": 527, + "token_acc": 0.8261736549800983 + }, + { + "epoch": 1.5260115606936417, + "grad_norm": 0.5881560338514248, + "learning_rate": 5.270694542927088e-07, + "loss": 0.5723020434379578, + "step": 528, + "token_acc": 0.808951938948829 + }, + { + "epoch": 1.5289017341040463, + "grad_norm": 0.5975961668165296, + "learning_rate": 5.254785562766829e-07, + "loss": 0.5684691667556763, + "step": 529, + "token_acc": 0.8089175396185871 + }, + { + "epoch": 1.5317919075144508, + "grad_norm": 0.6478162796925766, + "learning_rate": 5.238873996059637e-07, + "loss": 0.49971041083335876, + "step": 530, + "token_acc": 0.8301960912691917 + }, + { + "epoch": 1.5346820809248554, + "grad_norm": 0.6430164741639133, + "learning_rate": 5.222960004337476e-07, + "loss": 0.539410412311554, + "step": 531, + "token_acc": 0.8203352152694456 + }, + { + "epoch": 1.5375722543352601, + "grad_norm": 0.5795319284660402, + "learning_rate": 5.207043749156944e-07, + "loss": 0.5065566897392273, + "step": 532, + "token_acc": 0.8278279073124954 + }, + { + "epoch": 1.5404624277456649, + "grad_norm": 0.5464317098167678, + "learning_rate": 5.191125392097604e-07, + "loss": 0.5445448160171509, + "step": 533, + "token_acc": 0.8166163521084138 + }, + { + "epoch": 1.5433526011560694, + "grad_norm": 0.5152523695934649, + "learning_rate": 5.175205094760361e-07, + "loss": 0.5751731991767883, + "step": 534, + "token_acc": 0.8060461344386376 + }, + { + "epoch": 1.546242774566474, + "grad_norm": 0.5393208162828292, + "learning_rate": 5.159283018765819e-07, + "loss": 0.5777266621589661, + "step": 535, + "token_acc": 0.8069097414119084 + }, + { + "epoch": 1.5491329479768785, + "grad_norm": 0.5264135658228388, + "learning_rate": 5.143359325752638e-07, + "loss": 0.555731475353241, + "step": 536, + "token_acc": 0.8131375804713217 + }, + { + "epoch": 1.5520231213872833, + "grad_norm": 0.553539191702997, + "learning_rate": 5.127434177375893e-07, + "loss": 0.5539097189903259, + "step": 537, + "token_acc": 0.812809830006887 + }, + { + "epoch": 1.5549132947976878, + "grad_norm": 0.8304938276922723, + "learning_rate": 5.111507735305434e-07, + "loss": 0.535222589969635, + "step": 538, + "token_acc": 0.8182165566153093 + }, + { + "epoch": 1.5578034682080926, + "grad_norm": 0.5723817981155602, + "learning_rate": 5.095580161224244e-07, + "loss": 0.5616499185562134, + "step": 539, + "token_acc": 0.8143216251104015 + }, + { + "epoch": 1.560693641618497, + "grad_norm": 0.5555286868999088, + "learning_rate": 5.079651616826801e-07, + "loss": 0.5724209547042847, + "step": 540, + "token_acc": 0.8077700594252842 + }, + { + "epoch": 1.5635838150289016, + "grad_norm": 0.576829331739999, + "learning_rate": 5.063722263817427e-07, + "loss": 0.5502010583877563, + "step": 541, + "token_acc": 0.8148729355841307 + }, + { + "epoch": 1.5664739884393064, + "grad_norm": 0.6980607962330599, + "learning_rate": 5.047792263908659e-07, + "loss": 0.5372669696807861, + "step": 542, + "token_acc": 0.8214026830309711 + }, + { + "epoch": 1.569364161849711, + "grad_norm": 0.5728162578490732, + "learning_rate": 5.031861778819601e-07, + "loss": 0.5055459141731262, + "step": 543, + "token_acc": 0.8276528811478554 + }, + { + "epoch": 1.5722543352601157, + "grad_norm": 0.588844313912188, + "learning_rate": 5.015930970274277e-07, + "loss": 0.5107961893081665, + "step": 544, + "token_acc": 0.8256070951933737 + }, + { + "epoch": 1.5751445086705202, + "grad_norm": 0.49402725355257393, + "learning_rate": 5e-07, + "loss": 0.5780792236328125, + "step": 545, + "token_acc": 0.80536919727071 + }, + { + "epoch": 1.5780346820809248, + "grad_norm": 0.5150482904703839, + "learning_rate": 4.984069029725722e-07, + "loss": 0.5730597972869873, + "step": 546, + "token_acc": 0.8071878326447399 + }, + { + "epoch": 1.5809248554913293, + "grad_norm": 0.6120632562818131, + "learning_rate": 4.968138221180401e-07, + "loss": 0.48976290225982666, + "step": 547, + "token_acc": 0.8329062019477191 + }, + { + "epoch": 1.583815028901734, + "grad_norm": 0.5693765551777754, + "learning_rate": 4.95220773609134e-07, + "loss": 0.5690828561782837, + "step": 548, + "token_acc": 0.8103057397715957 + }, + { + "epoch": 1.5867052023121389, + "grad_norm": 0.5356011166477922, + "learning_rate": 4.936277736182573e-07, + "loss": 0.5775788426399231, + "step": 549, + "token_acc": 0.8077360101658677 + }, + { + "epoch": 1.5895953757225434, + "grad_norm": 0.5425409660783537, + "learning_rate": 4.9203483831732e-07, + "loss": 0.5838006138801575, + "step": 550, + "token_acc": 0.8051269382791122 + }, + { + "epoch": 1.592485549132948, + "grad_norm": 0.531381332935958, + "learning_rate": 4.904419838775755e-07, + "loss": 0.528168797492981, + "step": 551, + "token_acc": 0.8208675592063154 + }, + { + "epoch": 1.5953757225433525, + "grad_norm": 0.5122923018471659, + "learning_rate": 4.888492264694565e-07, + "loss": 0.5490496158599854, + "step": 552, + "token_acc": 0.8156343068498415 + }, + { + "epoch": 1.5982658959537572, + "grad_norm": 0.5537803937619057, + "learning_rate": 4.872565822624106e-07, + "loss": 0.5283633470535278, + "step": 553, + "token_acc": 0.8213697374264063 + }, + { + "epoch": 1.601156069364162, + "grad_norm": 0.533996696099063, + "learning_rate": 4.856640674247363e-07, + "loss": 0.5403317213058472, + "step": 554, + "token_acc": 0.8186500168747891 + }, + { + "epoch": 1.6040462427745665, + "grad_norm": 0.6968861408661483, + "learning_rate": 4.840716981234181e-07, + "loss": 0.5232794880867004, + "step": 555, + "token_acc": 0.8258206662354464 + }, + { + "epoch": 1.606936416184971, + "grad_norm": 0.5457170981213912, + "learning_rate": 4.82479490523964e-07, + "loss": 0.5531569123268127, + "step": 556, + "token_acc": 0.8132714653155657 + }, + { + "epoch": 1.6098265895953756, + "grad_norm": 0.5611664995745906, + "learning_rate": 4.808874607902397e-07, + "loss": 0.580593466758728, + "step": 557, + "token_acc": 0.8061331347873197 + }, + { + "epoch": 1.6127167630057804, + "grad_norm": 0.49146546445526984, + "learning_rate": 4.792956250843055e-07, + "loss": 0.5263780355453491, + "step": 558, + "token_acc": 0.8212147967727204 + }, + { + "epoch": 1.6156069364161851, + "grad_norm": 0.5650532769234693, + "learning_rate": 4.777039995662522e-07, + "loss": 0.535209596157074, + "step": 559, + "token_acc": 0.8197443965795302 + }, + { + "epoch": 1.6184971098265897, + "grad_norm": 0.6028109251795714, + "learning_rate": 4.7611260039403655e-07, + "loss": 0.5842093825340271, + "step": 560, + "token_acc": 0.8079101659544867 + }, + { + "epoch": 1.6213872832369942, + "grad_norm": 0.59069578828569, + "learning_rate": 4.7452144372331715e-07, + "loss": 0.49987393617630005, + "step": 561, + "token_acc": 0.8312081956170992 + }, + { + "epoch": 1.6242774566473988, + "grad_norm": 0.5752034924536564, + "learning_rate": 4.7293054570729126e-07, + "loss": 0.5631648302078247, + "step": 562, + "token_acc": 0.8105449311754528 + }, + { + "epoch": 1.6271676300578035, + "grad_norm": 0.48011026987442956, + "learning_rate": 4.7133992249653026e-07, + "loss": 0.6020775437355042, + "step": 563, + "token_acc": 0.80207682093969 + }, + { + "epoch": 1.630057803468208, + "grad_norm": 0.6157896994330491, + "learning_rate": 4.697495902388154e-07, + "loss": 0.5418002009391785, + "step": 564, + "token_acc": 0.8178849600782141 + }, + { + "epoch": 1.6329479768786128, + "grad_norm": 0.5711847053504078, + "learning_rate": 4.681595650789746e-07, + "loss": 0.5428210496902466, + "step": 565, + "token_acc": 0.815186965701749 + }, + { + "epoch": 1.6358381502890174, + "grad_norm": 0.5202306815183112, + "learning_rate": 4.6656986315871815e-07, + "loss": 0.5333169102668762, + "step": 566, + "token_acc": 0.8192019018509085 + }, + { + "epoch": 1.638728323699422, + "grad_norm": 0.5862764371195341, + "learning_rate": 4.649805006164743e-07, + "loss": 0.5256876349449158, + "step": 567, + "token_acc": 0.8224795998947091 + }, + { + "epoch": 1.6416184971098264, + "grad_norm": 0.5972850501922398, + "learning_rate": 4.6339149358722675e-07, + "loss": 0.4838550388813019, + "step": 568, + "token_acc": 0.8348972296693477 + }, + { + "epoch": 1.6445086705202312, + "grad_norm": 0.5597928387418396, + "learning_rate": 4.618028582023495e-07, + "loss": 0.5284090042114258, + "step": 569, + "token_acc": 0.8216369128482156 + }, + { + "epoch": 1.647398843930636, + "grad_norm": 0.6008687154199086, + "learning_rate": 4.6021461058944415e-07, + "loss": 0.5147076845169067, + "step": 570, + "token_acc": 0.8275472384008092 + }, + { + "epoch": 1.6502890173410405, + "grad_norm": 0.6575913400532123, + "learning_rate": 4.5862676687217526e-07, + "loss": 0.5117477178573608, + "step": 571, + "token_acc": 0.8287706152259228 + }, + { + "epoch": 1.653179190751445, + "grad_norm": 0.5137586329958652, + "learning_rate": 4.5703934317010727e-07, + "loss": 0.5332241058349609, + "step": 572, + "token_acc": 0.8202151610509888 + }, + { + "epoch": 1.6560693641618496, + "grad_norm": 0.565500132263929, + "learning_rate": 4.5545235559854105e-07, + "loss": 0.5527046918869019, + "step": 573, + "token_acc": 0.8138320979141528 + }, + { + "epoch": 1.6589595375722543, + "grad_norm": 0.5302962565332909, + "learning_rate": 4.5386582026834904e-07, + "loss": 0.5092106461524963, + "step": 574, + "token_acc": 0.8281128993919504 + }, + { + "epoch": 1.661849710982659, + "grad_norm": 0.5821742123016643, + "learning_rate": 4.5227975328581335e-07, + "loss": 0.5064735412597656, + "step": 575, + "token_acc": 0.827575659879804 + }, + { + "epoch": 1.6647398843930636, + "grad_norm": 0.5963479290796274, + "learning_rate": 4.5069417075246146e-07, + "loss": 0.4928985834121704, + "step": 576, + "token_acc": 0.8335413266775463 + }, + { + "epoch": 1.6676300578034682, + "grad_norm": 0.6048528428075496, + "learning_rate": 4.491090887649024e-07, + "loss": 0.49480709433555603, + "step": 577, + "token_acc": 0.8347347057118005 + }, + { + "epoch": 1.6705202312138727, + "grad_norm": 0.6285946360216301, + "learning_rate": 4.475245234146639e-07, + "loss": 0.49079689383506775, + "step": 578, + "token_acc": 0.83443186255369 + }, + { + "epoch": 1.6734104046242775, + "grad_norm": 0.5603272652152215, + "learning_rate": 4.459404907880292e-07, + "loss": 0.5334948897361755, + "step": 579, + "token_acc": 0.8186869024041065 + }, + { + "epoch": 1.6763005780346822, + "grad_norm": 0.5366750310588114, + "learning_rate": 4.443570069658727e-07, + "loss": 0.5434994101524353, + "step": 580, + "token_acc": 0.816468327847366 + }, + { + "epoch": 1.6791907514450868, + "grad_norm": 0.5467060355475981, + "learning_rate": 4.42774088023498e-07, + "loss": 0.5757695436477661, + "step": 581, + "token_acc": 0.8080333034841515 + }, + { + "epoch": 1.6820809248554913, + "grad_norm": 0.6184966009398549, + "learning_rate": 4.4119175003047407e-07, + "loss": 0.5647035241127014, + "step": 582, + "token_acc": 0.8111076384093734 + }, + { + "epoch": 1.6849710982658959, + "grad_norm": 0.5185867079907565, + "learning_rate": 4.396100090504717e-07, + "loss": 0.5796575546264648, + "step": 583, + "token_acc": 0.8038202807075824 + }, + { + "epoch": 1.6878612716763006, + "grad_norm": 0.813643580955912, + "learning_rate": 4.380288811411015e-07, + "loss": 0.4743460416793823, + "step": 584, + "token_acc": 0.8386408207372227 + }, + { + "epoch": 1.6907514450867052, + "grad_norm": 0.5897820309260559, + "learning_rate": 4.364483823537498e-07, + "loss": 0.5133877992630005, + "step": 585, + "token_acc": 0.8280596690740123 + }, + { + "epoch": 1.69364161849711, + "grad_norm": 0.5045181308055782, + "learning_rate": 4.3486852873341675e-07, + "loss": 0.4322221279144287, + "step": 586, + "token_acc": 0.8542273580630543 + }, + { + "epoch": 1.6965317919075145, + "grad_norm": 0.5368324019397285, + "learning_rate": 4.3328933631855195e-07, + "loss": 0.5392330884933472, + "step": 587, + "token_acc": 0.8167310479753804 + }, + { + "epoch": 1.699421965317919, + "grad_norm": 0.6325523087901944, + "learning_rate": 4.317108211408933e-07, + "loss": 0.5353363752365112, + "step": 588, + "token_acc": 0.8181194354468216 + }, + { + "epoch": 1.7023121387283235, + "grad_norm": 0.5524128184191415, + "learning_rate": 4.301329992253034e-07, + "loss": 0.49616819620132446, + "step": 589, + "token_acc": 0.8328951746002753 + }, + { + "epoch": 1.7052023121387283, + "grad_norm": 0.5034001899067154, + "learning_rate": 4.285558865896065e-07, + "loss": 0.60711270570755, + "step": 590, + "token_acc": 0.79853336934882 + }, + { + "epoch": 1.708092485549133, + "grad_norm": 0.5374954529356852, + "learning_rate": 4.2697949924442667e-07, + "loss": 0.5293912291526794, + "step": 591, + "token_acc": 0.823666171683991 + }, + { + "epoch": 1.7109826589595376, + "grad_norm": 0.5635901606786159, + "learning_rate": 4.2540385319302524e-07, + "loss": 0.5353492498397827, + "step": 592, + "token_acc": 0.8201790482173709 + }, + { + "epoch": 1.7138728323699421, + "grad_norm": 0.5253802438717141, + "learning_rate": 4.2382896443113723e-07, + "loss": 0.5334903001785278, + "step": 593, + "token_acc": 0.818032814303156 + }, + { + "epoch": 1.7167630057803467, + "grad_norm": 0.4950360437778214, + "learning_rate": 4.222548489468105e-07, + "loss": 0.5341077446937561, + "step": 594, + "token_acc": 0.8223698601883738 + }, + { + "epoch": 1.7196531791907514, + "grad_norm": 0.5514023397940045, + "learning_rate": 4.2068152272024233e-07, + "loss": 0.5363609194755554, + "step": 595, + "token_acc": 0.8196168676738834 + }, + { + "epoch": 1.7225433526011562, + "grad_norm": 0.5623269464968738, + "learning_rate": 4.1910900172361763e-07, + "loss": 0.5504116415977478, + "step": 596, + "token_acc": 0.8151576025420944 + }, + { + "epoch": 1.7254335260115607, + "grad_norm": 0.5274551240137945, + "learning_rate": 4.175373019209468e-07, + "loss": 0.5549143552780151, + "step": 597, + "token_acc": 0.8107931600579981 + }, + { + "epoch": 1.7283236994219653, + "grad_norm": 0.5704477484512106, + "learning_rate": 4.159664392679038e-07, + "loss": 0.5494258403778076, + "step": 598, + "token_acc": 0.8168460618486246 + }, + { + "epoch": 1.7312138728323698, + "grad_norm": 0.6161778636830428, + "learning_rate": 4.143964297116633e-07, + "loss": 0.5577751994132996, + "step": 599, + "token_acc": 0.8121810843728358 + }, + { + "epoch": 1.7341040462427746, + "grad_norm": 0.6075742333688984, + "learning_rate": 4.1282728919074005e-07, + "loss": 0.5403814315795898, + "step": 600, + "token_acc": 0.821105101452986 + }, + { + "epoch": 1.7369942196531793, + "grad_norm": 0.6520533036933062, + "learning_rate": 4.1125903363482634e-07, + "loss": 0.47892940044403076, + "step": 601, + "token_acc": 0.8369930163846361 + }, + { + "epoch": 1.739884393063584, + "grad_norm": 0.5680876440782588, + "learning_rate": 4.0969167896463046e-07, + "loss": 0.5336910486221313, + "step": 602, + "token_acc": 0.8216713342322719 + }, + { + "epoch": 1.7427745664739884, + "grad_norm": 0.7080634828510891, + "learning_rate": 4.0812524109171475e-07, + "loss": 0.524694561958313, + "step": 603, + "token_acc": 0.8261413383364603 + }, + { + "epoch": 1.745664739884393, + "grad_norm": 0.528594204710658, + "learning_rate": 4.0655973591833475e-07, + "loss": 0.5086634755134583, + "step": 604, + "token_acc": 0.8286352131054758 + }, + { + "epoch": 1.7485549132947977, + "grad_norm": 0.6260551904964319, + "learning_rate": 4.0499517933727727e-07, + "loss": 0.48479533195495605, + "step": 605, + "token_acc": 0.8348625638530771 + }, + { + "epoch": 1.7514450867052023, + "grad_norm": 0.5425421161730628, + "learning_rate": 4.034315872316987e-07, + "loss": 0.5817371606826782, + "step": 606, + "token_acc": 0.8068743095851797 + }, + { + "epoch": 1.754335260115607, + "grad_norm": 0.5183265889747526, + "learning_rate": 4.018689754749648e-07, + "loss": 0.508335292339325, + "step": 607, + "token_acc": 0.8271757714886951 + }, + { + "epoch": 1.7572254335260116, + "grad_norm": 0.5542866259664111, + "learning_rate": 4.0030735993048884e-07, + "loss": 0.5586389899253845, + "step": 608, + "token_acc": 0.8166898202884842 + }, + { + "epoch": 1.760115606936416, + "grad_norm": 0.5411864859640132, + "learning_rate": 3.987467564515703e-07, + "loss": 0.4601624608039856, + "step": 609, + "token_acc": 0.84508010404543 + }, + { + "epoch": 1.7630057803468207, + "grad_norm": 0.524886018198833, + "learning_rate": 3.971871808812347e-07, + "loss": 0.6006595492362976, + "step": 610, + "token_acc": 0.8011782786885245 + }, + { + "epoch": 1.7658959537572254, + "grad_norm": 0.6317327126827325, + "learning_rate": 3.956286490520724e-07, + "loss": 0.509284496307373, + "step": 611, + "token_acc": 0.8325460029684483 + }, + { + "epoch": 1.7687861271676302, + "grad_norm": 0.5390581631300952, + "learning_rate": 3.9407117678607756e-07, + "loss": 0.4938768744468689, + "step": 612, + "token_acc": 0.8321855607688815 + }, + { + "epoch": 1.7716763005780347, + "grad_norm": 0.6560783619375582, + "learning_rate": 3.9251477989448795e-07, + "loss": 0.517693042755127, + "step": 613, + "token_acc": 0.8247808891627084 + }, + { + "epoch": 1.7745664739884393, + "grad_norm": 0.5602632255167417, + "learning_rate": 3.909594741776246e-07, + "loss": 0.5566587448120117, + "step": 614, + "token_acc": 0.812049268832398 + }, + { + "epoch": 1.7774566473988438, + "grad_norm": 0.5947561408697656, + "learning_rate": 3.8940527542473033e-07, + "loss": 0.5609596967697144, + "step": 615, + "token_acc": 0.8135071333264908 + }, + { + "epoch": 1.7803468208092486, + "grad_norm": 0.5666442289982523, + "learning_rate": 3.8785219941381096e-07, + "loss": 0.5130019187927246, + "step": 616, + "token_acc": 0.8260872845234054 + }, + { + "epoch": 1.7832369942196533, + "grad_norm": 0.5455613722107414, + "learning_rate": 3.8630026191147405e-07, + "loss": 0.5589362978935242, + "step": 617, + "token_acc": 0.812414640315063 + }, + { + "epoch": 1.7861271676300579, + "grad_norm": 0.550217294387885, + "learning_rate": 3.8474947867276943e-07, + "loss": 0.5442770719528198, + "step": 618, + "token_acc": 0.8159889681462442 + }, + { + "epoch": 1.7890173410404624, + "grad_norm": 0.6147473096977814, + "learning_rate": 3.8319986544102843e-07, + "loss": 0.5019974708557129, + "step": 619, + "token_acc": 0.8287660341354818 + }, + { + "epoch": 1.791907514450867, + "grad_norm": 0.5247209374319454, + "learning_rate": 3.8165143794770536e-07, + "loss": 0.5381553769111633, + "step": 620, + "token_acc": 0.8177024482109227 + }, + { + "epoch": 1.7947976878612717, + "grad_norm": 0.5828193451002669, + "learning_rate": 3.8010421191221684e-07, + "loss": 0.523591160774231, + "step": 621, + "token_acc": 0.8240329148286393 + }, + { + "epoch": 1.7976878612716765, + "grad_norm": 0.6015955817395803, + "learning_rate": 3.78558203041782e-07, + "loss": 0.539184033870697, + "step": 622, + "token_acc": 0.8198696606927818 + }, + { + "epoch": 1.800578034682081, + "grad_norm": 0.6008612726420935, + "learning_rate": 3.7701342703126394e-07, + "loss": 0.48327842354774475, + "step": 623, + "token_acc": 0.8381134839691216 + }, + { + "epoch": 1.8034682080924855, + "grad_norm": 0.6147376285603221, + "learning_rate": 3.754698995630101e-07, + "loss": 0.5317155122756958, + "step": 624, + "token_acc": 0.8217411222039681 + }, + { + "epoch": 1.80635838150289, + "grad_norm": 0.6052477258361706, + "learning_rate": 3.7392763630669243e-07, + "loss": 0.5276878476142883, + "step": 625, + "token_acc": 0.8253162139403252 + }, + { + "epoch": 1.8092485549132948, + "grad_norm": 0.6010435836572232, + "learning_rate": 3.7238665291914906e-07, + "loss": 0.5263775587081909, + "step": 626, + "token_acc": 0.8255283062505889 + }, + { + "epoch": 1.8121387283236994, + "grad_norm": 0.46459212133429395, + "learning_rate": 3.7084696504422525e-07, + "loss": 0.547301173210144, + "step": 627, + "token_acc": 0.8155224935354174 + }, + { + "epoch": 1.8150289017341041, + "grad_norm": 0.567681963556663, + "learning_rate": 3.693085883126137e-07, + "loss": 0.504138708114624, + "step": 628, + "token_acc": 0.8300083822296731 + }, + { + "epoch": 1.8179190751445087, + "grad_norm": 0.5584446222303159, + "learning_rate": 3.6777153834169726e-07, + "loss": 0.5485329031944275, + "step": 629, + "token_acc": 0.8132374537904492 + }, + { + "epoch": 1.8208092485549132, + "grad_norm": 0.5610791187838037, + "learning_rate": 3.6623583073538965e-07, + "loss": 0.5641239881515503, + "step": 630, + "token_acc": 0.8092657184953543 + }, + { + "epoch": 1.8236994219653178, + "grad_norm": 0.5571741993799751, + "learning_rate": 3.647014810839766e-07, + "loss": 0.5435695648193359, + "step": 631, + "token_acc": 0.8177736577401747 + }, + { + "epoch": 1.8265895953757225, + "grad_norm": 0.49451328689884416, + "learning_rate": 3.6316850496395855e-07, + "loss": 0.5079208612442017, + "step": 632, + "token_acc": 0.8277710403419788 + }, + { + "epoch": 1.8294797687861273, + "grad_norm": 0.731312278004029, + "learning_rate": 3.6163691793789183e-07, + "loss": 0.5612790584564209, + "step": 633, + "token_acc": 0.8145309625996321 + }, + { + "epoch": 1.8323699421965318, + "grad_norm": 0.5433070122384833, + "learning_rate": 3.6010673555423116e-07, + "loss": 0.5702831149101257, + "step": 634, + "token_acc": 0.8084171358992268 + }, + { + "epoch": 1.8352601156069364, + "grad_norm": 0.5731111882216399, + "learning_rate": 3.585779733471709e-07, + "loss": 0.5208647847175598, + "step": 635, + "token_acc": 0.8247836812568473 + }, + { + "epoch": 1.838150289017341, + "grad_norm": 0.5863236667781423, + "learning_rate": 3.5705064683648855e-07, + "loss": 0.5619288682937622, + "step": 636, + "token_acc": 0.8113308744654901 + }, + { + "epoch": 1.8410404624277457, + "grad_norm": 0.5914772914689451, + "learning_rate": 3.555247715273867e-07, + "loss": 0.49036872386932373, + "step": 637, + "token_acc": 0.8374078180826161 + }, + { + "epoch": 1.8439306358381504, + "grad_norm": 0.5295217861583622, + "learning_rate": 3.5400036291033485e-07, + "loss": 0.5192829966545105, + "step": 638, + "token_acc": 0.8258416465326863 + }, + { + "epoch": 1.846820809248555, + "grad_norm": 0.5366095434473555, + "learning_rate": 3.5247743646091367e-07, + "loss": 0.48854076862335205, + "step": 639, + "token_acc": 0.8355026160864565 + }, + { + "epoch": 1.8497109826589595, + "grad_norm": 0.552265227323895, + "learning_rate": 3.509560076396567e-07, + "loss": 0.5541850924491882, + "step": 640, + "token_acc": 0.8161763703067251 + }, + { + "epoch": 1.852601156069364, + "grad_norm": 0.5766930712255567, + "learning_rate": 3.4943609189189345e-07, + "loss": 0.49490103125572205, + "step": 641, + "token_acc": 0.8331491368709432 + }, + { + "epoch": 1.8554913294797688, + "grad_norm": 0.535142297976956, + "learning_rate": 3.4791770464759347e-07, + "loss": 0.4898555278778076, + "step": 642, + "token_acc": 0.8374039851247991 + }, + { + "epoch": 1.8583815028901736, + "grad_norm": 0.6183254820329128, + "learning_rate": 3.4640086132120906e-07, + "loss": 0.5269954800605774, + "step": 643, + "token_acc": 0.8234169800850853 + }, + { + "epoch": 1.861271676300578, + "grad_norm": 0.5689322137373185, + "learning_rate": 3.4488557731151845e-07, + "loss": 0.5776628851890564, + "step": 644, + "token_acc": 0.8088350364511105 + }, + { + "epoch": 1.8641618497109826, + "grad_norm": 0.6658391987358445, + "learning_rate": 3.433718680014705e-07, + "loss": 0.5674536228179932, + "step": 645, + "token_acc": 0.8111267784268523 + }, + { + "epoch": 1.8670520231213872, + "grad_norm": 0.5702895217250429, + "learning_rate": 3.418597487580277e-07, + "loss": 0.5942685008049011, + "step": 646, + "token_acc": 0.8022179198440608 + }, + { + "epoch": 1.869942196531792, + "grad_norm": 0.5309534408388851, + "learning_rate": 3.4034923493201007e-07, + "loss": 0.5299490690231323, + "step": 647, + "token_acc": 0.821584668833352 + }, + { + "epoch": 1.8728323699421965, + "grad_norm": 0.5410494679792496, + "learning_rate": 3.388403418579401e-07, + "loss": 0.606309175491333, + "step": 648, + "token_acc": 0.798714223159906 + }, + { + "epoch": 1.8757225433526012, + "grad_norm": 0.5885088182247251, + "learning_rate": 3.3733308485388654e-07, + "loss": 0.5152050256729126, + "step": 649, + "token_acc": 0.8267703435171321 + }, + { + "epoch": 1.8786127167630058, + "grad_norm": 0.5654387308838804, + "learning_rate": 3.3582747922130903e-07, + "loss": 0.5702789425849915, + "step": 650, + "token_acc": 0.8114149857200532 + }, + { + "epoch": 1.8815028901734103, + "grad_norm": 0.5850200396224108, + "learning_rate": 3.343235402449025e-07, + "loss": 0.5715373754501343, + "step": 651, + "token_acc": 0.809812202628705 + }, + { + "epoch": 1.8843930635838149, + "grad_norm": 0.556702805056612, + "learning_rate": 3.3282128319244237e-07, + "loss": 0.5341757535934448, + "step": 652, + "token_acc": 0.8190304033783219 + }, + { + "epoch": 1.8872832369942196, + "grad_norm": 0.5947101357097584, + "learning_rate": 3.313207233146296e-07, + "loss": 0.5120434165000916, + "step": 653, + "token_acc": 0.8284752116658459 + }, + { + "epoch": 1.8901734104046244, + "grad_norm": 0.582059481324802, + "learning_rate": 3.2982187584493516e-07, + "loss": 0.55910724401474, + "step": 654, + "token_acc": 0.8136601394849785 + }, + { + "epoch": 1.893063583815029, + "grad_norm": 0.5455003297751219, + "learning_rate": 3.283247559994463e-07, + "loss": 0.4808557629585266, + "step": 655, + "token_acc": 0.8359401998347231 + }, + { + "epoch": 1.8959537572254335, + "grad_norm": 0.5917330827702398, + "learning_rate": 3.268293789767118e-07, + "loss": 0.5275037288665771, + "step": 656, + "token_acc": 0.8203649654462709 + }, + { + "epoch": 1.898843930635838, + "grad_norm": 0.604537834207858, + "learning_rate": 3.2533575995758694e-07, + "loss": 0.536374568939209, + "step": 657, + "token_acc": 0.8204949969817257 + }, + { + "epoch": 1.9017341040462428, + "grad_norm": 0.4877298329861977, + "learning_rate": 3.2384391410508066e-07, + "loss": 0.5517327785491943, + "step": 658, + "token_acc": 0.8144875608045037 + }, + { + "epoch": 1.9046242774566475, + "grad_norm": 0.5138107466063505, + "learning_rate": 3.223538565642009e-07, + "loss": 0.5936318635940552, + "step": 659, + "token_acc": 0.8033954818487206 + }, + { + "epoch": 1.907514450867052, + "grad_norm": 0.6408117816293808, + "learning_rate": 3.2086560246180016e-07, + "loss": 0.5199168920516968, + "step": 660, + "token_acc": 0.823338105590611 + }, + { + "epoch": 1.9104046242774566, + "grad_norm": 0.6769271622378699, + "learning_rate": 3.1937916690642355e-07, + "loss": 0.5296117067337036, + "step": 661, + "token_acc": 0.8234518795819685 + }, + { + "epoch": 1.9132947976878611, + "grad_norm": 0.5205148500482691, + "learning_rate": 3.178945649881543e-07, + "loss": 0.4881097674369812, + "step": 662, + "token_acc": 0.8381457544657637 + }, + { + "epoch": 1.916184971098266, + "grad_norm": 0.533469943639252, + "learning_rate": 3.1641181177846046e-07, + "loss": 0.5646488666534424, + "step": 663, + "token_acc": 0.8092274601183008 + }, + { + "epoch": 1.9190751445086707, + "grad_norm": 0.5079029266136241, + "learning_rate": 3.1493092233004277e-07, + "loss": 0.565247654914856, + "step": 664, + "token_acc": 0.8091681448977687 + }, + { + "epoch": 1.9219653179190752, + "grad_norm": 0.5846146749149876, + "learning_rate": 3.1345191167668106e-07, + "loss": 0.46707916259765625, + "step": 665, + "token_acc": 0.8448507638926736 + }, + { + "epoch": 1.9248554913294798, + "grad_norm": 0.6115493897752081, + "learning_rate": 3.119747948330821e-07, + "loss": 0.49020782113075256, + "step": 666, + "token_acc": 0.8343801519151217 + }, + { + "epoch": 1.9277456647398843, + "grad_norm": 0.5665579491864339, + "learning_rate": 3.1049958679472645e-07, + "loss": 0.4773547649383545, + "step": 667, + "token_acc": 0.840464602970709 + }, + { + "epoch": 1.930635838150289, + "grad_norm": 0.5428950150023341, + "learning_rate": 3.0902630253771725e-07, + "loss": 0.5331814885139465, + "step": 668, + "token_acc": 0.8203493165709791 + }, + { + "epoch": 1.9335260115606936, + "grad_norm": 0.535673154611531, + "learning_rate": 3.0755495701862785e-07, + "loss": 0.5440840125083923, + "step": 669, + "token_acc": 0.8188541358240693 + }, + { + "epoch": 1.9364161849710984, + "grad_norm": 0.4836434667966126, + "learning_rate": 3.06085565174349e-07, + "loss": 0.5037864446640015, + "step": 670, + "token_acc": 0.8303648820337454 + }, + { + "epoch": 1.939306358381503, + "grad_norm": 0.6272828775317285, + "learning_rate": 3.046181419219386e-07, + "loss": 0.5913348197937012, + "step": 671, + "token_acc": 0.804053529366086 + }, + { + "epoch": 1.9421965317919074, + "grad_norm": 0.47821443556435045, + "learning_rate": 3.031527021584701e-07, + "loss": 0.5496195554733276, + "step": 672, + "token_acc": 0.8131932821607896 + }, + { + "epoch": 1.9450867052023122, + "grad_norm": 0.5368717641927174, + "learning_rate": 3.0168926076087986e-07, + "loss": 0.5248396396636963, + "step": 673, + "token_acc": 0.8238304421235078 + }, + { + "epoch": 1.9479768786127167, + "grad_norm": 0.5546004209488442, + "learning_rate": 3.002278325858177e-07, + "loss": 0.5503116846084595, + "step": 674, + "token_acc": 0.8154341018265293 + }, + { + "epoch": 1.9508670520231215, + "grad_norm": 0.5406553961850177, + "learning_rate": 2.987684324694957e-07, + "loss": 0.5093920230865479, + "step": 675, + "token_acc": 0.8285504848168147 + }, + { + "epoch": 1.953757225433526, + "grad_norm": 0.5070602927484339, + "learning_rate": 2.9731107522753654e-07, + "loss": 0.6153904795646667, + "step": 676, + "token_acc": 0.7934051997463538 + }, + { + "epoch": 1.9566473988439306, + "grad_norm": 0.6200327187024355, + "learning_rate": 2.9585577565482484e-07, + "loss": 0.49602842330932617, + "step": 677, + "token_acc": 0.8349261185482811 + }, + { + "epoch": 1.9595375722543351, + "grad_norm": 0.5432813085052021, + "learning_rate": 2.944025485253557e-07, + "loss": 0.5533842444419861, + "step": 678, + "token_acc": 0.8136697934557625 + }, + { + "epoch": 1.9624277456647399, + "grad_norm": 0.5655183170978749, + "learning_rate": 2.929514085920848e-07, + "loss": 0.5408231019973755, + "step": 679, + "token_acc": 0.8149668765846079 + }, + { + "epoch": 1.9653179190751446, + "grad_norm": 0.5348380476951098, + "learning_rate": 2.915023705867793e-07, + "loss": 0.5112613439559937, + "step": 680, + "token_acc": 0.8288466633304877 + }, + { + "epoch": 1.9682080924855492, + "grad_norm": 0.5587948082197168, + "learning_rate": 2.900554492198677e-07, + "loss": 0.5132273435592651, + "step": 681, + "token_acc": 0.8262983388869136 + }, + { + "epoch": 1.9710982658959537, + "grad_norm": 0.6468264753422917, + "learning_rate": 2.886106591802908e-07, + "loss": 0.49628451466560364, + "step": 682, + "token_acc": 0.8309623989848394 + }, + { + "epoch": 1.9739884393063583, + "grad_norm": 0.8088000703258003, + "learning_rate": 2.871680151353523e-07, + "loss": 0.566349983215332, + "step": 683, + "token_acc": 0.813486073930626 + }, + { + "epoch": 1.976878612716763, + "grad_norm": 0.5639785659667156, + "learning_rate": 2.8572753173057e-07, + "loss": 0.5700632333755493, + "step": 684, + "token_acc": 0.8086862859910506 + }, + { + "epoch": 1.9797687861271678, + "grad_norm": 0.5543121051930197, + "learning_rate": 2.842892235895279e-07, + "loss": 0.5271592140197754, + "step": 685, + "token_acc": 0.8250378942459045 + }, + { + "epoch": 1.9826589595375723, + "grad_norm": 0.5567574729556525, + "learning_rate": 2.828531053137257e-07, + "loss": 0.528691828250885, + "step": 686, + "token_acc": 0.8240472063720813 + }, + { + "epoch": 1.9855491329479769, + "grad_norm": 0.582442051806669, + "learning_rate": 2.814191914824332e-07, + "loss": 0.5287505388259888, + "step": 687, + "token_acc": 0.821006600414202 + }, + { + "epoch": 1.9884393063583814, + "grad_norm": 0.5452501250540314, + "learning_rate": 2.799874966525403e-07, + "loss": 0.5334792733192444, + "step": 688, + "token_acc": 0.8213241825401043 + }, + { + "epoch": 1.9913294797687862, + "grad_norm": 0.5482828728372189, + "learning_rate": 2.785580353584099e-07, + "loss": 0.5632658004760742, + "step": 689, + "token_acc": 0.8116547561426986 + }, + { + "epoch": 1.9942196531791907, + "grad_norm": 0.633529877080459, + "learning_rate": 2.771308221117309e-07, + "loss": 0.516349196434021, + "step": 690, + "token_acc": 0.8251189141964578 + }, + { + "epoch": 1.9971098265895955, + "grad_norm": 0.5330351124089759, + "learning_rate": 2.757058714013697e-07, + "loss": 0.5631735324859619, + "step": 691, + "token_acc": 0.8110226467289205 + }, + { + "epoch": 2.0, + "grad_norm": 0.5696997466472099, + "learning_rate": 2.7428319769322415e-07, + "loss": 0.5440479516983032, + "step": 692, + "token_acc": 0.8158318122461348 + }, + { + "epoch": 2.0028901734104045, + "grad_norm": 0.5585685445254689, + "learning_rate": 2.7286281543007597e-07, + "loss": 0.5391400456428528, + "step": 693, + "token_acc": 0.8175343274767459 + }, + { + "epoch": 2.005780346820809, + "grad_norm": 0.4706256621473158, + "learning_rate": 2.714447390314449e-07, + "loss": 0.5360602140426636, + "step": 694, + "token_acc": 0.8195729923051913 + }, + { + "epoch": 2.008670520231214, + "grad_norm": 0.4975918712102163, + "learning_rate": 2.700289828934416e-07, + "loss": 0.5223442316055298, + "step": 695, + "token_acc": 0.8266022386843656 + }, + { + "epoch": 2.0115606936416186, + "grad_norm": 0.6855664652178536, + "learning_rate": 2.686155613886215e-07, + "loss": 0.5413398146629333, + "step": 696, + "token_acc": 0.8206837181461728 + }, + { + "epoch": 2.014450867052023, + "grad_norm": 0.48324739879314504, + "learning_rate": 2.672044888658399e-07, + "loss": 0.5646222829818726, + "step": 697, + "token_acc": 0.8079876543209876 + }, + { + "epoch": 2.0173410404624277, + "grad_norm": 0.5416524165161476, + "learning_rate": 2.65795779650105e-07, + "loss": 0.5677503347396851, + "step": 698, + "token_acc": 0.8107366402887164 + }, + { + "epoch": 2.020231213872832, + "grad_norm": 0.5180032228711846, + "learning_rate": 2.64389448042433e-07, + "loss": 0.5446953773498535, + "step": 699, + "token_acc": 0.8148853386782998 + }, + { + "epoch": 2.023121387283237, + "grad_norm": 0.5242926098982621, + "learning_rate": 2.6298550831970307e-07, + "loss": 0.5251763463020325, + "step": 700, + "token_acc": 0.8224519443333264 + }, + { + "epoch": 2.0260115606936417, + "grad_norm": 0.52590432100961, + "learning_rate": 2.615839747345127e-07, + "loss": 0.5811551809310913, + "step": 701, + "token_acc": 0.8070368200019533 + }, + { + "epoch": 2.0289017341040463, + "grad_norm": 0.5346477392780163, + "learning_rate": 2.6018486151503213e-07, + "loss": 0.5263258218765259, + "step": 702, + "token_acc": 0.8226229312836096 + }, + { + "epoch": 2.031791907514451, + "grad_norm": 0.6702369614403866, + "learning_rate": 2.5878818286486026e-07, + "loss": 0.4835773706436157, + "step": 703, + "token_acc": 0.8352293317787196 + }, + { + "epoch": 2.0346820809248554, + "grad_norm": 0.5810005206971598, + "learning_rate": 2.573939529628816e-07, + "loss": 0.5316369533538818, + "step": 704, + "token_acc": 0.8213102951763859 + }, + { + "epoch": 2.03757225433526, + "grad_norm": 0.5814408850367526, + "learning_rate": 2.560021859631196e-07, + "loss": 0.531090259552002, + "step": 705, + "token_acc": 0.8247005161281525 + }, + { + "epoch": 2.040462427745665, + "grad_norm": 0.5620278975131617, + "learning_rate": 2.5461289599459646e-07, + "loss": 0.4695814847946167, + "step": 706, + "token_acc": 0.8385467145834584 + }, + { + "epoch": 2.0433526011560694, + "grad_norm": 0.5109837854766828, + "learning_rate": 2.532260971611867e-07, + "loss": 0.5594449043273926, + "step": 707, + "token_acc": 0.8109966953664819 + }, + { + "epoch": 2.046242774566474, + "grad_norm": 0.5657246379091214, + "learning_rate": 2.5184180354147554e-07, + "loss": 0.520602285861969, + "step": 708, + "token_acc": 0.8247487538513655 + }, + { + "epoch": 2.0491329479768785, + "grad_norm": 0.4918673470663886, + "learning_rate": 2.5046002918861606e-07, + "loss": 0.5579814910888672, + "step": 709, + "token_acc": 0.8135782994649099 + }, + { + "epoch": 2.052023121387283, + "grad_norm": 0.48477796977022586, + "learning_rate": 2.490807881301855e-07, + "loss": 0.5919597744941711, + "step": 710, + "token_acc": 0.8019583967529172 + }, + { + "epoch": 2.054913294797688, + "grad_norm": 0.6496075635378676, + "learning_rate": 2.477040943680436e-07, + "loss": 0.48429036140441895, + "step": 711, + "token_acc": 0.8355824403733149 + }, + { + "epoch": 2.0578034682080926, + "grad_norm": 0.5519540209458493, + "learning_rate": 2.4632996187819034e-07, + "loss": 0.506065309047699, + "step": 712, + "token_acc": 0.8278258846453057 + }, + { + "epoch": 2.060693641618497, + "grad_norm": 0.5287310217228682, + "learning_rate": 2.4495840461062433e-07, + "loss": 0.5793042778968811, + "step": 713, + "token_acc": 0.8061971483241775 + }, + { + "epoch": 2.0635838150289016, + "grad_norm": 0.5904419866749646, + "learning_rate": 2.435894364892005e-07, + "loss": 0.573466420173645, + "step": 714, + "token_acc": 0.8098105997674032 + }, + { + "epoch": 2.066473988439306, + "grad_norm": 0.6225416912989975, + "learning_rate": 2.4222307141148906e-07, + "loss": 0.48143109679222107, + "step": 715, + "token_acc": 0.836179983151357 + }, + { + "epoch": 2.069364161849711, + "grad_norm": 0.5109219477999456, + "learning_rate": 2.4085932324863507e-07, + "loss": 0.544453501701355, + "step": 716, + "token_acc": 0.8168550972356652 + }, + { + "epoch": 2.0722543352601157, + "grad_norm": 0.544868652560984, + "learning_rate": 2.394982058452165e-07, + "loss": 0.550638735294342, + "step": 717, + "token_acc": 0.813385770281816 + }, + { + "epoch": 2.0751445086705202, + "grad_norm": 0.5334855839219953, + "learning_rate": 2.3813973301910427e-07, + "loss": 0.484441876411438, + "step": 718, + "token_acc": 0.8346531540424537 + }, + { + "epoch": 2.078034682080925, + "grad_norm": 0.5494544655057828, + "learning_rate": 2.3678391856132202e-07, + "loss": 0.5680737495422363, + "step": 719, + "token_acc": 0.8124086743334372 + }, + { + "epoch": 2.0809248554913293, + "grad_norm": 0.6045748429466216, + "learning_rate": 2.3543077623590635e-07, + "loss": 0.5128438472747803, + "step": 720, + "token_acc": 0.8279022575462924 + }, + { + "epoch": 2.0838150289017343, + "grad_norm": 0.48256069429990633, + "learning_rate": 2.3408031977976623e-07, + "loss": 0.5861136317253113, + "step": 721, + "token_acc": 0.8029797322959706 + }, + { + "epoch": 2.086705202312139, + "grad_norm": 0.5653447327029175, + "learning_rate": 2.3273256290254402e-07, + "loss": 0.537794828414917, + "step": 722, + "token_acc": 0.8187106929644486 + }, + { + "epoch": 2.0895953757225434, + "grad_norm": 0.511608140122125, + "learning_rate": 2.3138751928647727e-07, + "loss": 0.5536022782325745, + "step": 723, + "token_acc": 0.8143630972354428 + }, + { + "epoch": 2.092485549132948, + "grad_norm": 0.6461334504435571, + "learning_rate": 2.3004520258625737e-07, + "loss": 0.547166645526886, + "step": 724, + "token_acc": 0.8144167909990558 + }, + { + "epoch": 2.0953757225433525, + "grad_norm": 0.5280363246093879, + "learning_rate": 2.2870562642889392e-07, + "loss": 0.5407837629318237, + "step": 725, + "token_acc": 0.81717697615801 + }, + { + "epoch": 2.098265895953757, + "grad_norm": 0.5895491785859862, + "learning_rate": 2.2736880441357398e-07, + "loss": 0.5352712273597717, + "step": 726, + "token_acc": 0.8206253892344479 + }, + { + "epoch": 2.101156069364162, + "grad_norm": 0.510490807616544, + "learning_rate": 2.2603475011152517e-07, + "loss": 0.5849488973617554, + "step": 727, + "token_acc": 0.8032212807794704 + }, + { + "epoch": 2.1040462427745665, + "grad_norm": 0.5074478903676131, + "learning_rate": 2.247034770658781e-07, + "loss": 0.5740774869918823, + "step": 728, + "token_acc": 0.8094154108581142 + }, + { + "epoch": 2.106936416184971, + "grad_norm": 0.49465264402350506, + "learning_rate": 2.2337499879152772e-07, + "loss": 0.5517815351486206, + "step": 729, + "token_acc": 0.8150811818935997 + }, + { + "epoch": 2.1098265895953756, + "grad_norm": 0.5409252325098711, + "learning_rate": 2.2204932877499778e-07, + "loss": 0.5680674314498901, + "step": 730, + "token_acc": 0.8076237225087722 + }, + { + "epoch": 2.11271676300578, + "grad_norm": 0.5667599272734437, + "learning_rate": 2.2072648047430182e-07, + "loss": 0.546800971031189, + "step": 731, + "token_acc": 0.8193202586524828 + }, + { + "epoch": 2.115606936416185, + "grad_norm": 0.5820288457006244, + "learning_rate": 2.1940646731880885e-07, + "loss": 0.5512528419494629, + "step": 732, + "token_acc": 0.8157494966528321 + }, + { + "epoch": 2.1184971098265897, + "grad_norm": 0.4949523232866875, + "learning_rate": 2.180893027091052e-07, + "loss": 0.5347863435745239, + "step": 733, + "token_acc": 0.8186724373395966 + }, + { + "epoch": 2.121387283236994, + "grad_norm": 0.5570654028702667, + "learning_rate": 2.1677500001685946e-07, + "loss": 0.5904409289360046, + "step": 734, + "token_acc": 0.80330335262698 + }, + { + "epoch": 2.1242774566473988, + "grad_norm": 0.5169029043729536, + "learning_rate": 2.154635725846861e-07, + "loss": 0.516341507434845, + "step": 735, + "token_acc": 0.8256773697978942 + }, + { + "epoch": 2.1271676300578033, + "grad_norm": 0.5202271523957221, + "learning_rate": 2.1415503372601096e-07, + "loss": 0.5516679286956787, + "step": 736, + "token_acc": 0.8166926940731877 + }, + { + "epoch": 2.1300578034682083, + "grad_norm": 0.5270674995884185, + "learning_rate": 2.1284939672493506e-07, + "loss": 0.5113083124160767, + "step": 737, + "token_acc": 0.8254448999891605 + }, + { + "epoch": 2.132947976878613, + "grad_norm": 0.5738812261029933, + "learning_rate": 2.1154667483609994e-07, + "loss": 0.5508044958114624, + "step": 738, + "token_acc": 0.8145577840874766 + }, + { + "epoch": 2.1358381502890174, + "grad_norm": 0.5552867531342636, + "learning_rate": 2.1024688128455432e-07, + "loss": 0.5606477856636047, + "step": 739, + "token_acc": 0.8107334996977912 + }, + { + "epoch": 2.138728323699422, + "grad_norm": 0.6511169378075016, + "learning_rate": 2.0895002926561733e-07, + "loss": 0.5715325474739075, + "step": 740, + "token_acc": 0.808644395970687 + }, + { + "epoch": 2.1416184971098264, + "grad_norm": 0.5104195470816412, + "learning_rate": 2.0765613194474756e-07, + "loss": 0.5317230224609375, + "step": 741, + "token_acc": 0.8196870394179812 + }, + { + "epoch": 2.1445086705202314, + "grad_norm": 0.5222197914536979, + "learning_rate": 2.0636520245740708e-07, + "loss": 0.581384003162384, + "step": 742, + "token_acc": 0.8044084027512044 + }, + { + "epoch": 2.147398843930636, + "grad_norm": 0.5216435736648604, + "learning_rate": 2.0507725390892895e-07, + "loss": 0.5070130825042725, + "step": 743, + "token_acc": 0.8285304030472848 + }, + { + "epoch": 2.1502890173410405, + "grad_norm": 0.5689993002879171, + "learning_rate": 2.0379229937438475e-07, + "loss": 0.5079813599586487, + "step": 744, + "token_acc": 0.8282544832726795 + }, + { + "epoch": 2.153179190751445, + "grad_norm": 0.5478897581085619, + "learning_rate": 2.0251035189845045e-07, + "loss": 0.5614432692527771, + "step": 745, + "token_acc": 0.8101714880561034 + }, + { + "epoch": 2.1560693641618496, + "grad_norm": 0.5625549603262265, + "learning_rate": 2.012314244952758e-07, + "loss": 0.46915191411972046, + "step": 746, + "token_acc": 0.8398674842185119 + }, + { + "epoch": 2.1589595375722546, + "grad_norm": 0.5888007906160326, + "learning_rate": 1.9995553014834986e-07, + "loss": 0.5621305704116821, + "step": 747, + "token_acc": 0.8091583390025296 + }, + { + "epoch": 2.161849710982659, + "grad_norm": 0.5611702979006163, + "learning_rate": 1.9868268181037184e-07, + "loss": 0.5150927901268005, + "step": 748, + "token_acc": 0.8226671153861205 + }, + { + "epoch": 2.1647398843930636, + "grad_norm": 0.5111806577194473, + "learning_rate": 1.9741289240311754e-07, + "loss": 0.5273150205612183, + "step": 749, + "token_acc": 0.822871650821089 + }, + { + "epoch": 2.167630057803468, + "grad_norm": 0.5196873584862519, + "learning_rate": 1.9614617481730882e-07, + "loss": 0.5140695571899414, + "step": 750, + "token_acc": 0.8273383116061258 + }, + { + "epoch": 2.1705202312138727, + "grad_norm": 0.5735974858092083, + "learning_rate": 1.948825419124837e-07, + "loss": 0.5572013854980469, + "step": 751, + "token_acc": 0.8135551173589466 + }, + { + "epoch": 2.1734104046242773, + "grad_norm": 0.5173068836847717, + "learning_rate": 1.9362200651686406e-07, + "loss": 0.4991053640842438, + "step": 752, + "token_acc": 0.8299385295624275 + }, + { + "epoch": 2.1763005780346822, + "grad_norm": 0.5835529062955169, + "learning_rate": 1.9236458142722672e-07, + "loss": 0.4967957139015198, + "step": 753, + "token_acc": 0.8307953955965303 + }, + { + "epoch": 2.179190751445087, + "grad_norm": 0.5877111733686488, + "learning_rate": 1.9111027940877283e-07, + "loss": 0.5488715767860413, + "step": 754, + "token_acc": 0.8119714508486775 + }, + { + "epoch": 2.1820809248554913, + "grad_norm": 0.5937906866706819, + "learning_rate": 1.898591131949992e-07, + "loss": 0.5290513038635254, + "step": 755, + "token_acc": 0.8182620202911337 + }, + { + "epoch": 2.184971098265896, + "grad_norm": 0.5973610860546952, + "learning_rate": 1.8861109548756764e-07, + "loss": 0.5482075810432434, + "step": 756, + "token_acc": 0.8168008865903214 + }, + { + "epoch": 2.1878612716763004, + "grad_norm": 0.6092890006866195, + "learning_rate": 1.873662389561771e-07, + "loss": 0.5488214492797852, + "step": 757, + "token_acc": 0.8205397467749234 + }, + { + "epoch": 2.1907514450867054, + "grad_norm": 0.5100060557982842, + "learning_rate": 1.861245562384351e-07, + "loss": 0.5582944750785828, + "step": 758, + "token_acc": 0.8142653999590552 + }, + { + "epoch": 2.19364161849711, + "grad_norm": 0.5534172002173429, + "learning_rate": 1.8488605993972806e-07, + "loss": 0.5284197926521301, + "step": 759, + "token_acc": 0.8226439546852772 + }, + { + "epoch": 2.1965317919075145, + "grad_norm": 0.5676418034969823, + "learning_rate": 1.8365076263309542e-07, + "loss": 0.5176257491111755, + "step": 760, + "token_acc": 0.8240463351308168 + }, + { + "epoch": 2.199421965317919, + "grad_norm": 0.5273849733875124, + "learning_rate": 1.8241867685910007e-07, + "loss": 0.5415469408035278, + "step": 761, + "token_acc": 0.8159108203203757 + }, + { + "epoch": 2.2023121387283235, + "grad_norm": 0.5675178250606417, + "learning_rate": 1.8118981512570254e-07, + "loss": 0.495791494846344, + "step": 762, + "token_acc": 0.833165862256412 + }, + { + "epoch": 2.2052023121387285, + "grad_norm": 0.5356879254901209, + "learning_rate": 1.7996418990813293e-07, + "loss": 0.5700979828834534, + "step": 763, + "token_acc": 0.8082553122201417 + }, + { + "epoch": 2.208092485549133, + "grad_norm": 0.5440506283017456, + "learning_rate": 1.7874181364876462e-07, + "loss": 0.5215957164764404, + "step": 764, + "token_acc": 0.8242129054849903 + }, + { + "epoch": 2.2109826589595376, + "grad_norm": 0.48724727796349754, + "learning_rate": 1.7752269875698872e-07, + "loss": 0.48275503516197205, + "step": 765, + "token_acc": 0.8372185670308444 + }, + { + "epoch": 2.213872832369942, + "grad_norm": 0.6530933074612743, + "learning_rate": 1.763068576090862e-07, + "loss": 0.5122123956680298, + "step": 766, + "token_acc": 0.8289117165401221 + }, + { + "epoch": 2.2167630057803467, + "grad_norm": 0.5132130783753541, + "learning_rate": 1.750943025481046e-07, + "loss": 0.5450626611709595, + "step": 767, + "token_acc": 0.8163703808809519 + }, + { + "epoch": 2.2196531791907512, + "grad_norm": 0.5763340107528144, + "learning_rate": 1.73885045883731e-07, + "loss": 0.5134228467941284, + "step": 768, + "token_acc": 0.8268736586467864 + }, + { + "epoch": 2.222543352601156, + "grad_norm": 0.5678033281126066, + "learning_rate": 1.726790998921675e-07, + "loss": 0.5369815826416016, + "step": 769, + "token_acc": 0.8197942785502621 + }, + { + "epoch": 2.2254335260115607, + "grad_norm": 0.5494081888054269, + "learning_rate": 1.7147647681600735e-07, + "loss": 0.583419144153595, + "step": 770, + "token_acc": 0.8045412637492227 + }, + { + "epoch": 2.2283236994219653, + "grad_norm": 0.5002570926978792, + "learning_rate": 1.7027718886410948e-07, + "loss": 0.5762687921524048, + "step": 771, + "token_acc": 0.8050788141720897 + }, + { + "epoch": 2.23121387283237, + "grad_norm": 0.5621625282852232, + "learning_rate": 1.6908124821147517e-07, + "loss": 0.5734193325042725, + "step": 772, + "token_acc": 0.8072726721307747 + }, + { + "epoch": 2.2341040462427744, + "grad_norm": 0.5805542620358577, + "learning_rate": 1.6788866699912434e-07, + "loss": 0.5245779156684875, + "step": 773, + "token_acc": 0.8224566435530849 + }, + { + "epoch": 2.2369942196531793, + "grad_norm": 0.5784351770858037, + "learning_rate": 1.6669945733397288e-07, + "loss": 0.5163431763648987, + "step": 774, + "token_acc": 0.8234030645429656 + }, + { + "epoch": 2.239884393063584, + "grad_norm": 0.5443607425066719, + "learning_rate": 1.6551363128870866e-07, + "loss": 0.48509231209754944, + "step": 775, + "token_acc": 0.8364400070660744 + }, + { + "epoch": 2.2427745664739884, + "grad_norm": 0.5838705468342498, + "learning_rate": 1.643312009016694e-07, + "loss": 0.5485388040542603, + "step": 776, + "token_acc": 0.814316289454411 + }, + { + "epoch": 2.245664739884393, + "grad_norm": 0.5113123373755981, + "learning_rate": 1.631521781767214e-07, + "loss": 0.5461674928665161, + "step": 777, + "token_acc": 0.8178670064564116 + }, + { + "epoch": 2.2485549132947975, + "grad_norm": 0.5316036267961789, + "learning_rate": 1.6197657508313595e-07, + "loss": 0.5362288951873779, + "step": 778, + "token_acc": 0.8175199117906136 + }, + { + "epoch": 2.2514450867052025, + "grad_norm": 0.6922569927006882, + "learning_rate": 1.608044035554692e-07, + "loss": 0.5441286563873291, + "step": 779, + "token_acc": 0.8158920316612874 + }, + { + "epoch": 2.254335260115607, + "grad_norm": 0.6638081905493092, + "learning_rate": 1.5963567549344026e-07, + "loss": 0.5481600761413574, + "step": 780, + "token_acc": 0.8147708894878706 + }, + { + "epoch": 2.2572254335260116, + "grad_norm": 0.5594541395187226, + "learning_rate": 1.5847040276181113e-07, + "loss": 0.5381879210472107, + "step": 781, + "token_acc": 0.8191574437700821 + }, + { + "epoch": 2.260115606936416, + "grad_norm": 0.6007103186375023, + "learning_rate": 1.5730859719026535e-07, + "loss": 0.537074863910675, + "step": 782, + "token_acc": 0.8190765218606167 + }, + { + "epoch": 2.2630057803468207, + "grad_norm": 0.5565956593496582, + "learning_rate": 1.561502705732883e-07, + "loss": 0.4965110719203949, + "step": 783, + "token_acc": 0.8309357060849598 + }, + { + "epoch": 2.2658959537572256, + "grad_norm": 0.5642893968640419, + "learning_rate": 1.5499543467004812e-07, + "loss": 0.5519629120826721, + "step": 784, + "token_acc": 0.8145803817619548 + }, + { + "epoch": 2.26878612716763, + "grad_norm": 0.6562655659982366, + "learning_rate": 1.538441012042747e-07, + "loss": 0.5342061519622803, + "step": 785, + "token_acc": 0.8214097726480007 + }, + { + "epoch": 2.2716763005780347, + "grad_norm": 0.5502255728162866, + "learning_rate": 1.526962818641428e-07, + "loss": 0.5008838176727295, + "step": 786, + "token_acc": 0.8290141252177352 + }, + { + "epoch": 2.2745664739884393, + "grad_norm": 0.5549954985905744, + "learning_rate": 1.5155198830215144e-07, + "loss": 0.4954628348350525, + "step": 787, + "token_acc": 0.8334000233928208 + }, + { + "epoch": 2.277456647398844, + "grad_norm": 0.6131059587737819, + "learning_rate": 1.5041123213500673e-07, + "loss": 0.5419051647186279, + "step": 788, + "token_acc": 0.8164740751406938 + }, + { + "epoch": 2.2803468208092488, + "grad_norm": 0.6247230822104177, + "learning_rate": 1.4927402494350383e-07, + "loss": 0.5040674805641174, + "step": 789, + "token_acc": 0.8298278970337606 + }, + { + "epoch": 2.2832369942196533, + "grad_norm": 0.5169557886712214, + "learning_rate": 1.4814037827240894e-07, + "loss": 0.4267565608024597, + "step": 790, + "token_acc": 0.85461239288595 + }, + { + "epoch": 2.286127167630058, + "grad_norm": 0.5453091300597913, + "learning_rate": 1.4701030363034244e-07, + "loss": 0.5594276189804077, + "step": 791, + "token_acc": 0.8131839426158908 + }, + { + "epoch": 2.2890173410404624, + "grad_norm": 0.5304410532256004, + "learning_rate": 1.4588381248966185e-07, + "loss": 0.5278592109680176, + "step": 792, + "token_acc": 0.8218627568498552 + }, + { + "epoch": 2.291907514450867, + "grad_norm": 0.6120665191114517, + "learning_rate": 1.4476091628634597e-07, + "loss": 0.575430691242218, + "step": 793, + "token_acc": 0.807088911218437 + }, + { + "epoch": 2.294797687861272, + "grad_norm": 0.5799839527530729, + "learning_rate": 1.4364162641987776e-07, + "loss": 0.5156550407409668, + "step": 794, + "token_acc": 0.8260783412329787 + }, + { + "epoch": 2.2976878612716765, + "grad_norm": 0.5602063299660717, + "learning_rate": 1.425259542531293e-07, + "loss": 0.5343849658966064, + "step": 795, + "token_acc": 0.8199821131979047 + }, + { + "epoch": 2.300578034682081, + "grad_norm": 0.4887450635971321, + "learning_rate": 1.414139111122463e-07, + "loss": 0.5308408141136169, + "step": 796, + "token_acc": 0.8229694371764182 + }, + { + "epoch": 2.3034682080924855, + "grad_norm": 0.4993867501606219, + "learning_rate": 1.4030550828653354e-07, + "loss": 0.5518777966499329, + "step": 797, + "token_acc": 0.8136998348383776 + }, + { + "epoch": 2.30635838150289, + "grad_norm": 0.5067023143157817, + "learning_rate": 1.3920075702833918e-07, + "loss": 0.5633761882781982, + "step": 798, + "token_acc": 0.8110373410357782 + }, + { + "epoch": 2.3092485549132946, + "grad_norm": 0.49845534995334795, + "learning_rate": 1.380996685529413e-07, + "loss": 0.5841176509857178, + "step": 799, + "token_acc": 0.8055892737380623 + }, + { + "epoch": 2.3121387283236996, + "grad_norm": 0.5671598446889555, + "learning_rate": 1.370022540384347e-07, + "loss": 0.5178837180137634, + "step": 800, + "token_acc": 0.8236206769170149 + }, + { + "epoch": 2.315028901734104, + "grad_norm": 0.4945445707298972, + "learning_rate": 1.3590852462561536e-07, + "loss": 0.5855327844619751, + "step": 801, + "token_acc": 0.8038555657047487 + }, + { + "epoch": 2.3179190751445087, + "grad_norm": 0.5806465370535545, + "learning_rate": 1.3481849141786977e-07, + "loss": 0.5570707321166992, + "step": 802, + "token_acc": 0.8127311126755344 + }, + { + "epoch": 2.320809248554913, + "grad_norm": 0.6159090128169195, + "learning_rate": 1.337321654810605e-07, + "loss": 0.510475754737854, + "step": 803, + "token_acc": 0.8252182347235694 + }, + { + "epoch": 2.3236994219653178, + "grad_norm": 0.5376860591208902, + "learning_rate": 1.3264955784341436e-07, + "loss": 0.5326089859008789, + "step": 804, + "token_acc": 0.8201670917441944 + }, + { + "epoch": 2.3265895953757223, + "grad_norm": 0.673299584166168, + "learning_rate": 1.3157067949541108e-07, + "loss": 0.58345627784729, + "step": 805, + "token_acc": 0.8029432260094861 + }, + { + "epoch": 2.3294797687861273, + "grad_norm": 0.5206280305901979, + "learning_rate": 1.304955413896705e-07, + "loss": 0.574557900428772, + "step": 806, + "token_acc": 0.8069745418082558 + }, + { + "epoch": 2.332369942196532, + "grad_norm": 0.5136292360134201, + "learning_rate": 1.294241544408425e-07, + "loss": 0.5320082902908325, + "step": 807, + "token_acc": 0.8200797060551261 + }, + { + "epoch": 2.3352601156069364, + "grad_norm": 0.6862994942563941, + "learning_rate": 1.2835652952549535e-07, + "loss": 0.506873607635498, + "step": 808, + "token_acc": 0.8275425473721735 + }, + { + "epoch": 2.338150289017341, + "grad_norm": 0.512551355029386, + "learning_rate": 1.272926774820063e-07, + "loss": 0.5066085457801819, + "step": 809, + "token_acc": 0.8297983521714544 + }, + { + "epoch": 2.3410404624277454, + "grad_norm": 0.5604007523428769, + "learning_rate": 1.2623260911045032e-07, + "loss": 0.5025891065597534, + "step": 810, + "token_acc": 0.829209325638134 + }, + { + "epoch": 2.3439306358381504, + "grad_norm": 0.5268748443036352, + "learning_rate": 1.251763351724912e-07, + "loss": 0.4720842242240906, + "step": 811, + "token_acc": 0.8390679336697509 + }, + { + "epoch": 2.346820809248555, + "grad_norm": 0.5272184591480457, + "learning_rate": 1.241238663912727e-07, + "loss": 0.5422724485397339, + "step": 812, + "token_acc": 0.8181165262000732 + }, + { + "epoch": 2.3497109826589595, + "grad_norm": 0.6478156561205365, + "learning_rate": 1.2307521345130856e-07, + "loss": 0.4997095465660095, + "step": 813, + "token_acc": 0.83579220127889 + }, + { + "epoch": 2.352601156069364, + "grad_norm": 0.5596818812581189, + "learning_rate": 1.2203038699837482e-07, + "loss": 0.5354875326156616, + "step": 814, + "token_acc": 0.8179522864334984 + }, + { + "epoch": 2.3554913294797686, + "grad_norm": 0.5092123540436737, + "learning_rate": 1.2098939763940146e-07, + "loss": 0.5460278987884521, + "step": 815, + "token_acc": 0.8163918561804444 + }, + { + "epoch": 2.3583815028901736, + "grad_norm": 0.5800331579268285, + "learning_rate": 1.1995225594236535e-07, + "loss": 0.5022585988044739, + "step": 816, + "token_acc": 0.8274375641464249 + }, + { + "epoch": 2.361271676300578, + "grad_norm": 0.5756167659083334, + "learning_rate": 1.1891897243618183e-07, + "loss": 0.5118639469146729, + "step": 817, + "token_acc": 0.8277416762854647 + }, + { + "epoch": 2.3641618497109826, + "grad_norm": 0.7044868964257237, + "learning_rate": 1.1788955761059848e-07, + "loss": 0.5586499571800232, + "step": 818, + "token_acc": 0.8113651781794964 + }, + { + "epoch": 2.367052023121387, + "grad_norm": 0.5795349651059425, + "learning_rate": 1.168640219160893e-07, + "loss": 0.46478456258773804, + "step": 819, + "token_acc": 0.8425433103736172 + }, + { + "epoch": 2.3699421965317917, + "grad_norm": 0.5417472517233258, + "learning_rate": 1.1584237576374672e-07, + "loss": 0.5370988845825195, + "step": 820, + "token_acc": 0.8190044958253051 + }, + { + "epoch": 2.3728323699421967, + "grad_norm": 0.5406033227296971, + "learning_rate": 1.1482462952517819e-07, + "loss": 0.5212105512619019, + "step": 821, + "token_acc": 0.8224046418092507 + }, + { + "epoch": 2.3757225433526012, + "grad_norm": 0.6158759615805948, + "learning_rate": 1.1381079353239915e-07, + "loss": 0.5457302331924438, + "step": 822, + "token_acc": 0.8143862498308296 + }, + { + "epoch": 2.378612716763006, + "grad_norm": 0.5823036775149597, + "learning_rate": 1.1280087807772881e-07, + "loss": 0.5847820043563843, + "step": 823, + "token_acc": 0.8055109662743706 + }, + { + "epoch": 2.3815028901734103, + "grad_norm": 0.5934874612721635, + "learning_rate": 1.1179489341368614e-07, + "loss": 0.527098536491394, + "step": 824, + "token_acc": 0.8198975500818406 + }, + { + "epoch": 2.384393063583815, + "grad_norm": 0.48776844524252105, + "learning_rate": 1.1079284975288456e-07, + "loss": 0.5120328068733215, + "step": 825, + "token_acc": 0.8243783599233836 + }, + { + "epoch": 2.38728323699422, + "grad_norm": 0.6146965565569307, + "learning_rate": 1.097947572679298e-07, + "loss": 0.5407025814056396, + "step": 826, + "token_acc": 0.8166508538899431 + }, + { + "epoch": 2.3901734104046244, + "grad_norm": 0.5334859468151563, + "learning_rate": 1.0880062609131485e-07, + "loss": 0.5002784729003906, + "step": 827, + "token_acc": 0.8304765759384802 + }, + { + "epoch": 2.393063583815029, + "grad_norm": 0.5390442828664261, + "learning_rate": 1.0781046631531887e-07, + "loss": 0.539802074432373, + "step": 828, + "token_acc": 0.8201954263661371 + }, + { + "epoch": 2.3959537572254335, + "grad_norm": 0.5913404588285502, + "learning_rate": 1.0682428799190357e-07, + "loss": 0.5389546155929565, + "step": 829, + "token_acc": 0.8186631949877636 + }, + { + "epoch": 2.398843930635838, + "grad_norm": 0.5442985144352179, + "learning_rate": 1.0584210113261138e-07, + "loss": 0.5016453862190247, + "step": 830, + "token_acc": 0.8323601673886272 + }, + { + "epoch": 2.401734104046243, + "grad_norm": 0.5335838263183578, + "learning_rate": 1.0486391570846447e-07, + "loss": 0.5271462202072144, + "step": 831, + "token_acc": 0.8242358536755963 + }, + { + "epoch": 2.4046242774566475, + "grad_norm": 0.49716550117440406, + "learning_rate": 1.0388974164986247e-07, + "loss": 0.55882728099823, + "step": 832, + "token_acc": 0.8099962892130277 + }, + { + "epoch": 2.407514450867052, + "grad_norm": 0.47857456778328644, + "learning_rate": 1.0291958884648244e-07, + "loss": 0.49896830320358276, + "step": 833, + "token_acc": 0.8291924229963124 + }, + { + "epoch": 2.4104046242774566, + "grad_norm": 0.5097765363216997, + "learning_rate": 1.0195346714717812e-07, + "loss": 0.5477476716041565, + "step": 834, + "token_acc": 0.8156213758444858 + }, + { + "epoch": 2.413294797687861, + "grad_norm": 0.5235000424585246, + "learning_rate": 1.0099138635988024e-07, + "loss": 0.5449202060699463, + "step": 835, + "token_acc": 0.8174131547081592 + }, + { + "epoch": 2.416184971098266, + "grad_norm": 0.5918110484158251, + "learning_rate": 1.0003335625149667e-07, + "loss": 0.47566699981689453, + "step": 836, + "token_acc": 0.8377055807323248 + }, + { + "epoch": 2.4190751445086707, + "grad_norm": 0.5851719068244339, + "learning_rate": 9.907938654781306e-08, + "loss": 0.5465905666351318, + "step": 837, + "token_acc": 0.8147972978299083 + }, + { + "epoch": 2.421965317919075, + "grad_norm": 0.5682204824677508, + "learning_rate": 9.812948693339518e-08, + "loss": 0.5738434791564941, + "step": 838, + "token_acc": 0.8094719444296344 + }, + { + "epoch": 2.4248554913294798, + "grad_norm": 0.49007877801128724, + "learning_rate": 9.718366705148878e-08, + "loss": 0.5543205738067627, + "step": 839, + "token_acc": 0.8132528289037656 + }, + { + "epoch": 2.4277456647398843, + "grad_norm": 0.5842704513292558, + "learning_rate": 9.62419365039237e-08, + "loss": 0.5389681458473206, + "step": 840, + "token_acc": 0.8200700065948241 + }, + { + "epoch": 2.430635838150289, + "grad_norm": 0.5770762126755756, + "learning_rate": 9.530430485101477e-08, + "loss": 0.5231157541275024, + "step": 841, + "token_acc": 0.8205874308194584 + }, + { + "epoch": 2.433526011560694, + "grad_norm": 0.7677432650260306, + "learning_rate": 9.437078161146589e-08, + "loss": 0.48806625604629517, + "step": 842, + "token_acc": 0.8331080698798665 + }, + { + "epoch": 2.4364161849710984, + "grad_norm": 0.644925234497109, + "learning_rate": 9.344137626227266e-08, + "loss": 0.5736875534057617, + "step": 843, + "token_acc": 0.8089128548407091 + }, + { + "epoch": 2.439306358381503, + "grad_norm": 0.7396158526047033, + "learning_rate": 9.251609823862638e-08, + "loss": 0.4797173738479614, + "step": 844, + "token_acc": 0.8373787499437789 + }, + { + "epoch": 2.4421965317919074, + "grad_norm": 0.5468960652000051, + "learning_rate": 9.15949569338188e-08, + "loss": 0.5192615985870361, + "step": 845, + "token_acc": 0.8244522788344224 + }, + { + "epoch": 2.445086705202312, + "grad_norm": 0.5315006428054552, + "learning_rate": 9.067796169914549e-08, + "loss": 0.5097811222076416, + "step": 846, + "token_acc": 0.827042571766035 + }, + { + "epoch": 2.447976878612717, + "grad_norm": 0.7439553982785114, + "learning_rate": 8.976512184381246e-08, + "loss": 0.49079883098602295, + "step": 847, + "token_acc": 0.8330292060799148 + }, + { + "epoch": 2.4508670520231215, + "grad_norm": 0.6047154396535889, + "learning_rate": 8.885644663484049e-08, + "loss": 0.5638853311538696, + "step": 848, + "token_acc": 0.8139317111350264 + }, + { + "epoch": 2.453757225433526, + "grad_norm": 0.5113685852977929, + "learning_rate": 8.795194529697148e-08, + "loss": 0.5080073475837708, + "step": 849, + "token_acc": 0.8294516082294987 + }, + { + "epoch": 2.4566473988439306, + "grad_norm": 0.5784270460360631, + "learning_rate": 8.705162701257501e-08, + "loss": 0.4831171929836273, + "step": 850, + "token_acc": 0.8367839034908794 + }, + { + "epoch": 2.459537572254335, + "grad_norm": 0.8859232576451248, + "learning_rate": 8.615550092155477e-08, + "loss": 0.49585288763046265, + "step": 851, + "token_acc": 0.8318051901511245 + }, + { + "epoch": 2.4624277456647397, + "grad_norm": 0.5397198676813016, + "learning_rate": 8.526357612125573e-08, + "loss": 0.5402971506118774, + "step": 852, + "token_acc": 0.8140772038815954 + }, + { + "epoch": 2.4653179190751446, + "grad_norm": 0.5962698285712602, + "learning_rate": 8.437586166637206e-08, + "loss": 0.4982019066810608, + "step": 853, + "token_acc": 0.8291487495756479 + }, + { + "epoch": 2.468208092485549, + "grad_norm": 0.639088875669763, + "learning_rate": 8.349236656885544e-08, + "loss": 0.5227348804473877, + "step": 854, + "token_acc": 0.8234732997252996 + }, + { + "epoch": 2.4710982658959537, + "grad_norm": 0.5125821343592164, + "learning_rate": 8.261309979782255e-08, + "loss": 0.5540283918380737, + "step": 855, + "token_acc": 0.8137015888618007 + }, + { + "epoch": 2.4739884393063583, + "grad_norm": 0.6336792834178986, + "learning_rate": 8.173807027946528e-08, + "loss": 0.5213714838027954, + "step": 856, + "token_acc": 0.8260184658469347 + }, + { + "epoch": 2.476878612716763, + "grad_norm": 0.741297514751174, + "learning_rate": 8.086728689695921e-08, + "loss": 0.4948037564754486, + "step": 857, + "token_acc": 0.8296993252484727 + }, + { + "epoch": 2.479768786127168, + "grad_norm": 0.5470631077862728, + "learning_rate": 8.000075849037408e-08, + "loss": 0.5469754934310913, + "step": 858, + "token_acc": 0.8164498833341608 + }, + { + "epoch": 2.4826589595375723, + "grad_norm": 0.4864695217391108, + "learning_rate": 7.913849385658333e-08, + "loss": 0.5522366762161255, + "step": 859, + "token_acc": 0.8114838802706048 + }, + { + "epoch": 2.485549132947977, + "grad_norm": 0.6284131013971183, + "learning_rate": 7.828050174917527e-08, + "loss": 0.5867525935173035, + "step": 860, + "token_acc": 0.8053583956414843 + }, + { + "epoch": 2.4884393063583814, + "grad_norm": 0.6601691347825654, + "learning_rate": 7.742679087836462e-08, + "loss": 0.4591352045536041, + "step": 861, + "token_acc": 0.8464259952598495 + }, + { + "epoch": 2.491329479768786, + "grad_norm": 0.5223754803762156, + "learning_rate": 7.657736991090263e-08, + "loss": 0.5479453206062317, + "step": 862, + "token_acc": 0.8136173830420323 + }, + { + "epoch": 2.494219653179191, + "grad_norm": 0.6063178523383044, + "learning_rate": 7.573224746999107e-08, + "loss": 0.4984654486179352, + "step": 863, + "token_acc": 0.8310789771475875 + }, + { + "epoch": 2.4971098265895955, + "grad_norm": 0.5664401315392263, + "learning_rate": 7.4891432135193e-08, + "loss": 0.5375936031341553, + "step": 864, + "token_acc": 0.8193700891772278 + }, + { + "epoch": 2.5, + "grad_norm": 0.5684032151067252, + "learning_rate": 7.405493244234651e-08, + "loss": 0.5382214188575745, + "step": 865, + "token_acc": 0.8159053497942387 + }, + { + "epoch": 2.5028901734104045, + "grad_norm": 1.6304188232278813, + "learning_rate": 7.322275688347818e-08, + "loss": 0.5420823097229004, + "step": 866, + "token_acc": 0.8175298965740142 + }, + { + "epoch": 2.505780346820809, + "grad_norm": 0.5256843006054661, + "learning_rate": 7.239491390671631e-08, + "loss": 0.5603017807006836, + "step": 867, + "token_acc": 0.8130635711477354 + }, + { + "epoch": 2.508670520231214, + "grad_norm": 0.5290906377318529, + "learning_rate": 7.157141191620548e-08, + "loss": 0.4974015951156616, + "step": 868, + "token_acc": 0.8317996586674097 + }, + { + "epoch": 2.5115606936416186, + "grad_norm": 0.5009279956947961, + "learning_rate": 7.075225927202105e-08, + "loss": 0.5346574187278748, + "step": 869, + "token_acc": 0.8163790337713909 + }, + { + "epoch": 2.514450867052023, + "grad_norm": 0.4774847145184863, + "learning_rate": 6.993746429008496e-08, + "loss": 0.5793315768241882, + "step": 870, + "token_acc": 0.8044435794476767 + }, + { + "epoch": 2.5173410404624277, + "grad_norm": 0.579794607346244, + "learning_rate": 6.912703524208019e-08, + "loss": 0.4764576852321625, + "step": 871, + "token_acc": 0.8377503092002259 + }, + { + "epoch": 2.520231213872832, + "grad_norm": 0.5013881127258889, + "learning_rate": 6.832098035536759e-08, + "loss": 0.525843620300293, + "step": 872, + "token_acc": 0.8231466097001345 + }, + { + "epoch": 2.523121387283237, + "grad_norm": 0.48167613678527704, + "learning_rate": 6.751930781290238e-08, + "loss": 0.5380637049674988, + "step": 873, + "token_acc": 0.8183076636731655 + }, + { + "epoch": 2.5260115606936417, + "grad_norm": 0.4540447849829041, + "learning_rate": 6.672202575315044e-08, + "loss": 0.49698758125305176, + "step": 874, + "token_acc": 0.831075612916876 + }, + { + "epoch": 2.5289017341040463, + "grad_norm": 0.6661593346201325, + "learning_rate": 6.59291422700064e-08, + "loss": 0.4850313663482666, + "step": 875, + "token_acc": 0.8362135876193946 + }, + { + "epoch": 2.531791907514451, + "grad_norm": 0.505051966727968, + "learning_rate": 6.514066541271085e-08, + "loss": 0.499431312084198, + "step": 876, + "token_acc": 0.831420351210136 + }, + { + "epoch": 2.5346820809248554, + "grad_norm": 0.5882259006732896, + "learning_rate": 6.435660318576935e-08, + "loss": 0.5504227876663208, + "step": 877, + "token_acc": 0.8158776668803223 + }, + { + "epoch": 2.5375722543352603, + "grad_norm": 0.5391399587353708, + "learning_rate": 6.357696354887049e-08, + "loss": 0.5507422685623169, + "step": 878, + "token_acc": 0.8168785222461945 + }, + { + "epoch": 2.540462427745665, + "grad_norm": 0.5480460384925314, + "learning_rate": 6.28017544168053e-08, + "loss": 0.5473015308380127, + "step": 879, + "token_acc": 0.8178865534976365 + }, + { + "epoch": 2.5433526011560694, + "grad_norm": 0.5389986372049553, + "learning_rate": 6.20309836593873e-08, + "loss": 0.5189315676689148, + "step": 880, + "token_acc": 0.8252666894202909 + }, + { + "epoch": 2.546242774566474, + "grad_norm": 0.5707417078989917, + "learning_rate": 6.126465910137163e-08, + "loss": 0.5234180092811584, + "step": 881, + "token_acc": 0.8232250912282323 + }, + { + "epoch": 2.5491329479768785, + "grad_norm": 0.5632951051957191, + "learning_rate": 6.0502788522377e-08, + "loss": 0.5196454524993896, + "step": 882, + "token_acc": 0.8240517651811349 + }, + { + "epoch": 2.5520231213872835, + "grad_norm": 0.5312909361373286, + "learning_rate": 5.974537965680537e-08, + "loss": 0.5485826134681702, + "step": 883, + "token_acc": 0.8127245781077416 + }, + { + "epoch": 2.5549132947976876, + "grad_norm": 0.6429627848350591, + "learning_rate": 5.899244019376426e-08, + "loss": 0.5010867714881897, + "step": 884, + "token_acc": 0.8311800993506927 + }, + { + "epoch": 2.5578034682080926, + "grad_norm": 0.5223405882575716, + "learning_rate": 5.824397777698858e-08, + "loss": 0.5297751426696777, + "step": 885, + "token_acc": 0.8206137655553849 + }, + { + "epoch": 2.560693641618497, + "grad_norm": 0.8020502475631341, + "learning_rate": 5.7500000004762574e-08, + "loss": 0.5593537092208862, + "step": 886, + "token_acc": 0.811829619947517 + }, + { + "epoch": 2.5635838150289016, + "grad_norm": 0.6258112537179114, + "learning_rate": 5.676051442984325e-08, + "loss": 0.5434359908103943, + "step": 887, + "token_acc": 0.8160674580340842 + }, + { + "epoch": 2.5664739884393066, + "grad_norm": 0.5482233640675082, + "learning_rate": 5.602552855938325e-08, + "loss": 0.5392587780952454, + "step": 888, + "token_acc": 0.8183432292939603 + }, + { + "epoch": 2.5693641618497107, + "grad_norm": 0.5339167311609386, + "learning_rate": 5.529504985485528e-08, + "loss": 0.5843528509140015, + "step": 889, + "token_acc": 0.8041726059349488 + }, + { + "epoch": 2.5722543352601157, + "grad_norm": 0.5526129075488465, + "learning_rate": 5.456908573197544e-08, + "loss": 0.4785343408584595, + "step": 890, + "token_acc": 0.8354585097240348 + }, + { + "epoch": 2.5751445086705202, + "grad_norm": 0.5932930782479724, + "learning_rate": 5.384764356062865e-08, + "loss": 0.501940131187439, + "step": 891, + "token_acc": 0.8283741560885075 + }, + { + "epoch": 2.578034682080925, + "grad_norm": 0.5946977220929661, + "learning_rate": 5.313073066479379e-08, + "loss": 0.5379625558853149, + "step": 892, + "token_acc": 0.8177655126778356 + }, + { + "epoch": 2.5809248554913293, + "grad_norm": 0.5663018542099373, + "learning_rate": 5.2418354322468884e-08, + "loss": 0.4645715057849884, + "step": 893, + "token_acc": 0.8437703660317277 + }, + { + "epoch": 2.583815028901734, + "grad_norm": 0.5603090911019164, + "learning_rate": 5.1710521765597593e-08, + "loss": 0.5438505411148071, + "step": 894, + "token_acc": 0.8167114037179182 + }, + { + "epoch": 2.586705202312139, + "grad_norm": 0.5650529942357706, + "learning_rate": 5.100724017999575e-08, + "loss": 0.537551760673523, + "step": 895, + "token_acc": 0.8162509350365383 + }, + { + "epoch": 2.5895953757225434, + "grad_norm": 0.5946617661686765, + "learning_rate": 5.0308516705278525e-08, + "loss": 0.5363532304763794, + "step": 896, + "token_acc": 0.8188319733413082 + }, + { + "epoch": 2.592485549132948, + "grad_norm": 0.529447543384607, + "learning_rate": 4.961435843478751e-08, + "loss": 0.547370195388794, + "step": 897, + "token_acc": 0.8166483874998265 + }, + { + "epoch": 2.5953757225433525, + "grad_norm": 0.5564539974665098, + "learning_rate": 4.892477241551901e-08, + "loss": 0.5567014813423157, + "step": 898, + "token_acc": 0.8142607154390945 + }, + { + "epoch": 2.598265895953757, + "grad_norm": 0.6758226853294469, + "learning_rate": 4.8239765648052985e-08, + "loss": 0.5622668862342834, + "step": 899, + "token_acc": 0.8094786656801085 + }, + { + "epoch": 2.601156069364162, + "grad_norm": 0.6030746534353, + "learning_rate": 4.755934508648057e-08, + "loss": 0.48511946201324463, + "step": 900, + "token_acc": 0.8383746553751593 + }, + { + "epoch": 2.6040462427745665, + "grad_norm": 0.5291224134313559, + "learning_rate": 4.688351763833531e-08, + "loss": 0.5561063289642334, + "step": 901, + "token_acc": 0.811450131453075 + }, + { + "epoch": 2.606936416184971, + "grad_norm": 0.5231587422483082, + "learning_rate": 4.621229016452155e-08, + "loss": 0.585370659828186, + "step": 902, + "token_acc": 0.8056932036025608 + }, + { + "epoch": 2.6098265895953756, + "grad_norm": 1.1223139233293984, + "learning_rate": 4.554566947924537e-08, + "loss": 0.5447970628738403, + "step": 903, + "token_acc": 0.8164786148920761 + }, + { + "epoch": 2.61271676300578, + "grad_norm": 0.5225735759201205, + "learning_rate": 4.4883662349945784e-08, + "loss": 0.5505392551422119, + "step": 904, + "token_acc": 0.8164482180639134 + }, + { + "epoch": 2.615606936416185, + "grad_norm": 0.54473619880049, + "learning_rate": 4.422627549722519e-08, + "loss": 0.5359902381896973, + "step": 905, + "token_acc": 0.820455104729094 + }, + { + "epoch": 2.6184971098265897, + "grad_norm": 0.7561505246031067, + "learning_rate": 4.357351559478201e-08, + "loss": 0.47267240285873413, + "step": 906, + "token_acc": 0.8387789854590445 + }, + { + "epoch": 2.621387283236994, + "grad_norm": 0.5548449336113677, + "learning_rate": 4.2925389269341916e-08, + "loss": 0.5412442684173584, + "step": 907, + "token_acc": 0.8155705621117785 + }, + { + "epoch": 2.6242774566473988, + "grad_norm": 0.7283156817419644, + "learning_rate": 4.228190310059182e-08, + "loss": 0.5299142599105835, + "step": 908, + "token_acc": 0.8230541763009774 + }, + { + "epoch": 2.6271676300578033, + "grad_norm": 0.5365454152037888, + "learning_rate": 4.164306362111208e-08, + "loss": 0.5737514495849609, + "step": 909, + "token_acc": 0.8103234930175004 + }, + { + "epoch": 2.6300578034682083, + "grad_norm": 0.5438553812892487, + "learning_rate": 4.100887731631053e-08, + "loss": 0.5420162677764893, + "step": 910, + "token_acc": 0.8180698387235383 + }, + { + "epoch": 2.632947976878613, + "grad_norm": 0.64070798422041, + "learning_rate": 4.0379350624356766e-08, + "loss": 0.5189142823219299, + "step": 911, + "token_acc": 0.8237202834249387 + }, + { + "epoch": 2.6358381502890174, + "grad_norm": 0.47802319033882207, + "learning_rate": 3.975448993611652e-08, + "loss": 0.5308249592781067, + "step": 912, + "token_acc": 0.8203262576745515 + }, + { + "epoch": 2.638728323699422, + "grad_norm": 0.5724668109330596, + "learning_rate": 3.913430159508696e-08, + "loss": 0.5157672166824341, + "step": 913, + "token_acc": 0.8241608973797213 + }, + { + "epoch": 2.6416184971098264, + "grad_norm": 0.5470703054848514, + "learning_rate": 3.8518791897332204e-08, + "loss": 0.5976561307907104, + "step": 914, + "token_acc": 0.8007923950822223 + }, + { + "epoch": 2.6445086705202314, + "grad_norm": 0.5294401571240512, + "learning_rate": 3.790796709141975e-08, + "loss": 0.5527437925338745, + "step": 915, + "token_acc": 0.8132948131146666 + }, + { + "epoch": 2.647398843930636, + "grad_norm": 0.6321676647074376, + "learning_rate": 3.7301833378356073e-08, + "loss": 0.4902818202972412, + "step": 916, + "token_acc": 0.8343280912033046 + }, + { + "epoch": 2.6502890173410405, + "grad_norm": 0.6734799143444675, + "learning_rate": 3.67003969115251e-08, + "loss": 0.5476257801055908, + "step": 917, + "token_acc": 0.8164087189044648 + }, + { + "epoch": 2.653179190751445, + "grad_norm": 0.4933080483096889, + "learning_rate": 3.610366379662455e-08, + "loss": 0.5034703612327576, + "step": 918, + "token_acc": 0.8296526697770866 + }, + { + "epoch": 2.6560693641618496, + "grad_norm": 0.5701973114157253, + "learning_rate": 3.551164009160429e-08, + "loss": 0.5260199904441833, + "step": 919, + "token_acc": 0.8228647844657014 + }, + { + "epoch": 2.6589595375722546, + "grad_norm": 0.4606917700933646, + "learning_rate": 3.4924331806605314e-08, + "loss": 0.5847440361976624, + "step": 920, + "token_acc": 0.8036149091590186 + }, + { + "epoch": 2.661849710982659, + "grad_norm": 0.5312291603560868, + "learning_rate": 3.4341744903897963e-08, + "loss": 0.5280716419219971, + "step": 921, + "token_acc": 0.8217670827512655 + }, + { + "epoch": 2.6647398843930636, + "grad_norm": 0.5137738686874723, + "learning_rate": 3.376388529782215e-08, + "loss": 0.5434746146202087, + "step": 922, + "token_acc": 0.8166855043797683 + }, + { + "epoch": 2.667630057803468, + "grad_norm": 0.5112438107405131, + "learning_rate": 3.319075885472644e-08, + "loss": 0.4704023599624634, + "step": 923, + "token_acc": 0.8407168549429551 + }, + { + "epoch": 2.6705202312138727, + "grad_norm": 0.5633980375468464, + "learning_rate": 3.262237139290952e-08, + "loss": 0.5437241792678833, + "step": 924, + "token_acc": 0.8174555734488506 + }, + { + "epoch": 2.6734104046242777, + "grad_norm": 0.4789519578675391, + "learning_rate": 3.205872868256021e-08, + "loss": 0.5591274499893188, + "step": 925, + "token_acc": 0.8126648310155333 + }, + { + "epoch": 2.6763005780346822, + "grad_norm": 0.545383577218125, + "learning_rate": 3.149983644569948e-08, + "loss": 0.4846089482307434, + "step": 926, + "token_acc": 0.8357118170559603 + }, + { + "epoch": 2.679190751445087, + "grad_norm": 0.5624813066511716, + "learning_rate": 3.094570035612226e-08, + "loss": 0.5257154703140259, + "step": 927, + "token_acc": 0.8209082215813688 + }, + { + "epoch": 2.6820809248554913, + "grad_norm": 0.5921212603993137, + "learning_rate": 3.0396326039339507e-08, + "loss": 0.5992392897605896, + "step": 928, + "token_acc": 0.7986864607734648 + }, + { + "epoch": 2.684971098265896, + "grad_norm": 0.5498631051018497, + "learning_rate": 2.9851719072521487e-08, + "loss": 0.5509431958198547, + "step": 929, + "token_acc": 0.8177149696899494 + }, + { + "epoch": 2.687861271676301, + "grad_norm": 0.5215571767600914, + "learning_rate": 2.9311884984440873e-08, + "loss": 0.561446487903595, + "step": 930, + "token_acc": 0.8129055922352012 + }, + { + "epoch": 2.690751445086705, + "grad_norm": 0.559786563643402, + "learning_rate": 2.8776829255416967e-08, + "loss": 0.5166699290275574, + "step": 931, + "token_acc": 0.8237840118657938 + }, + { + "epoch": 2.69364161849711, + "grad_norm": 0.5753952050911679, + "learning_rate": 2.8246557317259723e-08, + "loss": 0.5357648134231567, + "step": 932, + "token_acc": 0.8212208495005039 + }, + { + "epoch": 2.6965317919075145, + "grad_norm": 0.5636571499534591, + "learning_rate": 2.7721074553214596e-08, + "loss": 0.5390565395355225, + "step": 933, + "token_acc": 0.8159201695282208 + }, + { + "epoch": 2.699421965317919, + "grad_norm": 0.5407560890645442, + "learning_rate": 2.7200386297908386e-08, + "loss": 0.541710615158081, + "step": 934, + "token_acc": 0.8174959891247107 + }, + { + "epoch": 2.7023121387283235, + "grad_norm": 0.48421827585155863, + "learning_rate": 2.6684497837294208e-08, + "loss": 0.5409998297691345, + "step": 935, + "token_acc": 0.8210280803345742 + }, + { + "epoch": 2.705202312138728, + "grad_norm": 0.49710877088501176, + "learning_rate": 2.6173414408598826e-08, + "loss": 0.5135529637336731, + "step": 936, + "token_acc": 0.8251490888501849 + }, + { + "epoch": 2.708092485549133, + "grad_norm": 0.6329172467067579, + "learning_rate": 2.5667141200268694e-08, + "loss": 0.5547735691070557, + "step": 937, + "token_acc": 0.8145400135743814 + }, + { + "epoch": 2.7109826589595376, + "grad_norm": 0.5576557557006313, + "learning_rate": 2.5165683351917765e-08, + "loss": 0.5579146146774292, + "step": 938, + "token_acc": 0.8112171853454817 + }, + { + "epoch": 2.713872832369942, + "grad_norm": 0.5905103597710084, + "learning_rate": 2.4669045954275046e-08, + "loss": 0.5442934632301331, + "step": 939, + "token_acc": 0.818311620283537 + }, + { + "epoch": 2.7167630057803467, + "grad_norm": 0.6610701567101593, + "learning_rate": 2.4177234049133023e-08, + "loss": 0.49151283502578735, + "step": 940, + "token_acc": 0.8325153415650084 + }, + { + "epoch": 2.7196531791907512, + "grad_norm": 0.6214821823759014, + "learning_rate": 2.369025262929658e-08, + "loss": 0.5725831389427185, + "step": 941, + "token_acc": 0.8070232229912145 + }, + { + "epoch": 2.722543352601156, + "grad_norm": 0.5547499629666095, + "learning_rate": 2.3208106638531842e-08, + "loss": 0.5330009460449219, + "step": 942, + "token_acc": 0.8195172027623966 + }, + { + "epoch": 2.7254335260115607, + "grad_norm": 0.5521438894414953, + "learning_rate": 2.2730800971516862e-08, + "loss": 0.5747419595718384, + "step": 943, + "token_acc": 0.8086665948043549 + }, + { + "epoch": 2.7283236994219653, + "grad_norm": 0.6317779099057246, + "learning_rate": 2.225834047379099e-08, + "loss": 0.49804458022117615, + "step": 944, + "token_acc": 0.8307906934881418 + }, + { + "epoch": 2.73121387283237, + "grad_norm": 0.5560572315857666, + "learning_rate": 2.1790729941706276e-08, + "loss": 0.5384119153022766, + "step": 945, + "token_acc": 0.8186016301942814 + }, + { + "epoch": 2.7341040462427744, + "grad_norm": 0.5706315776877087, + "learning_rate": 2.132797412237869e-08, + "loss": 0.5331531167030334, + "step": 946, + "token_acc": 0.8183284045442989 + }, + { + "epoch": 2.7369942196531793, + "grad_norm": 0.5767818083804982, + "learning_rate": 2.087007771363969e-08, + "loss": 0.5555546879768372, + "step": 947, + "token_acc": 0.8130259084965389 + }, + { + "epoch": 2.739884393063584, + "grad_norm": 0.5074851398256462, + "learning_rate": 2.041704536398875e-08, + "loss": 0.5641285181045532, + "step": 948, + "token_acc": 0.8102424125823674 + }, + { + "epoch": 2.7427745664739884, + "grad_norm": 0.5656737111306388, + "learning_rate": 1.9968881672545957e-08, + "loss": 0.5804109573364258, + "step": 949, + "token_acc": 0.8069046557228511 + }, + { + "epoch": 2.745664739884393, + "grad_norm": 0.5396023274518039, + "learning_rate": 1.9525591189005874e-08, + "loss": 0.5026800632476807, + "step": 950, + "token_acc": 0.8291645642615152 + }, + { + "epoch": 2.7485549132947975, + "grad_norm": 0.5545085068594241, + "learning_rate": 1.9087178413590476e-08, + "loss": 0.5121109485626221, + "step": 951, + "token_acc": 0.829365647193499 + }, + { + "epoch": 2.7514450867052025, + "grad_norm": 0.5744534847489216, + "learning_rate": 1.8653647797004236e-08, + "loss": 0.5073999166488647, + "step": 952, + "token_acc": 0.8286528286528286 + }, + { + "epoch": 2.754335260115607, + "grad_norm": 0.5473570344774414, + "learning_rate": 1.8225003740388545e-08, + "loss": 0.5411463975906372, + "step": 953, + "token_acc": 0.8197644649257553 + }, + { + "epoch": 2.7572254335260116, + "grad_norm": 0.5960870996950273, + "learning_rate": 1.7801250595277095e-08, + "loss": 0.45802488923072815, + "step": 954, + "token_acc": 0.8439128432584406 + }, + { + "epoch": 2.760115606936416, + "grad_norm": 0.5872410848204962, + "learning_rate": 1.738239266355185e-08, + "loss": 0.5364171862602234, + "step": 955, + "token_acc": 0.8192522793328644 + }, + { + "epoch": 2.7630057803468207, + "grad_norm": 0.5452386927866908, + "learning_rate": 1.6968434197399072e-08, + "loss": 0.5837544202804565, + "step": 956, + "token_acc": 0.8051349532888352 + }, + { + "epoch": 2.7658959537572256, + "grad_norm": 0.5752700596867665, + "learning_rate": 1.655937939926655e-08, + "loss": 0.5129964351654053, + "step": 957, + "token_acc": 0.8282252791972994 + }, + { + "epoch": 2.76878612716763, + "grad_norm": 0.5428098765109344, + "learning_rate": 1.6155232421820653e-08, + "loss": 0.5746065378189087, + "step": 958, + "token_acc": 0.8089228223154 + }, + { + "epoch": 2.7716763005780347, + "grad_norm": 0.5949829280630812, + "learning_rate": 1.5755997367904173e-08, + "loss": 0.4916711747646332, + "step": 959, + "token_acc": 0.8342608068069589 + }, + { + "epoch": 2.7745664739884393, + "grad_norm": 0.5674429218313363, + "learning_rate": 1.536167829049495e-08, + "loss": 0.5395721197128296, + "step": 960, + "token_acc": 0.8203693073096058 + }, + { + "epoch": 2.777456647398844, + "grad_norm": 0.561452376268135, + "learning_rate": 1.497227919266414e-08, + "loss": 0.51889967918396, + "step": 961, + "token_acc": 0.8233378239163167 + }, + { + "epoch": 2.7803468208092488, + "grad_norm": 0.6257227381883494, + "learning_rate": 1.4587804027536454e-08, + "loss": 0.5111842155456543, + "step": 962, + "token_acc": 0.8274028303059359 + }, + { + "epoch": 2.7832369942196533, + "grad_norm": 0.5900526631508034, + "learning_rate": 1.420825669824921e-08, + "loss": 0.5204794406890869, + "step": 963, + "token_acc": 0.8234049795759579 + }, + { + "epoch": 2.786127167630058, + "grad_norm": 0.509902068102799, + "learning_rate": 1.3833641057913015e-08, + "loss": 0.47923728823661804, + "step": 964, + "token_acc": 0.8353080111030787 + }, + { + "epoch": 2.7890173410404624, + "grad_norm": 0.5460825106119277, + "learning_rate": 1.346396090957297e-08, + "loss": 0.520375669002533, + "step": 965, + "token_acc": 0.8276919599125914 + }, + { + "epoch": 2.791907514450867, + "grad_norm": 0.5432685057122655, + "learning_rate": 1.309922000616942e-08, + "loss": 0.5795409679412842, + "step": 966, + "token_acc": 0.8071895906398279 + }, + { + "epoch": 2.794797687861272, + "grad_norm": 0.5657536988747344, + "learning_rate": 1.2739422050500436e-08, + "loss": 0.5345174074172974, + "step": 967, + "token_acc": 0.8179120793316155 + }, + { + "epoch": 2.7976878612716765, + "grad_norm": 0.521811401090051, + "learning_rate": 1.2384570695183782e-08, + "loss": 0.5313125252723694, + "step": 968, + "token_acc": 0.8208080793990667 + }, + { + "epoch": 2.800578034682081, + "grad_norm": 0.5951506599748814, + "learning_rate": 1.2034669542620223e-08, + "loss": 0.5154579877853394, + "step": 969, + "token_acc": 0.8274639716414208 + }, + { + "epoch": 2.8034682080924855, + "grad_norm": 0.7493969316675455, + "learning_rate": 1.168972214495667e-08, + "loss": 0.4610113203525543, + "step": 970, + "token_acc": 0.8410565847986298 + }, + { + "epoch": 2.80635838150289, + "grad_norm": 0.6158144745722535, + "learning_rate": 1.1349732004050205e-08, + "loss": 0.5308967232704163, + "step": 971, + "token_acc": 0.823366838754401 + }, + { + "epoch": 2.809248554913295, + "grad_norm": 0.49701991004281837, + "learning_rate": 1.101470257143261e-08, + "loss": 0.5433156490325928, + "step": 972, + "token_acc": 0.8172732427363528 + }, + { + "epoch": 2.812138728323699, + "grad_norm": 0.614964929129747, + "learning_rate": 1.0684637248275175e-08, + "loss": 0.4856722056865692, + "step": 973, + "token_acc": 0.8371653570989119 + }, + { + "epoch": 2.815028901734104, + "grad_norm": 0.5531928817079772, + "learning_rate": 1.0359539385354387e-08, + "loss": 0.5472983121871948, + "step": 974, + "token_acc": 0.8166184194819147 + }, + { + "epoch": 2.8179190751445087, + "grad_norm": 0.6036213061429313, + "learning_rate": 1.0039412283017523e-08, + "loss": 0.5529719591140747, + "step": 975, + "token_acc": 0.8155163061650604 + }, + { + "epoch": 2.820809248554913, + "grad_norm": 0.5564254532918392, + "learning_rate": 9.724259191149774e-09, + "loss": 0.4628450572490692, + "step": 976, + "token_acc": 0.8427982220798462 + }, + { + "epoch": 2.8236994219653178, + "grad_norm": 0.5588830748507647, + "learning_rate": 9.414083309140453e-09, + "loss": 0.5567787289619446, + "step": 977, + "token_acc": 0.8121751346288926 + }, + { + "epoch": 2.8265895953757223, + "grad_norm": 0.5529058564154966, + "learning_rate": 9.108887785851338e-09, + "loss": 0.5580377578735352, + "step": 978, + "token_acc": 0.8109314422108472 + }, + { + "epoch": 2.8294797687861273, + "grad_norm": 0.61646098239251, + "learning_rate": 8.808675719584158e-09, + "loss": 0.5375653505325317, + "step": 979, + "token_acc": 0.8192844783892899 + }, + { + "epoch": 2.832369942196532, + "grad_norm": 0.5248181521879705, + "learning_rate": 8.513450158049106e-09, + "loss": 0.5359894037246704, + "step": 980, + "token_acc": 0.8180794693882546 + }, + { + "epoch": 2.8352601156069364, + "grad_norm": 0.530766621077344, + "learning_rate": 8.22321409833443e-09, + "loss": 0.5032058358192444, + "step": 981, + "token_acc": 0.8299942928720195 + }, + { + "epoch": 2.838150289017341, + "grad_norm": 0.5767728092897907, + "learning_rate": 7.93797048687539e-09, + "loss": 0.555617094039917, + "step": 982, + "token_acc": 0.8127699150828953 + }, + { + "epoch": 2.8410404624277454, + "grad_norm": 0.5275196163844481, + "learning_rate": 7.657722219424789e-09, + "loss": 0.5177302956581116, + "step": 983, + "token_acc": 0.8254756164272545 + }, + { + "epoch": 2.8439306358381504, + "grad_norm": 0.7188190918164308, + "learning_rate": 7.382472141023221e-09, + "loss": 0.5488888025283813, + "step": 984, + "token_acc": 0.8139118457300275 + }, + { + "epoch": 2.846820809248555, + "grad_norm": 0.5053524666497287, + "learning_rate": 7.112223045970589e-09, + "loss": 0.5309122800827026, + "step": 985, + "token_acc": 0.818977587114551 + }, + { + "epoch": 2.8497109826589595, + "grad_norm": 0.49254982998325725, + "learning_rate": 6.8469776777973494e-09, + "loss": 0.48389381170272827, + "step": 986, + "token_acc": 0.839111193678302 + }, + { + "epoch": 2.852601156069364, + "grad_norm": 0.5088843284530131, + "learning_rate": 6.5867387292369295e-09, + "loss": 0.5327301025390625, + "step": 987, + "token_acc": 0.8190361305134541 + }, + { + "epoch": 2.8554913294797686, + "grad_norm": 0.5579589460192081, + "learning_rate": 6.331508842198296e-09, + "loss": 0.46285098791122437, + "step": 988, + "token_acc": 0.8444943903023158 + }, + { + "epoch": 2.8583815028901736, + "grad_norm": 0.5480219063407678, + "learning_rate": 6.081290607739042e-09, + "loss": 0.4747048616409302, + "step": 989, + "token_acc": 0.8427808981834031 + }, + { + "epoch": 2.861271676300578, + "grad_norm": 0.7741942154519839, + "learning_rate": 5.836086566039289e-09, + "loss": 0.5887913703918457, + "step": 990, + "token_acc": 0.8049742371893245 + }, + { + "epoch": 2.8641618497109826, + "grad_norm": 0.5193852803751504, + "learning_rate": 5.595899206375654e-09, + "loss": 0.5110014081001282, + "step": 991, + "token_acc": 0.8288312763590261 + }, + { + "epoch": 2.867052023121387, + "grad_norm": 0.5341612707698237, + "learning_rate": 5.360730967096272e-09, + "loss": 0.5477676391601562, + "step": 992, + "token_acc": 0.8129789165141573 + }, + { + "epoch": 2.8699421965317917, + "grad_norm": 0.7306055692439172, + "learning_rate": 5.130584235595703e-09, + "loss": 0.5541284680366516, + "step": 993, + "token_acc": 0.8145775823594559 + }, + { + "epoch": 2.8728323699421967, + "grad_norm": 0.5713799415951762, + "learning_rate": 4.9054613482910065e-09, + "loss": 0.44801950454711914, + "step": 994, + "token_acc": 0.845931691583633 + }, + { + "epoch": 2.8757225433526012, + "grad_norm": 0.5839589911780936, + "learning_rate": 4.685364590597929e-09, + "loss": 0.5638971924781799, + "step": 995, + "token_acc": 0.8107071579171281 + }, + { + "epoch": 2.878612716763006, + "grad_norm": 0.5287376481818248, + "learning_rate": 4.470296196907364e-09, + "loss": 0.5595090389251709, + "step": 996, + "token_acc": 0.8104899471905078 + }, + { + "epoch": 2.8815028901734103, + "grad_norm": 0.5379724615788479, + "learning_rate": 4.260258350563317e-09, + "loss": 0.5029683709144592, + "step": 997, + "token_acc": 0.8288261472452321 + }, + { + "epoch": 2.884393063583815, + "grad_norm": 0.6018325527774611, + "learning_rate": 4.055253183840257e-09, + "loss": 0.5635591149330139, + "step": 998, + "token_acc": 0.8117199938369883 + }, + { + "epoch": 2.88728323699422, + "grad_norm": 0.5473646076466034, + "learning_rate": 3.855282777921465e-09, + "loss": 0.44404757022857666, + "step": 999, + "token_acc": 0.8481432594156987 + }, + { + "epoch": 2.8901734104046244, + "grad_norm": 0.607676333795665, + "learning_rate": 3.660349162878329e-09, + "loss": 0.5595177412033081, + "step": 1000, + "token_acc": 0.8098022742758105 + }, + { + "epoch": 2.8901734104046244, + "eval_loss": 0.5740217566490173, + "eval_runtime": 69.5297, + "eval_samples_per_second": 1.582, + "eval_steps_per_second": 0.201, + "eval_token_acc": 0.808306147135369, + "step": 1000 + }, + { + "epoch": 2.893063583815029, + "grad_norm": 0.6028179153533768, + "learning_rate": 3.4704543176491407e-09, + "loss": 0.5201370716094971, + "step": 1001, + "token_acc": 0.8248979009505466 + }, + { + "epoch": 2.8959537572254335, + "grad_norm": 0.5618469428482809, + "learning_rate": 3.285600170019609e-09, + "loss": 0.4737909138202667, + "step": 1002, + "token_acc": 0.8380801687763713 + }, + { + "epoch": 2.898843930635838, + "grad_norm": 0.520670079505936, + "learning_rate": 3.10578859660271e-09, + "loss": 0.4949793815612793, + "step": 1003, + "token_acc": 0.8310451985643839 + }, + { + "epoch": 2.901734104046243, + "grad_norm": 0.5898385451823664, + "learning_rate": 2.9310214228202014e-09, + "loss": 0.5583693981170654, + "step": 1004, + "token_acc": 0.8109677906011918 + }, + { + "epoch": 2.9046242774566475, + "grad_norm": 0.5434063241260475, + "learning_rate": 2.7613004228835836e-09, + "loss": 0.5403155088424683, + "step": 1005, + "token_acc": 0.8173558831911802 + }, + { + "epoch": 2.907514450867052, + "grad_norm": 0.5472051803786162, + "learning_rate": 2.59662731977639e-09, + "loss": 0.5251212120056152, + "step": 1006, + "token_acc": 0.8263490698267074 + }, + { + "epoch": 2.9104046242774566, + "grad_norm": 0.49207250611822545, + "learning_rate": 2.437003785236702e-09, + "loss": 0.5539924502372742, + "step": 1007, + "token_acc": 0.8112695897164994 + }, + { + "epoch": 2.913294797687861, + "grad_norm": 0.5002736177395538, + "learning_rate": 2.2824314397399404e-09, + "loss": 0.5284777283668518, + "step": 1008, + "token_acc": 0.8207929017091751 + }, + { + "epoch": 2.916184971098266, + "grad_norm": 0.5322616545740584, + "learning_rate": 2.132911852482766e-09, + "loss": 0.5585949420928955, + "step": 1009, + "token_acc": 0.8104817895999946 + }, + { + "epoch": 2.9190751445086707, + "grad_norm": 0.5531944879626155, + "learning_rate": 1.9884465413667063e-09, + "loss": 0.5428365468978882, + "step": 1010, + "token_acc": 0.815299992762539 + }, + { + "epoch": 2.921965317919075, + "grad_norm": 0.5219295200504247, + "learning_rate": 1.8490369729832755e-09, + "loss": 0.5256614685058594, + "step": 1011, + "token_acc": 0.8222089510292981 + }, + { + "epoch": 2.9248554913294798, + "grad_norm": 0.5231759747194448, + "learning_rate": 1.714684562598545e-09, + "loss": 0.5462931990623474, + "step": 1012, + "token_acc": 0.8166555934189188 + }, + { + "epoch": 2.9277456647398843, + "grad_norm": 0.511178905264401, + "learning_rate": 1.5853906741392086e-09, + "loss": 0.48754703998565674, + "step": 1013, + "token_acc": 0.8340968562927913 + }, + { + "epoch": 2.9306358381502893, + "grad_norm": 0.49209363879670576, + "learning_rate": 1.4611566201785386e-09, + "loss": 0.6072345972061157, + "step": 1014, + "token_acc": 0.796086135633005 + }, + { + "epoch": 2.9335260115606934, + "grad_norm": 0.5468806874394325, + "learning_rate": 1.3419836619229519e-09, + "loss": 0.5350404381752014, + "step": 1015, + "token_acc": 0.8205611421851678 + }, + { + "epoch": 2.9364161849710984, + "grad_norm": 0.5545661554638134, + "learning_rate": 1.227873009199465e-09, + "loss": 0.48873502016067505, + "step": 1016, + "token_acc": 0.8335308101581073 + }, + { + "epoch": 2.939306358381503, + "grad_norm": 0.6117033520146128, + "learning_rate": 1.1188258204433144e-09, + "loss": 0.5223637819290161, + "step": 1017, + "token_acc": 0.8220580971784899 + }, + { + "epoch": 2.9421965317919074, + "grad_norm": 0.5990530756110558, + "learning_rate": 1.0148432026860775e-09, + "loss": 0.5375405550003052, + "step": 1018, + "token_acc": 0.8204211966851669 + }, + { + "epoch": 2.9450867052023124, + "grad_norm": 0.5179575810720268, + "learning_rate": 9.159262115445709e-10, + "loss": 0.5529065132141113, + "step": 1019, + "token_acc": 0.8146867269147271 + }, + { + "epoch": 2.9479768786127165, + "grad_norm": 0.4852204771957678, + "learning_rate": 8.220758512100246e-10, + "loss": 0.5473994016647339, + "step": 1020, + "token_acc": 0.8154385812017952 + }, + { + "epoch": 2.9508670520231215, + "grad_norm": 0.5869353604242789, + "learning_rate": 7.332930744380905e-10, + "loss": 0.5176626443862915, + "step": 1021, + "token_acc": 0.8273430939731791 + }, + { + "epoch": 2.953757225433526, + "grad_norm": 0.5602528809896415, + "learning_rate": 6.49578782538851e-10, + "loss": 0.5115993618965149, + "step": 1022, + "token_acc": 0.8288524482039359 + }, + { + "epoch": 2.9566473988439306, + "grad_norm": 0.5342085317349031, + "learning_rate": 5.709338253679363e-10, + "loss": 0.5524012446403503, + "step": 1023, + "token_acc": 0.8131655170976683 + }, + { + "epoch": 2.959537572254335, + "grad_norm": 0.5776521748726285, + "learning_rate": 4.973590013178652e-10, + "loss": 0.5437720417976379, + "step": 1024, + "token_acc": 0.8181899648876977 + }, + { + "epoch": 2.9624277456647397, + "grad_norm": 0.5915883065627155, + "learning_rate": 4.288550573098293e-10, + "loss": 0.5497083067893982, + "step": 1025, + "token_acc": 0.8166504174699635 + }, + { + "epoch": 2.9653179190751446, + "grad_norm": 0.519862153616305, + "learning_rate": 3.6542268878608785e-10, + "loss": 0.5397800207138062, + "step": 1026, + "token_acc": 0.8185784280824216 + }, + { + "epoch": 2.968208092485549, + "grad_norm": 0.6328021139986955, + "learning_rate": 3.070625397031401e-10, + "loss": 0.5588440299034119, + "step": 1027, + "token_acc": 0.8125476802049286 + }, + { + "epoch": 2.9710982658959537, + "grad_norm": 0.5575020860016229, + "learning_rate": 2.537752025249529e-10, + "loss": 0.5562065839767456, + "step": 1028, + "token_acc": 0.8104220354019687 + }, + { + "epoch": 2.9739884393063583, + "grad_norm": 0.5378061802083338, + "learning_rate": 2.0556121821696527e-10, + "loss": 0.5177541971206665, + "step": 1029, + "token_acc": 0.8242314812400594 + }, + { + "epoch": 2.976878612716763, + "grad_norm": 0.5832757184904683, + "learning_rate": 1.6242107624070412e-10, + "loss": 0.49845069646835327, + "step": 1030, + "token_acc": 0.8330388762567243 + }, + { + "epoch": 2.979768786127168, + "grad_norm": 0.7982615431706986, + "learning_rate": 1.2435521454884358e-10, + "loss": 0.5247231125831604, + "step": 1031, + "token_acc": 0.823871938586352 + }, + { + "epoch": 2.9826589595375723, + "grad_norm": 0.5127749961245016, + "learning_rate": 9.136401958059759e-11, + "loss": 0.5525383353233337, + "step": 1032, + "token_acc": 0.8136602187346615 + }, + { + "epoch": 2.985549132947977, + "grad_norm": 0.542665341113767, + "learning_rate": 6.34478262578897e-11, + "loss": 0.5264041423797607, + "step": 1033, + "token_acc": 0.8259248289322793 + }, + { + "epoch": 2.9884393063583814, + "grad_norm": 0.5981387552317852, + "learning_rate": 4.0606917981966804e-11, + "loss": 0.5639816522598267, + "step": 1034, + "token_acc": 0.811261064452967 + }, + { + "epoch": 2.991329479768786, + "grad_norm": 0.5182263398780822, + "learning_rate": 2.2841526630512642e-11, + "loss": 0.5699348449707031, + "step": 1035, + "token_acc": 0.8084916570295722 + }, + { + "epoch": 2.994219653179191, + "grad_norm": 0.48173987479445357, + "learning_rate": 1.0151832555205242e-11, + "loss": 0.5670179128646851, + "step": 1036, + "token_acc": 0.8119991095280499 + }, + { + "epoch": 2.9971098265895955, + "grad_norm": 0.5532608077856682, + "learning_rate": 2.5379645800516215e-12, + "loss": 0.5611600875854492, + "step": 1037, + "token_acc": 0.8147770004529734 + }, + { + "epoch": 3.0, + "grad_norm": 0.5148238785537761, + "learning_rate": 0.0, + "loss": 0.5508678555488586, + "step": 1038, + "token_acc": 0.8153577131547579 + }, + { + "epoch": 3.0, + "eval_loss": 0.5740059018135071, + "eval_runtime": 69.9798, + "eval_samples_per_second": 1.572, + "eval_steps_per_second": 0.2, + "eval_token_acc": 0.808306147135369, + "step": 1038 + } + ], + "logging_steps": 1, + "max_steps": 1038, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1140072026079232.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}